summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore8
-rw-r--r--.mailmap84
-rw-r--r--COPYING50
-rw-r--r--PendingReleaseNotes35
-rw-r--r--README28
-rw-r--r--admin/doc-requirements.txt2
-rw-r--r--ceph.spec.in11
-rw-r--r--configure.ac57
-rw-r--r--debian/changelog18
-rw-r--r--debian/control1
-rw-r--r--debian/copyright62
-rwxr-xr-xdo_autogen.sh9
-rw-r--r--doc/architecture.rst2
-rw-r--r--doc/changelog/v0.56.6.txt40
-rw-r--r--doc/changelog/v0.56.7.txt454
-rw-r--r--doc/changelog/v0.67.3.txt700
-rw-r--r--doc/changelog/v0.67.4.txt550
-rw-r--r--doc/dev/cache-pool.rst70
-rw-r--r--doc/dev/corpus.rst72
-rw-r--r--doc/dev/generatedocs.rst2
-rw-r--r--doc/dev/mon-bootstrap.rst2
-rw-r--r--doc/dev/osd_internals/erasure_coding.rst26
-rw-r--r--doc/dev/osd_internals/erasure_coding/PGBackend-h.rst156
-rw-r--r--doc/dev/osd_internals/erasure_coding/developer_notes.rst376
-rw-r--r--doc/dev/osd_internals/erasure_coding/jerasure.rst22
-rw-r--r--doc/dev/osd_internals/erasure_coding/pgbackend.rst74
-rw-r--r--doc/dev/osd_internals/snaps.rst11
-rw-r--r--doc/dev/release-process.rst60
-rw-r--r--doc/dev/repo-lab-access.rst30
-rw-r--r--doc/dev/versions.rst42
-rw-r--r--doc/index.rst1
-rw-r--r--doc/install/index.rst69
-rw-r--r--doc/install/libvirt-deb.rst43
-rw-r--r--doc/install/libvirt-rpm.rst19
-rw-r--r--doc/install/qemu-deb.rst26
-rw-r--r--doc/install/qemu-rpm.rst56
-rw-r--r--doc/install/rpm.rst167
-rw-r--r--doc/install/yum-priorities.rst20
-rw-r--r--doc/man/8/ceph-conf.rst1
-rw-r--r--doc/man/8/ceph-osd.rst3
-rw-r--r--doc/man/8/ceph-post-file.rst4
-rw-r--r--doc/man/8/ceph.rst1
-rw-r--r--doc/man/8/crushtool.rst32
-rw-r--r--doc/man/8/mkcephfs.rst123
-rw-r--r--doc/man/8/monmaptool.rst1
-rw-r--r--doc/man/8/osdmaptool.rst1
-rw-r--r--doc/man/8/rbd.rst4
-rw-r--r--doc/rados/configuration/ceph-conf.rst12
-rw-r--r--doc/rados/configuration/journal-ref.rst6
-rw-r--r--doc/rados/deployment/ceph-deploy-osd.rst4
-rw-r--r--doc/rados/man/index.rst6
-rw-r--r--doc/rados/operations/add-or-rm-mons.rst14
-rw-r--r--doc/rados/operations/authentication.rst15
-rw-r--r--doc/rados/operations/operating.rst266
-rw-r--r--doc/rados/operations/pools.rst6
-rw-r--r--doc/rados/troubleshooting/log-and-debug.rst2
-rw-r--r--doc/radosgw/adminops.rst39
-rw-r--r--doc/radosgw/config-ref.rst829
-rw-r--r--doc/radosgw/config.rst5
-rw-r--r--doc/radosgw/manual-install.rst66
-rw-r--r--doc/rbd/libvirt.rst54
-rw-r--r--doc/rbd/qemu-rbd.rst26
-rw-r--r--doc/rbd/rbd-openstack.rst2
-rw-r--r--doc/release-notes.rst303
-rw-r--r--doc/start/hardware-recommendations.rst (renamed from doc/install/hardware-recommendations.rst)0
-rw-r--r--doc/start/index.rst39
-rw-r--r--doc/start/intro.rst70
-rw-r--r--doc/start/os-recommendations.rst (renamed from doc/install/os-recommendations.rst)33
-rw-r--r--doc/start/quick-ceph-deploy.rst390
-rw-r--r--doc/start/quick-cephfs.rst4
-rw-r--r--doc/start/quick-rbd.rst56
-rw-r--r--doc/start/quick-rgw.rst8
-rw-r--r--doc/start/quick-start-preflight.rst188
-rw-r--r--fusetrace/fusetrace_ll.cc2
-rw-r--r--m4/ax_check_compile_flag.m472
-rw-r--r--man/rbd.85
-rw-r--r--qa/run_xfstests.sh3
-rwxr-xr-xqa/workunits/cephtool/test.sh45
-rwxr-xr-xqa/workunits/mon/crush_ops.sh13
-rwxr-xr-xqa/workunits/mon/pool_ops.sh3
-rwxr-xr-xqa/workunits/mon/rbd_snaps_ops.sh39
-rwxr-xr-xqa/workunits/rados/caching_redirects.sh59
-rwxr-xr-xqa/workunits/rados/test_tmap_to_omap.sh28
-rwxr-xr-xqa/workunits/rbd/copy.sh12
-rwxr-xr-xqa/workunits/rbd/import_export.sh8
-rwxr-xr-xqa/workunits/snaps/snap-rm-diff.sh1
-rwxr-xr-xqa/workunits/snaps/snaptest-0.sh12
-rwxr-xr-xqa/workunits/snaps/snaptest-1.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-2.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-authwb.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-capwb.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-dir-rename.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-double-null.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-estale.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-git-ceph.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-intodir.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-multiple-capsnaps.sh2
-rw-r--r--qa/workunits/snaps/snaptest-parents.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-snap-rm-cmp.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-upchildrealms.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-xattrwb.sh2
-rwxr-xr-xqa/workunits/snaps/untar_snap_rm.sh2
-rwxr-xr-xqa/workunits/suites/fsstress.sh2
-rw-r--r--src/.gitignore1
-rw-r--r--src/Makefile-env.am179
-rw-r--r--src/Makefile.am2395
-rw-r--r--src/arch/Makefile.am11
-rw-r--r--src/arch/intel.c7
-rw-r--r--src/arch/neon.c51
-rw-r--r--src/arch/neon.h16
-rw-r--r--src/arch/probe.cc2
-rw-r--r--src/auth/AuthMethodList.h3
-rw-r--r--src/auth/Makefile.am46
-rw-r--r--src/auth/cephx/CephxKeyServer.cc6
-rwxr-xr-xsrc/ceph-create-keys2
-rwxr-xr-xsrc/ceph-disk31
-rwxr-xr-xsrc/ceph-post-file.in8
-rwxr-xr-xsrc/ceph-rest-api2
-rwxr-xr-xsrc/ceph.in9
-rw-r--r--src/ceph_osd.cc147
-rw-r--r--src/client/Client.cc109
-rw-r--r--src/client/Client.h17
-rw-r--r--src/client/Makefile.am35
-rw-r--r--src/client/SyntheticClient.cc68
-rw-r--r--src/client/SyntheticClient.h3
-rw-r--r--src/client/fuse_ll.cc34
-rw-r--r--src/client/hadoop/CephFSInterface.cc993
-rw-r--r--src/client/hadoop/CephFSInterface.h236
-rw-r--r--src/client/hadoop/HADOOP-ceph.patch2234
-rw-r--r--src/client/hadoop/Readme17
-rw-r--r--src/client/hadoop/ceph/CephFS.java250
-rw-r--r--src/client/hadoop/ceph/CephFaker.java483
-rw-r--r--src/client/hadoop/ceph/CephFileSystem.java804
-rw-r--r--src/client/hadoop/ceph/CephInputStream.java254
-rw-r--r--src/client/hadoop/ceph/CephOutputStream.java219
-rw-r--r--src/client/hadoop/ceph/CephTalker.java91
-rw-r--r--src/client/hadoop/ceph/LICENSE4
-rw-r--r--src/client/hadoop/ceph/TestCeph.java45
-rw-r--r--src/client/hadoop/ceph/package.html101
-rw-r--r--src/client/hadoop/org_apache_hadoop_fs_ceph_CephFS.h13
-rw-r--r--src/client/hadoop/org_apache_hadoop_fs_ceph_CephFileSystem.h31
-rw-r--r--src/client/hadoop/org_apache_hadoop_fs_ceph_CephFileSystem_CephStat.h13
-rw-r--r--src/client/hadoop/org_apache_hadoop_fs_ceph_CephFileSystem_Stat.h13
-rw-r--r--src/client/hadoop/org_apache_hadoop_fs_ceph_CephInputStream.h47
-rw-r--r--src/client/hadoop/org_apache_hadoop_fs_ceph_CephOutputStream.h37
-rw-r--r--src/client/hadoop/org_apache_hadoop_fs_ceph_CephTalker.h197
-rw-r--r--src/client/ioctl.h3
-rw-r--r--src/cls/Makefile.am122
-rw-r--r--src/cls/rbd/cls_rbd.cc4
-rw-r--r--src/cls/rgw/cls_rgw.cc6
-rw-r--r--src/cls/rgw/cls_rgw_client.cc39
-rw-r--r--src/cls/rgw/cls_rgw_client.h8
-rw-r--r--src/common/Cond.h4
-rw-r--r--src/common/Formatter.cc3
-rw-r--r--src/common/Formatter.h6
-rw-r--r--src/common/Makefile.am202
-rw-r--r--src/common/Mutex.h4
-rw-r--r--src/common/OutputDataSocket.cc3
-rw-r--r--src/common/SloppyCRCMap.cc180
-rw-r--r--src/common/SloppyCRCMap.h78
-rw-r--r--src/common/TrackedOp.cc265
-rw-r--r--src/common/TrackedOp.h154
-rw-r--r--src/common/WorkQueue.h37
-rw-r--r--src/common/addr_parsing.c (renamed from src/include/addr_parsing.c)0
-rw-r--r--src/common/admin_socket.cc3
-rw-r--r--src/common/admin_socket_client.cc3
-rw-r--r--src/common/blkdev.cc4
-rw-r--r--src/common/bloom_filter.cc137
-rw-r--r--src/common/bloom_filter.hpp700
-rw-r--r--src/common/buffer.cc18
-rw-r--r--src/common/ceph_argparse.cc17
-rw-r--r--src/common/ceph_frag.cc (renamed from src/include/ceph_frag.cc)2
-rw-r--r--src/common/ceph_fs.cc (renamed from src/include/ceph_fs.cc)2
-rw-r--r--src/common/ceph_hash.cc (renamed from src/include/ceph_hash.cc)2
-rw-r--r--src/common/ceph_json.cc4
-rw-r--r--src/common/ceph_strings.cc (renamed from src/include/ceph_strings.cc)7
-rw-r--r--src/common/code_environment.cc24
-rw-r--r--src/common/config_opts.h36
-rw-r--r--src/common/crc32c_intel_baseline.c3
-rw-r--r--src/common/crc32c_intel_baseline.h2
-rw-r--r--src/common/crc32c_intel_fast.c22
-rw-r--r--src/common/crc32c_intel_fast.h2
-rw-r--r--src/common/hobject.cc (renamed from src/os/hobject.cc)87
-rw-r--r--src/common/hobject.h (renamed from src/os/hobject.h)125
-rw-r--r--src/common/lru_map.h109
-rw-r--r--src/common/obj_bencher.cc36
-rw-r--r--src/common/obj_bencher.h5
-rw-r--r--src/common/perf_counters.cc3
-rw-r--r--src/common/safe_io.c80
-rw-r--r--src/common/safe_io.h9
-rw-r--r--src/common/sharedptr_registry.hpp23
-rw-r--r--src/common/util.cc1
-rw-r--r--src/crush/CrushCompiler.cc14
-rw-r--r--src/crush/CrushWrapper.cc13
-rw-r--r--src/crush/CrushWrapper.h5
-rw-r--r--src/crush/Makefile.am30
-rw-r--r--src/crush/builder.c10
-rw-r--r--src/crush/builder.h2
-rw-r--r--src/crush/crush.h3
-rw-r--r--src/crush/hash.c3
-rw-r--r--src/crush/mapper.c2
-rw-r--r--src/global/Makefile.am14
-rw-r--r--src/global/signal_handler.cc4
-rw-r--r--src/include/CompatSet.h46
-rw-r--r--src/include/Context.h20
-rw-r--r--src/include/Makefile.am80
-rw-r--r--src/include/bloom_filter.hpp544
-rw-r--r--src/include/buffer.h17
-rw-r--r--src/include/ceph_features.h2
-rw-r--r--src/include/ceph_fs.h1
-rw-r--r--src/include/crc32c.h2
-rw-r--r--src/include/encoding.h15
-rw-r--r--src/include/histogram.h76
-rw-r--r--src/include/int_types.h75
-rw-r--r--src/include/inttypes.h28
-rw-r--r--src/include/linux_fiemap.h4
-rw-r--r--src/include/rados.h16
-rw-r--r--src/include/rados/librados.h5
-rw-r--r--src/include/rados/librados.hpp69
-rw-r--r--src/include/types.h11
-rw-r--r--src/init-ceph.in24
-rw-r--r--src/init-radosgw2
-rw-r--r--src/init-radosgw.sysv2
-rw-r--r--src/java/Makefile.am3
-rw-r--r--src/java/test/com/ceph/fs/CephAllTests.java13
-rw-r--r--src/json_spirit/Makefile.am18
-rw-r--r--src/key_value_store/Makefile.am10
-rw-r--r--src/librados/AioCompletionImpl.h7
-rw-r--r--src/librados/IoCtxImpl.cc15
-rw-r--r--src/librados/IoCtxImpl.h6
-rw-r--r--src/librados/Makefile.am20
-rw-r--r--src/librados/PoolAsyncCompletionImpl.h5
-rw-r--r--src/librados/RadosClient.cc2
-rw-r--r--src/librados/librados.cc36
-rw-r--r--src/librbd/AioRequest.h4
-rw-r--r--src/librbd/ImageCtx.h2
-rw-r--r--src/librbd/Makefile.am24
-rw-r--r--src/librbd/SnapInfo.h2
-rw-r--r--src/librbd/WatchCtx.h2
-rw-r--r--src/librbd/internal.cc5
-rw-r--r--src/librbd/internal.h2
-rw-r--r--src/librbd/librbd.cc2
-rw-r--r--src/log/Makefile.am11
-rw-r--r--src/mds/CDentry.cc12
-rw-r--r--src/mds/CDentry.h4
-rw-r--r--src/mds/CDir.cc17
-rw-r--r--src/mds/CDir.h1
-rw-r--r--src/mds/CInode.cc10
-rw-r--r--src/mds/CInode.h3
-rw-r--r--src/mds/Locker.cc5
-rw-r--r--src/mds/LogEvent.cc14
-rw-r--r--src/mds/MDCache.cc291
-rw-r--r--src/mds/MDCache.h34
-rw-r--r--src/mds/MDLog.cc8
-rw-r--r--src/mds/MDS.cc5
-rw-r--r--src/mds/MDSMap.cc11
-rw-r--r--src/mds/MDSMap.h15
-rw-r--r--src/mds/Makefile.am92
-rw-r--r--src/mds/Server.cc18
-rw-r--r--src/mds/flock.h2
-rw-r--r--src/mds/locks.c4
-rw-r--r--src/mds/mdstypes.cc5
-rw-r--r--src/mds/mdstypes.h18
-rw-r--r--src/messages/MOSDOpReply.h77
-rw-r--r--src/messages/MOSDSubOp.h12
-rw-r--r--src/messages/Makefile.am113
-rw-r--r--src/mon/DataHealthService.cc42
-rw-r--r--src/mon/DataHealthService.h1
-rw-r--r--src/mon/MDSMonitor.cc31
-rw-r--r--src/mon/Makefile.am45
-rw-r--r--src/mon/MonClient.cc6
-rw-r--r--src/mon/MonCommands.h57
-rw-r--r--src/mon/Monitor.cc267
-rw-r--r--src/mon/Monitor.h14
-rw-r--r--src/mon/MonitorDBStore.h11
-rw-r--r--src/mon/MonmapMonitor.cc39
-rw-r--r--src/mon/OSDMonitor.cc541
-rw-r--r--src/mon/OSDMonitor.h6
-rw-r--r--src/mon/PGMap.cc58
-rw-r--r--src/mon/PGMap.h37
-rw-r--r--src/mon/PGMonitor.cc73
-rw-r--r--src/mon/PGMonitor.h1
-rw-r--r--src/mon/Paxos.cc11
-rw-r--r--src/mon/Paxos.h7
-rw-r--r--src/mon/mon_types.h68
-rw-r--r--src/msg/Makefile.am20
-rw-r--r--src/msg/Pipe.cc13
-rw-r--r--src/msg/Pipe.h11
-rw-r--r--src/msg/msg_types.cc2
-rw-r--r--src/objclass/class_api.cc4
-rwxr-xr-xsrc/objsync/boto_del.py2
-rw-r--r--src/os/BtrfsFileStoreBackend.cc10
-rw-r--r--src/os/CollectionIndex.h20
-rw-r--r--src/os/DBObjectMap.cc203
-rw-r--r--src/os/DBObjectMap.h98
-rw-r--r--src/os/FDCache.h10
-rw-r--r--src/os/FileStore.cc706
-rw-r--r--src/os/FileStore.h194
-rw-r--r--src/os/FlatIndex.cc47
-rw-r--r--src/os/FlatIndex.h14
-rw-r--r--src/os/GenericFileStoreBackend.cc120
-rw-r--r--src/os/GenericFileStoreBackend.h15
-rw-r--r--src/os/HashIndex.cc88
-rw-r--r--src/os/HashIndex.h32
-rw-r--r--src/os/IndexManager.cc2
-rw-r--r--src/os/KeyValueDB.h2
-rw-r--r--src/os/LFNIndex.cc275
-rw-r--r--src/os/LFNIndex.h90
-rw-r--r--src/os/LevelDBStore.h68
-rw-r--r--src/os/Makefile.am50
-rw-r--r--src/os/ObjectMap.h44
-rw-r--r--src/os/ObjectStore.cc96
-rw-r--r--src/os/ObjectStore.h183
-rw-r--r--src/os/WBThrottle.cc24
-rw-r--r--src/os/WBThrottle.h42
-rw-r--r--src/os/ZFSFileStoreBackend.cc5
-rw-r--r--src/os/chain_xattr.cc7
-rw-r--r--src/osd/Ager.cc9
-rw-r--r--src/osd/Ager.h4
-rw-r--r--src/osd/ClassHandler.cc4
-rw-r--r--src/osd/ClassHandler.h5
-rw-r--r--src/osd/ErasureCodeInterface.h240
-rw-r--r--src/osd/ErasureCodePlugin.cc137
-rw-r--r--src/osd/ErasureCodePlugin.h70
-rw-r--r--src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.cc427
-rw-r--r--src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.h227
-rw-r--r--src/osd/ErasureCodePluginJerasure/ErasureCodePluginJerasure.cc70
-rw-r--r--src/osd/ErasureCodePluginJerasure/Makefile.am22
-rwxr-xr-xsrc/osd/ErasureCodePluginJerasure/cauchy.c408
-rwxr-xr-xsrc/osd/ErasureCodePluginJerasure/cauchy.h53
-rwxr-xr-xsrc/osd/ErasureCodePluginJerasure/cauchy_best_r6.c1985
-rwxr-xr-xsrc/osd/ErasureCodePluginJerasure/galois.c821
-rwxr-xr-xsrc/osd/ErasureCodePluginJerasure/galois.h111
-rwxr-xr-xsrc/osd/ErasureCodePluginJerasure/jerasure.c1376
-rwxr-xr-xsrc/osd/ErasureCodePluginJerasure/jerasure.h300
-rwxr-xr-xsrc/osd/ErasureCodePluginJerasure/liberation.c265
-rwxr-xr-xsrc/osd/ErasureCodePluginJerasure/liberation.h56
-rwxr-xr-xsrc/osd/ErasureCodePluginJerasure/reed_sol.c368
-rwxr-xr-xsrc/osd/ErasureCodePluginJerasure/reed_sol.h59
-rw-r--r--src/osd/Makefile.am44
-rw-r--r--src/osd/OSD.cc980
-rw-r--r--src/osd/OSD.h133
-rw-r--r--src/osd/OSDMap.cc12
-rw-r--r--src/osd/OSDMap.h6
-rw-r--r--src/osd/OpRequest.cc267
-rw-r--r--src/osd/OpRequest.h99
-rw-r--r--src/osd/PG.cc403
-rw-r--r--src/osd/PG.h48
-rw-r--r--src/osd/PGBackend.h230
-rw-r--r--src/osd/PGLog.cc87
-rw-r--r--src/osd/PGLog.h43
-rw-r--r--src/osd/ReplicatedBackend.cc268
-rw-r--r--src/osd/ReplicatedBackend.h329
-rw-r--r--src/osd/ReplicatedPG.cc2757
-rw-r--r--src/osd/ReplicatedPG.h730
-rw-r--r--src/osd/SnapMapper.h2
-rw-r--r--src/osd/osd_types.cc229
-rw-r--r--src/osd/osd_types.h318
-rw-r--r--src/osdc/Makefile.am17
-rw-r--r--src/osdc/ObjectCacher.cc12
-rw-r--r--src/osdc/Objecter.cc120
-rw-r--r--src/osdc/Objecter.h206
-rw-r--r--src/perfglue/Makefile.am23
-rw-r--r--src/perfglue/heap_profiler.cc10
-rw-r--r--src/pybind/ceph_argparse.py26
-rwxr-xr-xsrc/pybind/ceph_rest_api.py14
-rw-r--r--src/rbd.cc125
-rw-r--r--src/rbd_fuse/rbd-fuse.c5
-rw-r--r--src/rgw/Makefile.am152
-rw-r--r--src/rgw/rgw_admin.cc111
-rw-r--r--src/rgw/rgw_auth_s3.cc10
-rw-r--r--src/rgw/rgw_bucket.cc8
-rw-r--r--src/rgw/rgw_cache.cc5
-rw-r--r--src/rgw/rgw_cache.h3
-rw-r--r--src/rgw/rgw_common.cc5
-rw-r--r--src/rgw/rgw_common.h28
-rw-r--r--src/rgw/rgw_cors.cc3
-rw-r--r--src/rgw/rgw_cors.h12
-rw-r--r--src/rgw/rgw_cors_s3.cc4
-rw-r--r--src/rgw/rgw_http_errors.h1
-rw-r--r--src/rgw/rgw_json_enc.cc88
-rw-r--r--src/rgw/rgw_keystone.cc108
-rw-r--r--src/rgw/rgw_keystone.h106
-rw-r--r--src/rgw/rgw_main.cc23
-rw-r--r--src/rgw/rgw_metadata.cc4
-rw-r--r--src/rgw/rgw_op.cc268
-rw-r--r--src/rgw/rgw_op.h22
-rw-r--r--src/rgw/rgw_quota.cc332
-rw-r--r--src/rgw/rgw_quota.h74
-rw-r--r--src/rgw/rgw_rados.cc152
-rw-r--r--src/rgw/rgw_rados.h39
-rw-r--r--src/rgw/rgw_replica_log.cc9
-rw-r--r--src/rgw/rgw_rest.cc32
-rw-r--r--src/rgw/rgw_rest.h15
-rw-r--r--src/rgw/rgw_rest_s3.cc281
-rw-r--r--src/rgw/rgw_rest_s3.h52
-rw-r--r--src/rgw/rgw_rest_swift.cc28
-rw-r--r--src/rgw/rgw_swift.cc200
-rw-r--r--src/rgw/rgw_swift.h20
-rw-r--r--src/rgw/rgw_user.cc6
-rw-r--r--src/rgw/rgw_user.h13
-rwxr-xr-xsrc/script/perf-watch.py2
-rw-r--r--src/test/Makefile.am906
-rw-r--r--src/test/ObjectMap/KeyValueDBMemory.h18
-rw-r--r--src/test/ObjectMap/test_object_map.cc54
-rw-r--r--src/test/ObjectMap/test_store_tool/test_store_tool.cc104
-rw-r--r--src/test/barclass.cc (renamed from src/barclass.cc)0
-rw-r--r--src/test/buildtest_skeleton.cc (renamed from src/test/test_libcommon_build.cc)0
-rw-r--r--src/test/ceph_compatset.cc164
-rw-r--r--src/test/cli-integration/rbd/formatted-output.t22
-rw-r--r--src/test/cli/radosgw-admin/help.t19
-rw-r--r--src/test/cli/rbd/help.t1
-rw-r--r--src/test/common/get_command_descriptions.cc116
-rw-r--r--src/test/common/test_bloom_filter.cc289
-rw-r--r--src/test/common/test_crc32c.cc84
-rw-r--r--src/test/common/test_sharedptr_registry.cc24
-rw-r--r--src/test/common/test_sloppy_crc_map.cc113
-rw-r--r--src/test/common/test_util.cc1
-rw-r--r--src/test/encoding/ceph_dencoder.cc4
-rw-r--r--src/test/encoding/types.h16
-rw-r--r--src/test/filestore/FileStoreDiff.cc12
-rwxr-xr-xsrc/test/filestore/run_seed_to_range.sh2
-rw-r--r--src/test/filestore/store_test.cc183
-rw-r--r--src/test/filestore/workload_generator.cc4
-rw-r--r--src/test/fooclass.cc (renamed from src/fooclass.cc)0
-rw-r--r--src/test/libcephfs/caps.cc2
-rw-r--r--src/test/librados/misc.cc157
-rw-r--r--src/test/librbd/test_librbd.cc2
-rw-r--r--src/test/os/TestFlatIndex.cc12
-rw-r--r--src/test/os/TestLFNIndex.cc99
-rw-r--r--src/test/osd/ErasureCodeExample.h161
-rw-r--r--src/test/osd/ErasureCodePluginExample.cc36
-rw-r--r--src/test/osd/ErasureCodePluginFailToInitialize.cc23
-rw-r--r--src/test/osd/ErasureCodePluginFailToRegister.cc22
-rw-r--r--src/test/osd/ErasureCodePluginHangs.cc24
-rw-r--r--src/test/osd/ErasureCodePluginMissingEntryPoint.cc1
-rw-r--r--src/test/osd/Object.cc9
-rw-r--r--src/test/osd/Object.h5
-rw-r--r--src/test/osd/RadosModel.h433
-rw-r--r--src/test/osd/TestErasureCodeExample.cc173
-rw-r--r--src/test/osd/TestErasureCodeJerasure.cc306
-rw-r--r--src/test/osd/TestErasureCodePlugin.cc108
-rw-r--r--src/test/osd/TestErasureCodePluginJerasure.cc74
-rw-r--r--src/test/osd/TestRados.cc72
-rw-r--r--src/test/perf_counters.cc8
-rwxr-xr-xsrc/test/pybind/test_ceph_argparse.py1056
-rw-r--r--src/test/streamtest.cc (renamed from src/streamtest.cc)0
-rw-r--r--src/test/test_arch.c19
-rw-r--r--src/test/test_osd_types.cc3
-rw-r--r--src/test/test_trans.cc (renamed from src/test_trans.cc)0
-rw-r--r--src/test/testclass.cc (renamed from src/testclass.cc)0
-rw-r--r--src/test/testcrypto.cc (renamed from src/testcrypto.cc)0
-rw-r--r--src/test/testkeys.cc (renamed from src/testkeys.cc)0
-rw-r--r--src/test/testmsgr.cc (renamed from src/testmsgr.cc)0
-rw-r--r--src/tools/Makefile.am87
-rw-r--r--src/tools/ceph-filestore-dump.cc135
-rw-r--r--src/tools/ceph-osdomap-tool.cc10
-rw-r--r--src/tools/ceph_authtool.cc (renamed from src/ceph_authtool.cc)0
-rw-r--r--src/tools/ceph_conf.cc (renamed from src/ceph_conf.cc)0
-rw-r--r--src/tools/crushtool.cc (renamed from src/crushtool.cc)6
-rw-r--r--src/tools/dupstore.cc (renamed from src/dupstore.cc)6
-rw-r--r--src/tools/mon_store_converter.cc (renamed from src/mon_store_converter.cc)0
-rw-r--r--src/tools/monmaptool.cc (renamed from src/monmaptool.cc)0
-rw-r--r--src/tools/osdmaptool.cc (renamed from src/osdmaptool.cc)0
-rw-r--r--src/tools/psim.cc (renamed from src/psim.cc)0
-rw-r--r--src/tools/rados/rados.cc (renamed from src/rados.cc)68
-rw-r--r--src/tools/rados/rados_export.cc (renamed from src/rados_export.cc)2
-rw-r--r--src/tools/rados/rados_import.cc (renamed from src/rados_import.cc)2
-rw-r--r--src/tools/rados/rados_sync.cc (renamed from src/rados_sync.cc)2
-rw-r--r--src/tools/rados/rados_sync.h (renamed from src/rados_sync.h)0
-rw-r--r--src/tools/radosacl.cc (renamed from src/radosacl.cc)0
-rw-r--r--src/tools/rest_bench.cc11
-rw-r--r--src/tools/scratchtool.c (renamed from src/scratchtool.c)0
-rw-r--r--src/tools/scratchtoolpp.cc (renamed from src/scratchtoolpp.cc)0
-rwxr-xr-xsrc/vstart.sh96
475 files changed, 30520 insertions, 15783 deletions
diff --git a/.gitignore b/.gitignore
index 211c09cbba7..7e637866366 100644
--- a/.gitignore
+++ b/.gitignore
@@ -69,4 +69,10 @@ web/*.html
# dir from coverity tools
cov-int/
-/test-driver
\ No newline at end of file
+/test-driver
+
+# gtags(1) generated files
+GPATH
+GRTAGS
+GSYMS
+GTAGS
diff --git a/.mailmap b/.mailmap
new file mode 100644
index 00000000000..fc4a1eb9ce9
--- /dev/null
+++ b/.mailmap
@@ -0,0 +1,84 @@
+Sage Weil <sage@inktank.com> <sage@newdream.net>
+Sage Weil <sage@inktank.com> <sage.weil@dreamhost.com>
+Sage Weil <sage@inktank.com> <sageweil@29311d96-e01e-0410-9327-a35deaab8ce9>
+Sage Weil <sage@inktank.com> <sage@29311d96-e01e-0410-9327-a35deaab8ce9>
+Sage Weil <sage@inktank.com> <sage@ceph0.dreamhost.com>
+Sage Weil <sage@inktank.com> <sage@skinny.ops.newdream.net>
+Sage Weil <sage@inktank.com> <sage@foil.westwood.newdream.net>
+Sage Weil <sage@inktank.com> <sage@vapre.localdomain>
+Sage Weil <sage@inktank.com> <sage.weil@inktank.com>
+Yehuda Sadeh <yehuda@inktank.com> <yehuda@hq.newdream.net>
+Yehuda Sadeh <yehuda@inktank.com> <yehuda.sadeh@dreamhost.com>
+Yehuda Sadeh <yehuda@inktank.com> <yehuda@yehuda.infit.com>
+Yehuda Sadeh <yehuda@inktank.com> <yehuda@yehuda>
+Yehuda Sadeh <yehuda@inktank.com> <yehudasa@fatty.ops.newdream.net>
+Yehuda Sadeh <yehuda@inktank.com> <yehudasa@gmail.com>
+Yehuda Sadeh <yehuda@inktank.com> <yehudasa@ceph0.dreamhost.com>
+Colin P. McCabe <colinm@hq.newdream.net> <cmccabe@alumni.cmu.edu>
+Colin P. McCabe <colinm@hq.newdream.net> <cmccabe@fatty.ops.newdream.net>
+Greg Farnum <greg@inktank.com> <gregf@hq.newdream.net>
+Greg Farnum <greg@inktank.com> <gregory.farnum@dreamhost.com>
+Greg Farnum <greg@inktank.com> Gregory Farnum <greg@inktank.com>
+Greg Farnum <greg@inktank.com> <greg@gregs42.com>
+Greg Farnum <greg@inktank.com> <gregf@skinny.ops.newdream.net>
+Greg Farnum <greg@inktank.com> <gfarnum@GF-Macbook.local>
+Samuel Just <sam.just@inktank.com> <samuel.just@dreamhost.com>
+Samuel Just <sam.just@inktank.com> <rexludorum@gmail.com>
+Samuel Just <sam.just@inktank.com> <samuelj@hq.newdream.net>
+Samuel Just <sam.just@inktank.com> <sam.just@dreamhost.com>
+Samuel Just <sam.just@inktank.com> <sam@Pondermatic.(none)>
+John Wilkins <john.wilkins@inktank.com> <john.wilkins@dreamhost.com>
+John Wilkins <john.wilkins@inktank.com> <john@admin-host.(none)>
+John Wilkins <john.wilkins@inktank.com> <johnw@johnw7664.(none)>
+Josh Durgin <josh.durgin@inktank.com> <josh.durgin@dreamhost.com>
+Josh Durgin <josh.durgin@inktank.com> <joshd@hq.newdream.net>
+Dan Mick <dan.mick@inktank.com> <dan.mick@dreamhost.com>
+Dan Mick <dan.mick@inktank.com> <dmick@danceorelse.org>
+Tommi Virtanen <tv@inktank.com> <tommi.virtanen@dreamhost.com>
+Tommi Virtanen <tv@inktank.com> <tv@hq.newdream.net>
+Tommi Virtanen <tv@inktank.com> <tv@eagain.net>
+João Eduardo Luís <joao.luis@inktank.com> <jecluis@gmail.com>
+João Eduardo Luís <joao.luis@inktank.com> Joao Eduardo Luis <joao.luis@inktank.com>
+Sam Lang <sam.lang@inktank.com> <samlang@gmail.com>
+Noah Watkins <noahwatkins@gmail.com> <jayhawk@cs.ucsc.edu>
+Gary Lowell <gary.lowell@inktank.com> <glowell@flab.ops.newdream.net>
+Gary Lowell <gary.lowell@inktank.com> <glowell@inktank.com>
+Patience Warnick <patience@cranium.pelton.net> <patiencew@29311d96-e01e-0410-9327-a35deaab8ce9>
+Wido den Hollander <wido@42on.com> <wido@widodh.nl>
+Michael Rodriguez <michael@newdream.net> <michael@squid.newdream.net>
+Michael Rodriguez <michael@newdream.net> <michael@newdream.net>
+Caleb Miles <caleb.miles@inktank.com> caleb miles <caselim@gmail.com>
+Caleb Miles <caleb.miles@inktank.com> caleb miles <caleb.miles@inktank.com>
+Caleb Miles <caleb.miles@inktank.com> Caleb Miles <caselim@gmail.com>
+Joe Buck <jbbuck@gmail.com> <buck@soe.ucsc.edu>
+Laszlo Boszormenyi <gcs@debian.hu> Laszlo Boszormenyi (GCS) <gcs@debian.hu>
+Roald J. van Loon <roaldvanloon@gmail.com> Roald van Loon <roaldvanloon@gmail.com>
+Alex Elder <elder@inktank.com> <elder@dreamhost.com>
+Alex Elder <elder@inktank.com> <elder@doink.(none)>
+Alex Elder <elder@inktank.com> <elder@speedy.(none)>
+Alexandre Marangone <alexandre.marangone@inktank.com> <a.marangone@gmail.com>
+Alexandre Oliva <oliva@gnu.org> <oliva@lsd.ic.unicamp.br>
+Alexandre Oliva <oliva@gnu.org> <lxoliva@fsfla.org>
+Ross Turk <ross.turk@inktank.com> <ross@inktank.com>
+Ross Turk <ross.turk@inktank.com> <ross.turk@dreamhost.com>
+Patrick McGarry <patrick@inktank.com> <pmcgarry@gmail.com>
+Patrick McGarry <patrick@inktank.com> scuttlemonkey <patrick@inktank.com>
+Mark Nelson <mark.nelson@inktank.com> <mark.a.nelson@gmail.com>
+Tamil Muthamizhan <tamil.muthamizhan@inktank.com> <tamil@ubuntu.(none)>
+Tamil Muthamizhan <tamil.muthamizhan@inktank.com> tamil <tamil.muthamizhan@inktank.com>
+Tamil Muthamizhan <tamil.muthamizhan@inktank.com> <tamil@tamil-VirtualBox.(none)>
+Christian Brunner <christian@brunner-muc.de> <chb@muc.de>
+Henry C Chang <henry_c_chang@tcloudcomputing.com> <henry.cy.chang@gmail.com>
+Alfredo Deza <alfredo.deza@inktank.com> <alfredo@deza.pe>
+Sylvain Munaut <s.munaut@whatever-company.com> <tnt@246tNt.com>
+Erwin, Brock A <Brock.Erwin@pnl.gov> <Brock.Erwin@pnl.govgit>
+Kacper Kowalik <xarthisius@gentoo.org> Kacper Kowalik (Xarthisius) <xarthisius@gentoo.org>
+Neil Levine <neil.levine@inktank.com> <levine@yoyo.org>
+Guilhem Lettron <guilhem@lettron.fr> <guilhem+github@lettron.fr>
+Holger Macht <hmacht@suse.de> <holger@homac.de>
+Volker Assmann <volker@twisted-nerve.de> <volker@stan.local>
+Volker Assmann <volker@twisted-nerve.de> <volker@36-135.mops.RWTH-Aachen.DE>
+Sebastien Han <sebastien.han@enovance.com> <sebastien.han@enovance.com>
+Matthew Roy <matthew@royhousehold.net> <matthew@matthew-ubuntu.(none)>
+Matthew Roy <matthew@royhousehold.net> <mroy@sandbox-ed.com>
+Matthew Wodrich <matthew.wodrich@dreamhost.com> <mattheww@Mattsbox.(none)>
diff --git a/COPYING b/COPYING
index 28d88ebb7fa..a0034d58c3b 100644
--- a/COPYING
+++ b/COPYING
@@ -1,3 +1,8 @@
+Format-Specification: http://anonscm.debian.org/viewvc/dep/web/deps/dep5/copyright-format.xml?revision=279&view=markup
+Name: ceph
+Maintainer: Sage Weil <sage@newdream.net>
+Source: http://ceph.com/
+
Files: *
Copyright: (c) 2004-2010 by Sage Weil <sage@newdream.net>
License: LGPL2.1 (see COPYING-LGPL2.1)
@@ -6,10 +11,6 @@ Files: doc/*
Copyright: (c) 2010-2012 New Dream Network and contributors
License: Creative Commons Attribution-ShareAlike (CC BY-SA)
-Files: src/client/hadoop/ceph
-Copyright: Copyright (C) New Dream Network and contributors
-License: Apache License v2
-
Files: src/mount/canonicalize.c
Copyright: Copyright (C) 1993 Rick Sladkey <jrs@world.std.com>
License: LGPL2 or later
@@ -22,6 +23,10 @@ Files: src/include/ceph_hash.cc
Copyright: None
License: Public domain
+Files: src/common/bloom_filter.hpp
+Copyright: Copyright (C) 2000 Arash Partow <arash@partow.net>
+License: Boost Software License, Version 1.0
+
Files: m4/acx_pthread.m4
Copyright: Steven G. Johnson <stevenj@alum.mit.edu>
License: GPLWithACException
@@ -96,3 +101,40 @@ License:
Files: src/test/common/Throttle.cc src/test/filestore/chain_xattr.cc
Copyright: Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
License: LGPL2 or later
+
+Files: src/osd/ErasureCodePluginJerasure/*.{c,h}
+Copyright: Copyright (c) 2011, James S. Plank <plank@cs.utk.edu>
+License:
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ - Neither the name of the University of Tennessee nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+
+Packaging:
+ Copyright (C) 2004-2009 by Sage Weil <sage@newdream.net>
+ Copyright (C) 2010 Canonical, Ltd.
+ Licensed under LGPL-2.1
diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index ccbe0596b70..a3ec73290f3 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -1,20 +1,23 @@
-v0.68
+v0.71
~~~~~
-* 'ceph osd crush set <id> <weight> <loc..>' no longer adds the osd to the
- specified location, as that's a job for 'ceph osd crush add'. It will
- however continue to work just the same as long as the osd already exists
- in the crush map.
+* The MDS now disallows snapshots by default as they are not
+ considered stable. The command 'ceph mds set allow_snaps' will
+ enable them.
-* The OSD now enforces that class write methods cannot both mutate an
- object and return data. The rbd.assign_bid method, the lone
- offender, has been removed. This breaks compatibility with
- pre-bobtail librbd clients by preventing them from creating new
- images.
+* For clusters that were created before v0.44 (pre-argonaut, Spring
+ 2012) and store radosgw data, the auto-upgrade from TMAP to OMAP
+ objects has been disabled. Before upgrading, make sure that any
+ buckets created on pre-argonaut releases have been modified (e.g.,
+ by PUTing and then DELETEing an object from each bucket). Any
+ cluster created with argonaut (v0.48) or a later release or not
+ using radosgw never relied on the automatic conversion and is not
+ affected by this change.
-* librados now returns on commit instead of ack for synchronous calls.
- This is a bit safer in the case where both OSDs and the client crash, and
- is probably how it should have been acting from the beginning. Users are
- unlikely to notice but it could result in lower performance in some
- circumstances. Those who care should switch to using the async interfaces,
- which let you specify safety semantics precisely.
\ No newline at end of file
+* Any direct users of the 'tmap' portion of the librados API should be
+ aware that the automatic tmap -> omap conversion functionality has
+ been removed.
+
+* Most output that used K or KB (e.g., for kilobyte) now uses a
+ lower-case k to match the official SI convention. Any scripts that
+ parse output and check for an upper-case K will need to be modified.
diff --git a/README b/README
index 1dcf94512ac..3662d0ea2cb 100644
--- a/README
+++ b/README
@@ -97,7 +97,11 @@ To build the documentation, ensure that you are in the top-level `/ceph director
Build Prerequisites
--------------------
+===================
+
+
+debian-based
+------------
To build the source code, you must install the following:
- automake
@@ -132,3 +136,25 @@ To build the source code, you must install the following:
For example:
$ apt-get install automake autoconf pkg-config gcc g++ make libboost-dev libedit-dev libssl-dev libtool libfcgi libfcgi-dev libfuse-dev linux-kernel-headers libcrypto++-dev libaio-dev libgoogle-perftools-dev libkeyutils-dev uuid-dev libatomic-ops-dev libboost-program-options-dev libboost-thread-dev libexpat1-dev libleveldb-dev libsnappy-dev libcurl4-gnutls-dev python-argparse python-flask
+
+rpm-based
+---------
+These are the rpm packages needed to install in an rpm-based OS:
+
+ autoconf
+ automake
+ gcc
+ make
+ libtool
+ python-argparse
+ python-flask
+ libuuid-devel
+ nss-devel
+ fuse-devel
+ gperftools-devel
+ libedit-devel
+ libatomic_ops-devel
+ snappy-devel
+ leveldb-devel
+ libaio-devel
+ boost-devel
diff --git a/admin/doc-requirements.txt b/admin/doc-requirements.txt
index b088df083db..ce4654c6600 100644
--- a/admin/doc-requirements.txt
+++ b/admin/doc-requirements.txt
@@ -1,3 +1,3 @@
-Sphinx >=1.1.2
+Sphinx == 1.1.3
-e git+https://github.com/ceph/sphinx-ditaa.git#egg=sphinx-ditaa
-e git+https://github.com/ceph/asphyxiate.git#egg=asphyxiate
diff --git a/ceph.spec.in b/ceph.spec.in
index 11a962d8bdc..bcb1214cc93 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -37,6 +37,7 @@ BuildRequires: perl
BuildRequires: gdbm
BuildRequires: pkgconfig
BuildRequires: python
+BuildRequires: python-nose
BuildRequires: libaio-devel
BuildRequires: libcurl-devel
BuildRequires: libxml2-devel
@@ -126,7 +127,6 @@ Requires: apache2-mod_fcgid
%else
BuildRequires: expat-devel
BuildRequires: fcgi-devel
-Requires: mod_fcgid
%endif
%description radosgw
radosgw is an S3 HTTP REST gateway for the RADOS object store. It is
@@ -239,14 +239,8 @@ License: LGPL-2.0
Requires: java
Requires: libcephfs_jni1 = %{version}-%{release}
BuildRequires: java-devel
-%if 0%{?suse_version} > 1220
Requires: junit4
BuildRequires: junit4
-%else
-Requires: junit
-BuildRequires: junit
-%endif
-BuildRequires: junit
%description -n cephfs-java
This package contains the Java libraries for the Ceph File System.
@@ -279,7 +273,6 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
--localstatedir=/var \
--sysconfdir=/etc \
--docdir=%{_docdir}/ceph \
- --without-hadoop \
--with-nss \
--without-cryptopp \
--with-rest-bench \
@@ -405,7 +398,6 @@ fi
%{_bindir}/ceph-osd
%{_bindir}/ceph-rbdnamer
%{_bindir}/ceph-dencoder
-%{_bindir}/ceph-rest-api
%{_bindir}/librados-config
%{_bindir}/rados
%{_bindir}/rbd
@@ -423,6 +415,7 @@ fi
/sbin/mount.ceph
%dir %{_libdir}/rados-classes
%{_libdir}/rados-classes/libcls_rbd.so*
+%{_libdir}/rados-classes/libcls_hello.so*
%{_libdir}/rados-classes/libcls_rgw.so*
%{_libdir}/rados-classes/libcls_lock.so*
%{_libdir}/rados-classes/libcls_kvs.so*
diff --git a/configure.ac b/configure.ac
index 5dba606cf3c..1eee4609ec1 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,7 +8,7 @@ AC_PREREQ(2.59)
# VERSION define is not used by the code. It gets a version string
# from 'git describe'; see src/ceph_ver.[ch]
-AC_INIT([ceph], [0.67], [ceph-devel@vger.kernel.org])
+AC_INIT([ceph], [0.70], [ceph-devel@vger.kernel.org])
# Create release string. Used with VERSION for RPMs.
RPM_RELEASE=0
@@ -61,6 +61,8 @@ if test "$CXX" = no || test "$CXX:$GXX" = "g++:"; then
AC_MSG_ERROR([no C++ compiler found])
fi
+AM_CONDITIONAL(CLANG, test x"$CXX" = x"clang++")
+
#AC_PROG_CC
AC_PROG_MAKE_SET
AC_PROG_LIBTOOL
@@ -104,6 +106,13 @@ AC_DEFUN([AC_CHECK_CC_FLAG],
AC_CHECK_CC_FLAG([-Wtype-limits], [WARN_TYPE_LIMITS])
AC_CHECK_CC_FLAG([-Wignored-qualifiers], [WARN_IGNORED_QUALIFIERS])
+# Checks for architecture stuff
+AM_CONDITIONAL([ENABLE_FPU_NEON], [case $target_cpu in arm*) true;; *) false;; esac])
+
+# Check for compiler VTA support
+AX_CHECK_COMPILE_FLAG([-fvar-tracking-assignments], [HAS_VTA_SUPPORT=1], [HAS_VTA_SUPPORT=0])
+AM_CONDITIONAL(COMPILER_HAS_VTA, [test "$HAS_VTA_SUPPORT" = 1])
+
# Checks for libraries.
ACX_PTHREAD
AC_CHECK_LIB([uuid], [uuid_parse], [true], AC_MSG_FAILURE([libuuid not found]))
@@ -324,14 +333,18 @@ if test "x$enable_cephfs_java" = "xyes"; then
# setup defaults for Debian default-jdk package (without --with-jdk-dir)
AS_IF([test -z "$with_jdk_dir"], [
- # This works with Debian's default-jdk package
- dir='/usr/lib/jvm/default-java/'
- javac_prog=`find $dir -name javac | head -n 1`
- AS_IF([test -x "$javac_prog"], [
- EXTRA_JDK_BIN_DIR=`dirname $javac_prog`])
- jnih=`find $dir -name jni.h | head -n 1`
- AS_IF([test -r "$jnih"], [
- EXTRA_JDK_INC_DIR=`dirname $jnih`])])
+ # This works with Debian's and CentOS' default-jdk package
+ for dir in '/usr/lib/jvm/default-java/' '/usr/lib/jvm/java/' ; do
+ # only test if a suitable path has not yet been found
+ AS_IF([test "$EXTRA_JDK_BIN_DIR" == ""], [
+ AS_IF([test -x "$javac_prog"], [
+ EXTRA_JDK_BIN_DIR=`dirname $javac_prog`])
+ jnih=`find $dir -name jni.h | head -n 1`
+ AS_IF([test -r "$jnih"], [
+ EXTRA_JDK_INC_DIR=`dirname $jnih`])
+ ])
+ done
+ ])
# cephfs_java_test only makes sense if java is already turned on
# setup CLASSPATH for Debian default junit4.jar package
@@ -359,10 +372,6 @@ if test "x$enable_cephfs_java" = "xyes"; then
CLASSPATH=$CLASSPATH:$EXTRA_CLASSPATH_JAR
export CLASSPATH
AC_MSG_NOTICE([classpath - $CLASSPATH])
- AS_IF([test "$have_junit4" = "1"], [
- AC_CHECK_CLASS([org.junit.rules.ExternalResource], [], [
- AC_MSG_NOTICE(Could not find org.junit.rules.ExternalResource)
- have_junit4=0])])
# Check for jni.h
CPPFLAGS_save=$CPPFLAGS
@@ -382,25 +391,6 @@ if test "x$enable_cephfs_java" = "xyes"; then
fi
AM_CONDITIONAL(HAVE_JUNIT4, [test "$have_junit4" = "1"])
-# jni?
-# clear cache (from java above) -- this whole thing will get
-# folded into the bigger java package later -- for now maintain
-# backward compat
-AS_UNSET(ac_cv_header_jni_h)
-AC_ARG_WITH([hadoop],
- [AS_HELP_STRING([--with-hadoop], [build hadoop client])],
- [],
- [with_hadoop=check])
-AS_IF([test "x$with_hadoop" != xno],
- [AC_CHECK_HEADER([jni.h],
- [HAVE_JNI=1],
- [if test "x$with_hadoop" != xcheck; then
- AC_MSG_FAILURE(
- [--with-hadoop was given but jni.h not found])
- fi
- ])])
-AM_CONDITIONAL(WITH_HADOOPCLIENT, [test "$HAVE_JNI" = "1"])
-
#
# FreeBSD has it in base.
#
@@ -546,6 +536,9 @@ AC_CHECK_FUNC([fallocate],
[])
+AC_CHECK_HEADERS([sys/prctl.h])
+AC_CHECK_FUNCS([prctl])
+
# Checks for typedefs, structures, and compiler characteristics.
#AC_HEADER_STDBOOL
#AC_C_CONST
diff --git a/debian/changelog b/debian/changelog
index 3203f4271b1..4628bb52175 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,21 @@
+ceph (0.70-1) stable; urgency=low
+
+ * New upstream release
+
+ -- Gary Lowell <gary.lowell@inktank.com> Fri, 04 Oct 2013 20:11:51 +0000
+
+ceph (0.69-1) precise; urgency=low
+
+ * New upstream release
+
+ -- Gary Lowell <gary.lowell@inktank.com> Wed, 18 Sep 2013 01:39:47 +0000
+
+ceph (0.68-1) precise; urgency=low
+
+ * New upstream release
+
+ -- Gary Lowell <gary.lowell@inktank.com> Tue, 03 Sep 2013 16:10:11 -0700
+
ceph (0.67-1) precise; urgency=low
* New upstream release
diff --git a/debian/control b/debian/control
index 44ee725efd4..1aec592c9f8 100644
--- a/debian/control
+++ b/debian/control
@@ -34,6 +34,7 @@ Build-Depends: autoconf,
libxml2-dev,
pkg-config,
python (>= 2.6.6-3~),
+ python-nose,
uuid-dev,
yasm
Standards-Version: 3.9.3
diff --git a/debian/copyright b/debian/copyright
index aa91a149853..d3906c44d35 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -1,15 +1,15 @@
-Format-Specification: http://svn.debian.org/wsvn/dep/web/deps/dep5.mdwn?op=file&rev=135
+Format-Specification: http://anonscm.debian.org/viewvc/dep/web/deps/dep5/copyright-format.xml?revision=279&view=markup
Name: ceph
Maintainer: Sage Weil <sage@newdream.net>
Source: http://ceph.com/
Files: *
Copyright: (c) 2004-2010 by Sage Weil <sage@newdream.net>
-License: LGPL2.1 (see /usr/share/common-licenses/LGPL-2.1)
+License: LGPL2.1 (see COPYING-LGPL2.1)
-Files: src/client/hadoop/ceph
-Copyright: Copyright (C) New Dream Network and contributors
-License: Apache License v2
+Files: doc/*
+Copyright: (c) 2010-2012 New Dream Network and contributors
+License: Creative Commons Attribution-ShareAlike (CC BY-SA)
Files: src/mount/canonicalize.c
Copyright: Copyright (C) 1993 Rick Sladkey <jrs@world.std.com>
@@ -23,6 +23,10 @@ Files: src/include/ceph_hash.cc
Copyright: None
License: Public domain
+Files: src/common/bloom_filter.hpp
+Copyright: Copyright (C) 2000 Arash Partow
+License: Boost Software License, Version 1.0
+
Files: m4/acx_pthread.m4
Copyright: Steven G. Johnson <stevenj@alum.mit.edu>
License: GPLWithACException
@@ -32,25 +36,25 @@ Copyright:
Copyright 2012-2013 Intel Corporation All Rights Reserved.
License: BSD 3-clause
-Files: src/common/sctp_crc32.c:
+Files: src/common/sctp_crc32.c:
Copyright:
Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
Copyright (c) 2004-2006 Intel Corporation - All Rights Reserved
License:
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
-
+
a) Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
-
+
b) Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the distribution.
-
+
c) Neither the name of Cisco Systems, Inc. nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
-
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@@ -92,6 +96,44 @@ License:
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
+
+
+Files: src/test/common/Throttle.cc src/test/filestore/chain_xattr.cc
+Copyright: Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+License: LGPL2 or later
+
+Files: src/osd/ErasureCodePluginJerasure/*.{c,h}
+Copyright: Copyright (c) 2011, James S. Plank <plank@cs.utk.edu>
+License:
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ - Neither the name of the University of Tennessee nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+
Packaging:
Copyright (C) 2004-2009 by Sage Weil <sage@newdream.net>
Copyright (C) 2010 Canonical, Ltd.
diff --git a/do_autogen.sh b/do_autogen.sh
index baf2dc1eba7..bc6749e9e5d 100755
--- a/do_autogen.sh
+++ b/do_autogen.sh
@@ -10,7 +10,6 @@ do_autogen.sh: make a ceph build by running autogen, etc.
level 1: -g
level 3: -Wextra
level 4: even more...
--H --with-hadoop
-T --without-tcmalloc
-e <path> dump encoded objects to <path>
-P profiling build
@@ -46,8 +45,6 @@ do
h) usage
exit 0;;
- H) CONFIGURE_FLAGS="$CONFIGURE_FLAGS --with-hadoop";;
-
T) CONFIGURE_FLAGS="$CONFIGURE_FLAGS --without-tcmalloc";;
j) CONFIGURE_FLAGS="$CONFIGURE_FLAGS --enable-cephfs-java";;
@@ -82,7 +79,11 @@ if [ "${debug_level}" -ge 3 ]; then
-Wno-missing-field-initializers -Wno-missing-declarations"
fi
if [ "${debug_level}" -ge 4 ]; then
- CXXFLAGS="${CXXFLAGS} -Wstrict-null-sentinel -Woverloaded-virtual"
+ if [ "${CXX}" -ne "clang++" ]; then
+ CXXFLAGS="${CXXFLAGS} -Wstrict-null-sentinel -Woverloaded-virtual"
+ else
+ CXXFLAGS="${CXXFLAGS} -Woverloaded-virtual"
+ fi
CFLAGS="${CFLAGS} \
-Wuninitialized -Winit-self \
-Wformat=2 -Wunused -Wfloat-equal \
diff --git a/doc/architecture.rst b/doc/architecture.rst
index 9f57bbbd58a..988475f53b6 100644
--- a/doc/architecture.rst
+++ b/doc/architecture.rst
@@ -387,7 +387,7 @@ steps to compute PG IDs.
#. CRUSH calculates the hash modulo the number of OSDs. (e.g., ``0x58``) to get
a PG ID.
#. CRUSH gets the pool ID given the pool name (e.g., "liverpool" = ``4``)
-#. CRUSH prepends the pool ID to the pool ID to the PG ID (e.g., ``4.0x58``).
+#. CRUSH prepends the pool ID to the PG ID (e.g., ``4.0x58``).
Computing object locations is much faster than performing object location query
over a chatty session. The :abbr:`CRUSH (Controlled Replication Under Scalable
diff --git a/doc/changelog/v0.56.6.txt b/doc/changelog/v0.56.6.txt
new file mode 100644
index 00000000000..17818d2787b
--- /dev/null
+++ b/doc/changelog/v0.56.6.txt
@@ -0,0 +1,40 @@
+commit 95a0bda7f007a33b0dc7adf4b330778fa1e5d70c
+Author: Gary Lowell <gary.lowell@inktank.com>
+Date: Fri May 3 12:53:00 2013 -0700
+
+ v0.56.6
+
+commit 6dbdcf5a210febb5e0dd585e0e599ac807642210
+Author: Gary Lowell <gary.lowell@inktank.com>
+Date: Fri May 3 12:45:24 2013 -0700
+
+ ceph.spec.in: Fix platform dependencies
+
+ Picked up an incorrect dependency merging the rbd udev rules update.
+
+ Signed-off-by: Gary Lowell <gary.lowell@inktank.com>
+
+commit 05af17e697eb95b2a807d9c05cde39106c5ecee9
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Mon Apr 22 12:48:56 2013 -0700
+
+ rgw: don't send tail to gc if copying object to itself
+
+ Fixes: #4776
+ Backport: bobtail
+ Need to make sure that when copying an object into itself we don't
+ send the tail to the garbage collection.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ Reviewed-by: Greg Farnum <greg@inktank.com>
+ (cherry picked from commit de5d1da810732ee48f41e8be18257053d862301b)
+
+commit f0eb20a7b0f7c8afadc21cc063f1f289b5092bab
+Author: Sage Weil <sage@inktank.com>
+Date: Fri May 3 12:24:21 2013 -0700
+
+ ceph_common.sh: re-sync get_name_list with master
+
+ We backported various items but didn't catch all the changes! :(
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
diff --git a/doc/changelog/v0.56.7.txt b/doc/changelog/v0.56.7.txt
new file mode 100644
index 00000000000..b01c0de105b
--- /dev/null
+++ b/doc/changelog/v0.56.7.txt
@@ -0,0 +1,454 @@
+commit 14f23ab86b0058a8651895b3dc972a29459f3a33
+Author: Gary Lowell <gary.lowell@inktank.com>
+Date: Tue Aug 27 10:44:32 2013 -0700
+
+ v0.56.7
+
+commit 8551be345c86837e0893fdf6c9c5b0af523f50f8
+Author: Josh Durgin <josh.durgin@inktank.com>
+Date: Wed Aug 21 14:28:49 2013 -0700
+
+ objecter: resend unfinished lingers when osdmap is no longer paused
+
+ Plain Ops that haven't finished yet need to be resent if the osdmap
+ transitions from full or paused to unpaused. If these Ops are
+ triggered by LingerOps, they will be cancelled instead (since
+ should_resend = false), but the LingerOps that triggered them will not
+ be resent.
+
+ Fix this by checking the registered flag for all linger ops, and
+ resending any of them that aren't paused anymore.
+
+ Fixes: #6070
+ Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
+ Reviewed-by: Sage Weil <sage.weil@inktank.com>
+ (cherry picked from commit 38a0ca66a79af4b541e6322467ae3a8a4483cc72)
+
+commit 1670a73b56e0b407b65334d8f03d3ebb9558ac8b
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Aug 13 12:52:41 2013 -0700
+
+ librados: fix async aio completion wakeup
+
+ For aio flush, we register a wait on the most recent write. The write
+ completion code, however, was *only* waking the waiter if they were waiting
+ on that write, without regard to previous writes (completed or not).
+ For example, we might have 6 and 7 outstanding and wait on 7. If they
+ finish in order all is well, but if 7 finishes first we do the flush
+ completion early. Similarly, if we
+
+ - start 6
+ - start 7
+ - finish 7
+ - flush; wait on 7
+ - finish 6
+
+ we can hang forever.
+
+ Fix by doing any completions that are prior to the oldest pending write in
+ the aio write completion handler.
+
+ Refs: #5919
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+ Tested-by: Oliver Francke <Oliver.Francke@filoo.de>
+ (cherry picked from commit 16ed0b9af8bc08c7dabead1c1a7c1a22b1fb02fb)
+
+commit 6cf05375abea26d2645e3301c9082c64ddf31a55
+Author: Josh Durgin <josh.durgin@inktank.com>
+Date: Mon Aug 12 19:17:09 2013 -0700
+
+ librados: fix locking for AioCompletionImpl refcounting
+
+ Add an already-locked helper so that C_Aio{Safe,Complete} can
+ increment the reference count when their caller holds the
+ lock. C_AioCompleteAndSafe's caller is not holding the lock, so call
+ regular get() to ensure no racing updates can occur.
+
+ This eliminates all direct manipulations of AioCompletionImpl->ref,
+ and makes the necessary locking clear.
+
+ The only place C_AioCompleteAndSafe is used is in handling
+ aio_flush_async(). This could cause a missing completion.
+
+ Refs: #5919
+ Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ Tested-by: Oliver Francke <Oliver.Francke@filoo.de>
+ (cherry picked from commit 7a52e2ff5025754f3040eff3fc52d4893cafc389)
+
+commit 6a37a62b6f794026b82b88630519ec2cde4f20d6
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Mon Aug 12 10:05:44 2013 -0700
+
+ rgw: fix multi delete
+
+ Fixes: #5931
+ Backport: bobtail, cuttlefish
+
+ Fix a bad check, where we compare the wrong field. Instead of
+ comparing the ret code to 0, we compare the string value to 0
+ which generates implicit casting, hence the crash.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+ (cherry picked from commit f9f1c48ad799da2b4be0077bf9d61ae116da33d7)
+
+ Conflicts:
+ src/rgw/rgw_rest_s3.cc
+
+commit 586c68f544c95f9e379df7e4d2705a3090baca49
+Author: Sage Weil <sage@inktank.com>
+Date: Sun Jun 2 21:21:09 2013 -0700
+
+ ceph-fuse: create finisher threads after fork()
+
+ The ObjectCacher and MonClient classes both instantiate Finisher
+ threads. We need to make sure they are created *after* the fork(2)
+ or else the process will fail to join() them on shutdown, and the
+ threads will not exist while fuse is doing useful work.
+
+ Put CephFuse on the heap and move all this initialization into the child
+ block, and make sure errors are passed back to the parent.
+
+ Fix-proposed-by: Alexandre Marangone <alexandre.maragone@inktank.com>
+ Signed-off-by: Sage Weil <sage@inktank.com>
+
+commit c1198d680587928b390bb82c87442384331afd40
+Author: Sage Weil <sage@inktank.com>
+Date: Fri Jun 21 16:22:08 2013 -0700
+
+ debian: update postinst, prerm hooks
+
+ This syncs up the hooks with the latest master versions. In particular,
+ do not blindly stop/restart daemons on package upgrade!
+
+ Fixes: #5414
+ Signed-off-by: Sage Weil <sage@inktank.com>
+
+commit de8900dcd079207852b6ce0b51473037be9ae956
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Jun 18 21:33:09 2013 -0700
+
+ os/FileStore: use fdatasync(2) instead of sync_file_range(2)
+
+ This fixes data corruption on XFS. Backported from
+ ffade3c85dfffa13a16edd9630a52d99eb8a413d.
+
+ Fixes: #4976
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Samuel Just <sam.just@inktank.com>
+
+commit cbbad5b5d917fe74d6cbc50a259f9dbaeda54ca8
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Jun 17 20:06:59 2013 -0700
+
+ chmod +x iogen.sh
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+
+commit fcd65139135d907098a47427d8503ac6d6042f81
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Fri Jun 14 14:53:54 2013 -0700
+
+ rgw: escape prefix correctly when listing objects
+
+ Fixes: #5362
+ When listing objects prefix needs to be escaped correctly (the
+ same as with the marker). Otherwise listing objects with prefix
+ that starts with underscore doesn't work.
+ Backport: bobtail, cuttlefish
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ Reviewed-by: Greg Farnum <greg@inktank.com>
+
+commit a8f9d57a15ad7a69d53aa8fc6090fd1b394b616a
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Sun Mar 31 00:02:15 2013 -0700
+
+ rgw: translate object marker to raw format
+
+ Fixes: #4600
+ Object marker should be treated as an object, so that name is formatted
+ correctly when getting the raw oid.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit b083dece36a050ec15ac41a275aeef0ece1ac009)
+
+commit e1d41901cde97a77fc1fda2d7f6e78a7cea61c5c
+Author: tamil <tamil.muthamizhan@inktank.com>
+Date: Thu Jun 13 13:50:56 2013 -0700
+
+ adding iogen.sh
+
+ Signed-off-by: tamil <tamil.muthamizhan@inktank.com>
+
+commit 32b2f11366418f9a0cc0659be22cf15fbafbe3b1
+Author: Sage Weil <sage@inktank.com>
+Date: Thu Jun 6 16:35:54 2013 -0700
+
+ osd: do not include logbl in scrub map
+
+ This is a potentially huge object/file, usually prefixed by a zeroed region
+ on disk, that is not used by scrub at all. It dates back to
+ f51348dc8bdd5071b7baaf3f0e4d2e0496618f08 (2008) and the original version of
+ scrub.
+
+ This *might* fix #4179. It is not a leak per se, but I observed 1GB
+ scrub messages going over the wire. Maybe the allocations are causing
+ fragmentation, or the sub_op queues are growing.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Samuel Just <sam.just@inktank.com>
+ (cherry picked from commit 0b036ecddbfd82e651666326d6f16b3c000ade18)
+
+commit 5047a4ff16f556ec25b6624fe8f1a07e3ec5f864
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Thu Jun 6 21:53:00 2013 -0700
+
+ rgw: handle deep uri resources
+
+ In case of deep uri resources (ones created beyond a single level
+ of hierarchy, e.g. auth/v1.0) we want to create a new empty
+ handler for the path if no handler exists. E.g., for
+ auth/v1.0 we need to have a handler for 'auth', otherwise
+ the default S3 handler will be used, which we don't want.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ Reviewed-by: Greg Farnum <greg@inktank.com>
+ (cherry picked from commit ad3934e335399f7844e45fcfd17f7802800d2cb3)
+
+commit 3ea45533c18174cb49af52024dae27533971fa01
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Thu Jun 6 21:47:21 2013 -0700
+
+ rgw: fix get_resource_mgr() to correctly identify resource
+
+ Fixes: #5262
+ The original test was not comparing the correct string, ended up
+ with the effect of just checking the substring of the uri to match
+ the resource.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ Reviewed-by: Greg Farnum <greg@inktank.com>
+ (cherry picked from commit 8d55b87f95d59dbfcfd0799c4601ca37ebb025f5)
+
+commit 4ee638cc3b1d74e678a3f78b3a57baf57a5d407c
+Author: Samuel Just <sam.just@inktank.com>
+Date: Mon Apr 15 16:33:48 2013 -0700
+
+ PG: don't write out pg map epoch every handle_activate_map
+
+ We don't actually need to write out the pg map epoch on every
+ activate_map as long as:
+ a) the osd does not trim past the oldest pg map persisted
+ b) the pg does update the persisted map epoch from time
+ to time.
+
+ To that end, we now keep a reference to the last map persisted.
+ The OSD already does not trim past the oldest live OSDMapRef.
+ Second, handle_activate_map will trim if the difference between
+ the current map and the last_persisted_map is large enough.
+
+ Fixes: #4731
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+ Reviewed-by: Greg Farnum <greg@inktank.com>
+
+commit 8c6a912ae46c4d3aeb7c1000d221f67e158ec5c8
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Thu May 30 12:58:11 2013 -0700
+
+ rgw: only append prefetched data if reading from head
+
+ Fixes: #5209
+ Backport: bobtail, cuttlefish
+ If the head object wrongfully contains data, but according to the
+ manifest we don't read from the head, we shouldn't copy the prefetched
+ data. Also fix the length calculation for that data.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ Reviewed-by: Greg Farnum <greg@inktank.com>
+ (cherry picked from commit c5fc52ae0fc851444226abd54a202af227d7cf17)
+
+commit f42e84ee3148dcc59a05c8a12ce39996eb854e26
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Thu May 30 09:34:21 2013 -0700
+
+ rgw: don't copy object idtag when copying object
+
+ Fixes: #5204
+ When copying object we ended up also copying the original
+ object idtag which overrode the newly generated one. When
+ refcount put is called with the wrong idtag the count
+ doesn't go down.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ Reviewed-by: Greg Farnum <greg@inktank.com>
+ (cherry picked from commit b1312f94edc016e604f1d05ccfe2c788677f51d1)
+
+commit d86b9a852b57553448709c2b978ac3a8637cd63f
+Author: Sage Weil <sage@inktank.com>
+Date: Wed May 29 16:50:04 2013 -0700
+
+ osd: initialize new_state field when we use it
+
+ If we use operator[] on a new int field its value is undefined; avoid
+ reading it or using |= et al until we initialize it.
+
+ Fixes: #4967
+ Backport: cuttlefish, bobtail
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: David Zafman <david.zafman@inktank.com>
+ (cherry picked from commit 50ac8917f175d1b107c18ecb025af1a7b103d634)
+
+commit fca54bfbb35f0c200fac52963cd2d1780467f59a
+Author: Samuel Just <sam.just@inktank.com>
+Date: Tue May 28 11:10:05 2013 -0700
+
+ HashIndex: sync top directory during start_split,merge,col_split
+
+ Otherwise, the links might be ordered after the in progress
+ operation tag write. We need the in progress operation tag to
+ correctly recover from an interrupted merge, split, or col_split.
+
+ Fixes: #5180
+ Backport: cuttlefish, bobtail
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 5bca9c38ef5187c7a97916970a7fa73b342755ac)
+
+commit ac6c77830ad9182c7345f10943ea4f537c1629b0
+Merge: 3984b98 0b50d07
+Author: Samuel Just <sam.just@inktank.com>
+Date: Thu May 23 19:29:06 2013 -0700
+
+ Merge remote-tracking branch 'upstream/wip_scrub_tphandle_bobtail' into bobtail
+
+ Fixes: #5159
+ Reviewed-by: Sage Weil <sage@inktank.com>
+
+commit 0b50d073fab11f64682e20fec7bb71d4e931bd3c
+Author: Samuel Just <sam.just@inktank.com>
+Date: Thu May 23 17:40:44 2013 -0700
+
+ PG: ping tphandle during omap loop as well
+
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+
+commit 5979a26d521918d74fc3b62b75c9fb6afe109933
+Author: Samuel Just <sam.just@inktank.com>
+Date: Thu May 23 15:24:39 2013 -0700
+
+ PG: reset timeout in _scan_list for each object, read chunk
+
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+
+commit bde6b58baf5e78550980ee5076bbc2d72a52c0fc
+Author: Samuel Just <sam.just@inktank.com>
+Date: Thu May 23 15:23:05 2013 -0700
+
+ OSD,PG: pass tphandle down to _scan_list
+
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+
+commit 3984b98804bf18711a7a4fc3940143ab599ad5b5
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed May 22 21:34:52 2013 -0700
+
+ rgw: iterate usage entries from correct entry
+
+ Fixes: #5152
+ When iterating through usage entries, and when user id was
+ provided, we started at the user's first entry and not from
+ the entry indexed by the request start time.
+ This commit fixes the issue.
+
+ Backport: bobtail
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ Reviewed-by: Greg Farnum <greg@inktank.com>
+ (cherry picked from commit 8b3a04dec8be13559716667d4b16cde9e9543feb)
+
+commit 3d7f8f840fb9fd975089af32c85093eeb1eac338
+Author: Sage Weil <sage@inktank.com>
+Date: Wed May 22 12:45:27 2013 -0700
+
+ mon: be a bit more verbose about osd mark down events
+
+ Put these in the cluster log; they are interesting.
+
+ This is a backport of 87767fb1fb9a52d11b11f0b641cebbd9998f089e.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+
+commit f21a7f7bf3db61bd533e777297bff1346112a0db
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Fri May 3 12:57:00 2013 -0700
+
+ rgw: protect ops log socket formatter
+
+ Fixes: #4905
+ Ops log (through the unix domain socket) uses a formatter, which wasn't
+ protected.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit d48f1edb07a4d8727ac956f70e663c1b4e33e1dd)
+
+commit 49c04c623466df45003ed2a18445c354c424a078
+Author: Josh Durgin <josh.durgin@inktank.com>
+Date: Thu May 16 15:28:40 2013 -0700
+
+ librbd: make image creation defaults configurable
+
+ Programs using older versions of the image creation functions can't
+ set newer parameters like image format and fancier striping.
+
+ Setting these options lets them use all the new functionality without
+ being patched and recompiled to use e.g. rbd_create3().
+ This is particularly useful for things like qemu-img, which does not
+ know how to create format 2 images yet.
+
+ Refs: #5067
+ backport: cuttlefish, bobtail
+ Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
+ (cherry picked from commit aacc9adc4e9ca90bbe73ac153cc754a3a5b2c0a1)
+
+commit 4d7058fe2254f335969f05bef649b1a27d470aa4
+Author: Josh Durgin <josh.durgin@inktank.com>
+Date: Thu May 16 15:21:24 2013 -0700
+
+ rbd.py: fix stripe_unit() and stripe_count()
+
+ These matched older versions of the functions, but would segfault
+ using the current versions.
+
+ backport: cuttlefish, bobtail
+ Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
+ (cherry picked from commit 53ee6f965e8f06c7256848210ad3c4f89d0cb5a0)
+
+commit 82a16c32a37dc46e3019cedc2a5407ae34f806e2
+Author: Josh Durgin <josh.durgin@inktank.com>
+Date: Thu May 16 15:19:46 2013 -0700
+
+ cls_rbd: make sure stripe_unit is not larger than object size
+
+ Test a few other cases too.
+
+ backport: cuttlefish, bobtail
+ Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
+ (cherry picked from commit 810306a2a76eec1c232fd28ec9c351e827fa3031)
+
+commit e0de00897468a434e94790a86fc812b77a59614c
+Author: Sage Weil <sage@inktank.com>
+Date: Fri May 10 22:14:05 2013 -0700
+
+ mon: fix validation of mds ids in mon commands
+
+ Fixes: #4996
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 5c305d63043762027323052b4bb3ae3063665c6f)
+
+ Conflicts:
+
+ src/mon/MDSMonitor.cc
diff --git a/doc/changelog/v0.67.3.txt b/doc/changelog/v0.67.3.txt
new file mode 100644
index 00000000000..d6b1f2b2742
--- /dev/null
+++ b/doc/changelog/v0.67.3.txt
@@ -0,0 +1,700 @@
+commit 408cd61584c72c0d97b774b3d8f95c6b1b06341a
+Author: Gary Lowell <gary.lowell@inktank.com>
+Date: Mon Sep 9 12:50:11 2013 -0700
+
+ v0.67.3
+
+commit 17a7342b3b935c06610c58ab92a9a1d086923d32
+Merge: b4252bf 10433bb
+Author: Sage Weil <sage@inktank.com>
+Date: Sat Sep 7 13:34:45 2013 -0700
+
+ Merge pull request #574 from dalgaaf/fix/da-dumpling-cherry-picks
+
+ init-radosgw*: fix status return value if radosgw isn't running
+
+ Reviewed-by: Sage Weil <sage@inktank.com>
+
+commit 10433bbe72dbf8eae8fae836e557a043610eb54e
+Author: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+Date: Sat Sep 7 11:30:15 2013 +0200
+
+ init-radosgw*: fix status return value if radosgw isn't running
+
+ Signed-off-by: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+ (cherry picked from commit b5137baf651eaaa9f67e3864509e437f9d5c3d5a)
+
+commit b4252bff79150a95e9d075dd0b5e146ba9bf2ee5
+Author: Samuel Just <sam.just@inktank.com>
+Date: Thu Aug 22 11:19:37 2013 -0700
+
+ FileStore: add config option to disable the wbthrottle
+
+ Backport: dumpling
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 3528100a53724e7ae20766344e467bf762a34163)
+
+commit 699324e0910e5e07a1ac68df8cf1108e5671ec15
+Author: Samuel Just <sam.just@inktank.com>
+Date: Thu Aug 22 11:19:52 2013 -0700
+
+ WBThrottle: use fdatasync instead of fsync
+
+ Backport: dumpling
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit d571825080f0bff1ed3666e95e19b78a738ecfe8)
+
+commit 074717b4b49ae1a55bc867e5c34d43c51edc84a5
+Author: Samuel Just <sam.just@inktank.com>
+Date: Thu Aug 29 15:08:58 2013 -0700
+
+ PGLog: initialize writeout_from in PGLog constructor
+
+ Fixes: 6151
+ Backport: dumpling
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+ Introduced: f808c205c503f7d32518c91619f249466f84c4cf
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 42d65b0a7057696f4b8094f7c686d467c075a64d)
+
+commit c22d980cf42e580818dc9f526327518c0ddf8ff5
+Author: Samuel Just <sam.just@inktank.com>
+Date: Tue Aug 27 08:49:14 2013 -0700
+
+ PGLog: maintain writeout_from and trimmed
+
+ This way, we can avoid omap_rmkeyrange in the common append
+ and trim cases.
+
+ Fixes: #6040
+ Backport: Dumpling
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+ (cherry picked from commit f808c205c503f7d32518c91619f249466f84c4cf)
+
+commit 53c7ab4db00ec7034f5aa555231f9ee167f43201
+Author: Samuel Just <sam.just@inktank.com>
+Date: Tue Aug 27 07:27:26 2013 -0700
+
+ PGLog: don't maintain log_keys_debug if the config is disabled
+
+ Fixes: #6040
+ Backport: Dumpling
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+ (cherry picked from commit 1c0d75db1075a58d893d30494a5d7280cb308899)
+
+commit 40dc489351383c2e35b91c3d4e76b633309716df
+Author: Samuel Just <sam.just@inktank.com>
+Date: Mon Aug 26 23:19:45 2013 -0700
+
+ PGLog: move the log size check after the early return
+
+ There really are stl implementations (like the one on my ubuntu 12.04
+ machine) which have a list::size() which is linear in the size of the
+ list. That assert, therefore, is quite expensive!
+
+ Fixes: #6040
+ Backport: Dumpling
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+ (cherry picked from commit fe68b15a3d82349f8941f5b9f70fcbb5d4bc7f97)
+
+commit 4261eb5ec105b9c27605360910602dc367fd79f5
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Aug 13 17:16:08 2013 -0700
+
+ rbd.cc: relicense as LGPL2
+
+ All past authors for rbd.cc have consented to relicensing from GPL to
+ LGPL2 via email:
+
+ ---
+
+ Date: Sat, 27 Jul 2013 01:59:36 +0200
+ From: Sylvain Munaut <s.munaut@whatever-company.com>
+ Subject: Re: Ceph rbd.cc GPL -> LGPL2 license change
+
+ I hereby consent to the relicensing of any contribution I made to the
+ aforementioned rbd.cc file from GPL to LGPL2.1.
+
+ (I hope that'll be impressive enough, I did my best :p)
+
+ btw, tnt@246tNt.com and s.munaut@whatever-company.com are both me.
+
+ Cheers,
+
+ Sylvain
+
+ ---
+
+ Date: Fri, 26 Jul 2013 17:00:48 -0700
+ From: Yehuda Sadeh <yehuda@inktank.com>
+ Subject: Re: Ceph rbd.cc GPL -> LGPL2 license change
+
+ I consent.
+
+ ---
+
+ Date: Fri, 26 Jul 2013 17:02:24 -0700
+ From: Josh Durgin <josh.durgin@inktank.com>
+ Subject: Re: Ceph rbd.cc GPL -> LGPL2 license change
+
+ I consent.
+
+ ---
+
+ Date: Fri, 26 Jul 2013 18:17:46 -0700
+ From: Stanislav Sedov <stas@freebsd.org>
+ Subject: Re: Ceph rbd.cc GPL -> LGPL2 license change
+
+ I consent.
+
+ Thanks for taking care of it!
+
+ ---
+
+ Date: Fri, 26 Jul 2013 18:24:15 -0700
+ From: Colin McCabe <cmccabe@alumni.cmu.edu>
+
+ I consent.
+
+ cheers,
+ Colin
+
+ ---
+
+ Date: Sat, 27 Jul 2013 07:08:12 +0200
+ From: Christian Brunner <christian@brunner-muc.de>
+ Subject: Re: Ceph rbd.cc GPL -> LGPL2 license change
+
+ I consent
+
+ Christian
+
+ ---
+
+ Date: Sat, 27 Jul 2013 12:17:34 +0300
+ From: Stratos Psomadakis <psomas@grnet.gr>
+ Subject: Re: Ceph rbd.cc GPL -> LGPL2 license change
+
+ Hi,
+
+ I consent with the GPL -> LGL2.1 re-licensing.
+
+ Thanks
+ Stratos
+
+ ---
+
+ Date: Sat, 27 Jul 2013 16:13:13 +0200
+ From: Wido den Hollander <wido@42on.com>
+ Subject: Re: Ceph rbd.cc GPL -> LGPL2 license change
+
+ I consent!
+
+ You have my permission to re-license the code I wrote for rbd.cc to LGPL2.1
+
+ ---
+
+ Date: Sun, 11 Aug 2013 10:40:32 +0200
+ From: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+ Subject: Re: btw
+
+ Hi Sage,
+
+ I agree to switch the license of ceph_argparse.py and rbd.cc from GPL2
+ to LGPL2.
+
+ Regards
+
+ Danny Al-Gaaf
+
+ ---
+
+ Date: Tue, 13 Aug 2013 17:15:24 -0700
+ From: Dan Mick <dan.mick@inktank.com>
+ Subject: Re: Ceph rbd.cc GPL -> LGPL2 license change
+
+ I consent to relicense any contributed code that I wrote under LGPL2.1 license.
+
+ ---
+
+ ...and I consent too. Drop the exception from COPYING and debian/copyright
+ files.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 2206f55761c675b31078dea4e7dd66f2666d7d03)
+
+commit 211c5f13131e28b095a1f3b72426128f1db22218
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Fri Aug 23 15:39:20 2013 -0700
+
+ rgw: flush pending data when completing multipart part upload
+
+ Fixes: #6111
+ Backport: dumpling
+ When completing the part upload we need to flush any data that we
+ aggregated and didn't flush yet. With earlier code we didn't have to deal
+ with it as for multipart upload we didn't have any pending data.
+ What we do now is we call the regular atomic data completion
+ function that takes care of it.
+
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 9a551296e0811f2b65972377b25bb28dbb42f575)
+
+commit 1a9651010aab51c9be2edeccd80e9bd11f5177ce
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Mon Aug 26 19:46:43 2013 -0700
+
+ rgw: check object name after rebuilding it in S3 POST
+
+ Fixes: #6088
+ Backport: bobtail, cuttlefish, dumpling
+
+ When posting an object it is possible to provide a key
+ name that refers to the original filename, however we
+ need to verify that in the end we don't end up with an
+ empty object name.
+
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit c8ec532fadc0df36e4b265fe20a2ff3e35319744)
+
+commit 1bd74a020b93f154b2d4129d512f6334387de7c7
+Author: Sage Weil <sage@inktank.com>
+Date: Thu Aug 22 17:46:45 2013 -0700
+
+ mon/MonClient: release pending outgoing messages on shutdown
+
+ This fixes a small memory leak when we have messages queued for the mon
+ when we shut down. It is harmless except for the valgrind leak check
+ noise that obscures real leaks.
+
+ Backport: dumpling
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 309569a6d0b7df263654b7f3f15b910a72f2918d)
+
+commit 24f2669783e2eb9d9af5ecbe106efed93366ba63
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Thu Aug 29 13:06:33 2013 -0700
+
+ rgw: change watch init ordering, don't distribute if can't
+
+ Backport: dumpling
+
+ Moving back the watch initialization after the zone init,
+ as the zone info holds the control pool name. Since zone
+ init might need to create a new system object (that needs
+ to distribute cache), don't try to distribute cache if
+ watch is not yet initialized.
+
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 1d1f7f18dfbdc46fdb09a96ef973475cd29feef5)
+
+commit a708c8ab52e5b1476405a1f817c23b8845fbaab3
+Author: Sage Weil <sage@inktank.com>
+Date: Fri Aug 30 09:41:29 2013 -0700
+
+ ceph-post-file: use mktemp instead of tempfile
+
+ tempfile is a debian thing, apparently; mktemp is present everywhere.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit e60d4e09e9f11e3c34a05cd122341e06c7c889bb)
+
+commit 625f13ee0d6cca48d61dfd65e00517d092552d1c
+Author: Sage Weil <sage@inktank.com>
+Date: Wed Aug 28 09:50:11 2013 -0700
+
+ mon: discover mon addrs, names during election state too
+
+ Currently we only detect new mon addrs and names during the probing phase.
+ For non-trivial clusters, this means we can get into a sticky spot when
+ we discover enough peers to form a quorum, but not all of them, and the
+ undiscovered ones are enough to break the mon ranks and prevent an
+ election.
+
+ One way to work around this is to continue addr and name discovery during
+ the election. We should also consider making the ranks less sensitive to
+ the undefined addrs; that is a separate change.
+
+ Fixes: #4924
+ Backport: dumpling
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Tested-by: Bernhard Glomm <bernhard.glomm@ecologic.eu>
+ (cherry picked from commit c24028570015cacf1d9e154ffad80bec06a61e7c)
+
+commit 83cfd4386c1fd0fa41aea345704e27f82b524ece
+Author: Dan Mick <dan.mick@inktank.com>
+Date: Thu Aug 22 17:30:24 2013 -0700
+
+ ceph_rest_api.py: create own default for log_file
+
+ common/config thinks the default log_file for non-daemons should be "".
+ Override that so that the default is
+ /var/log/ceph/{cluster}-{name}.{pid}.log
+ since ceph-rest-api is more of a daemon than a client.
+
+ Fixes: #6099
+ Backport: dumpling
+ Signed-off-by: Dan Mick <dan.mick@inktank.com>
+ (cherry picked from commit 2031f391c3df68e0d9e381a1ef3fe58d8939f0a8)
+
+commit 8a1da62d9564a32f7b8963fe298e1ac3ad0ea3d9
+Author: Sage Weil <sage@inktank.com>
+Date: Fri Aug 16 17:59:11 2013 -0700
+
+ ceph-post-file: single command to upload a file to cephdrop
+
+ Use sftp to upload to a directory that only this user and ceph devs can
+ access.
+
+ Distribute an ssh key to connect to the account. This will let us revoke
+ the key in the future if we feel the need. Also distribute a known_hosts
+ file so that users have some confidence that they are connecting to the
+ real ceph drop account and not some third party.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Dan Mick <dan.mick@inktank.com>
+ (cherry picked from commit d08e05e463f1f7106a1f719d81b849435790a3b9)
+
+commit 3f8663477b585dcb528fdd7047c50d9a52d24b95
+Author: Gary Lowell <glowell@inktank.com>
+Date: Thu Aug 22 13:29:32 2013 -0700
+
+ ceph.spec.in: remove trailing paren in previous commit
+
+ Signed-off-by: Gary Lowell <gary.lowell@inktank.com>
+
+commit 23fb908cb3ac969c874ac12755d20ed2f636e1b9
+Author: Gary Lowell <glowell@inktank.com>
+Date: Thu Aug 22 11:07:16 2013 -0700
+
+ ceph.spec.in: Don't invoke debug_package macro on centos.
+
+ If the redhat-rpm-config package is installed, the debuginfo rpms will
+ be built by default. The build will fail when the package is installed
+ and the specfile also invokes the macro.
+
+ Signed-off-by: Gary Lowell <gary.lowell@inktank.com>
+
+commit 11f5853d8178ab60ab948d373c1a1f67324ce3bd
+Author: Sage Weil <sage@inktank.com>
+Date: Sat Aug 24 14:04:09 2013 -0700
+
+ osd: install admin socket commands after signals
+
+ This lets us tell by the presence of the admin socket commands whether
+ a signal will make us shut down cleanly. See #5924.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Samuel Just <sam.just@inktank.com>
+ (cherry picked from commit c5b5ce120a8ce9116be52874dbbcc39adec48b5c)
+
+commit 39adc0195e6016ce36828885515be1bffbc10ae1
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Aug 20 22:39:09 2013 -0700
+
+ ceph-disk: partprobe after creating journal partition
+
+ At least one user reports that a partprobe is needed after creating the
+ journal partition. It is not clear why sgdisk is not doing it, but this
+ fixes ceph-disk for them, and should be harmless for other users.
+
+ Fixes: #5599
+ Tested-by: lurbs in #ceph
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 2af59d5e81c5e3e3d7cfc50d9330d7364659c5eb)
+ (cherry picked from commit 3e42df221315679605d68b2875aab6c7eb6b3cc4)
+
+commit 6a4fe7b9b068ae990d6404921a46631fe9ebcd31
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Aug 20 11:27:23 2013 -0700
+
+ mon/Paxos: always refresh after any store_state
+
+ If we store any new state, we need to refresh the services, even if we
+ are still in the midst of Paxos recovery. This is because the
+ subscription path will share any committed state even when paxos is
+ still recovering. This prevents a race like:
+
+ - we have maps 10..20
+ - we drop out of quorum
+ - we are elected leader, paxos recovery starts
+ - we get one LAST with committed states that trim maps 10..15
+ - we get a subscribe for map 10..20
+ - we crash because 10 is no longer on disk because the PaxosService
+ is out of sync with the on-disk state.
+
+ Fixes: #6045
+ Backport: dumpling
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 981eda9f7787c83dc457f061452685f499e7dd27)
+
+commit 13d396e46ed9200e4b9f21db2f0a8efbc5998d82
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Aug 20 11:27:09 2013 -0700
+
+ mon/Paxos: return whether store_state stored anything
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 7e0848d8f88f156a05eef47a9f730b772b64fbf2)
+
+commit f248383bacff76203fa94716cfdf6cf766da24a7
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Aug 20 11:26:57 2013 -0700
+
+ mon/Paxos: cleanup: use do_refresh from handle_commit
+
+ This avoids duplicated code by using the helper created exactly for this
+ purpose.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit b9dee2285d9fe8533fa98c940d5af7b0b81f3d33)
+
+commit 02608a12d4e7592784148a62a47d568efc24079d
+Author: Sage Weil <sage@inktank.com>
+Date: Thu Aug 15 21:48:06 2013 -0700
+
+ osdc/ObjectCacher: do not merge rx buffers
+
+ We do not try to merge rx buffers currently. Make that explicit and
+ documented in the code that it is not supported. (Otherwise the
+ last_read_tid values will get lost and read results won't get applied
+ to the cache properly.)
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 1c50c446152ab0e571ae5508edb4ad7c7614c310)
+
+commit 0e2bfe71965eeef29b47e8032637ea820a7ce49c
+Author: Sage Weil <sage@inktank.com>
+Date: Thu Aug 15 21:47:18 2013 -0700
+
+ osdc/ObjectCacher: match reads with their original rx buffers
+
+ Consider a sequence like:
+
+ 1- start read on 100~200
+ 100~200 state rx
+ 2- truncate to 200
+ 100~100 state rx
+ 3- start read on 200~200
+ 100~100 state rx
+ 200~200 state rx
+ 4- get 100~200 read result
+
+ Currently this makes us crash on
+
+ osdc/ObjectCacher.cc: 738: FAILED assert(bh->length() <= start+(loff_t)length-opos)
+
+ when processing the second 200~200 bufferhead (it is too big). The
+ larger issue, though, is that we should not be looking at this data at
+ all; it has been truncated away.
+
+ Fix this by marking each rx buffer with the read request that is sent to
+ fill it, and only fill it from that read request. Then the first reply
+ will fill the first 100~100 extent but not touch the other extent; the
+ second read will do that.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit b59f930ae147767eb4c9ff18c3821f6936a83227)
+
+commit 6b51c960715971a0351e8203d4896cb0c4138a3f
+Author: Sage Weil <sage@inktank.com>
+Date: Thu Aug 22 15:54:48 2013 -0700
+
+ mon/Paxos: fix another uncommitted value corner case
+
+ It is possible that we begin the paxos recovery with an uncommitted
+ value for, say, commit 100. During last/collect we discover 100 has been
+ committed already. But also, another node provides an uncommitted value
+ for 101 with the same pn. Currently, we refuse to learn it, because the
+ pn is not strictly > than our current uncommitted pn... even though it is
+ the next last_committed+1 value that we need.
+
+ There are two possible fixes here:
+
+ - make this a >= as we can accept newer values from the same pn.
+ - discard our uncommitted value metadata when we commit the value.
+
+ Let's do both!
+
+ Fixes: #6090
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit fe5010380a3a18ca85f39403e8032de1dddbe905)
+
+commit b3a280d5af9d06783d2698bd434940de94ab0fda
+Author: Sage Weil <sage@inktank.com>
+Date: Fri Aug 23 11:45:35 2013 -0700
+
+ os: make readdir_r buffers larger
+
+ PATH_MAX isn't quite big enough.
+
+ Backport: dumpling, cuttlefish, bobtail
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 99a2ff7da99f8cf70976f05d4fe7aa28dd7afae5)
+
+commit 989a664ef0d1c716cab967f249112f595cf98c43
+Author: Sage Weil <sage@inktank.com>
+Date: Fri Aug 23 11:45:08 2013 -0700
+
+ os: fix readdir_r buffer size
+
+ The buffer needs to be big or else we'll walk all over the stack.
+
+ Backport: dumpling, cuttlefish, bobtail
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 2df66d9fa214e90eb5141df4d5755b57e8ba9413)
+
+ Conflicts:
+
+ src/os/BtrfsFileStoreBackend.cc
+
+commit a4cca31c82bf0e84272e01eb1b3188dfdb5b5615
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Thu Aug 22 10:53:12 2013 -0700
+
+ rgw: fix crash when creating new zone on init
+
+ Moving the watch/notify init before the zone init,
+ as we might need to send a notification.
+
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 3d55534268de7124d29bd365ea65da8d2f63e501)
+
+commit 4cf6996803ef66f2b6083f73593259d45e2740a3
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Mon Aug 19 08:40:16 2013 -0700
+
+ rgw: change cache / watch-notify init sequence
+
+ Fixes: #6046
+ We were initializing the watch-notify (through the cache
+ init) before reading the zone info which was much too
+ early, as we didn't have the control pool name yet. Now
+ simplifying init/cleanup a bit, cache doesn't call watch/notify
+ init and cleanup directly, but rather states its need
+ through a virtual callback.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit d26ba3ab0374e77847c742dd00cb3bc9301214c2)
+
+commit aea6de532b0b843c3a8bb76d10bab8476f0d7c09
+Author: Alexandre Oliva <oliva@gnu.org>
+Date: Thu Aug 22 03:40:22 2013 -0300
+
+ enable mds rejoin with active inodes' old parent xattrs
+
+ When the parent xattrs of active inodes that the mds attempts to open
+ during rejoin lack pool info (struct_v < 5), this field will be filled
+ in with -1, causing the mds to retry fetching a backtrace with a pool
+ number that matches the expected value, which fails and causes the
+ err==-ENOENT branch to be taken and retry pool 1, which succeeds, but
+ with pool -1, and so keeps on bouncing between the two retry cases
+ forever.
+
+ This patch arranges for the mds to go along with pool -1 instead of
+ insisting that it be refetched, enabling it to complete recovery
+ instead of eating cpu, network bandwidth and metadata osd's resources
+ like there's no tomorrow, in what AFAICT is an infinite and very busy
+ loop.
+
+ This is not a new problem: I've had it even before upgrading from
+ Cuttlefish to Dumpling, I'd just never managed to track it down, and
+ force-unmounting the filesystem and then restarting the mds was an
+ easier (if inconvenient) work-around, particularly because it always
+ hit when the filesystem was under active, heavy-ish use (or there
+ wouldn't be much reason for caps recovery ;-)
+
+ There are two issues not addressed in this patch, however. One is
+ that nothing seems to proactively update the parent xattr when it is
+ found to be outdated, so it remains out of date forever. Not even
+ renaming top-level directories causes the xattrs to be recursively
+ rewritten. AFAICT that's a bug.
+
+ The other is that inodes that don't have a parent xattr (created by
+ even older versions of ceph) are reported as non-existing in the mds
+ rejoin message, because the absence of the parent xattr is signaled as
+ a missing inode ("failed to reconnect caps for missing inodes"). I
+ suppose this may cause more serious recovery problems.
+
+ I suppose a global pass over the filesystem tree updating parent
+ xattrs that are out-of-date would be desirable, if we find any parent
+ xattrs still lacking current information; it might make sense to
+ activate it as a background thread from the backtrace decoding
+ function, when it finds a parent xattr that's too out-of-date, or as a
+ separate client (ceph-fsck?).
+
+ Backport: dumpling, cuttlefish
+ Signed-off-by: Alexandre Oliva <oliva@gnu.org>
+ Reviewed-by: Zheng, Yan <zheng.z.yan@intel.com>
+ (cherry picked from commit 617dc36d477fd83b2d45034fe6311413aa1866df)
+
+commit 0738bdf92f5e5eb93add152a4135310ac7ea1c91
+Author: David Disseldorp <ddiss@suse.de>
+Date: Mon Jul 29 17:05:44 2013 +0200
+
+ mds: remove waiting lock before merging with neighbours
+
+ CephFS currently deadlocks under CTDB's ping_pong POSIX locking test
+ when run concurrently on multiple nodes.
+ The deadlock is caused by failed removal of a waiting_locks entry when
+ the waiting lock is merged with an existing lock, e.g:
+
+ Initial MDS state (two clients, same file):
+ held_locks -- start: 0, length: 1, client: 4116, pid: 7899, type: 2
+ start: 2, length: 1, client: 4110, pid: 40767, type: 2
+ waiting_locks -- start: 1, length: 1, client: 4116, pid: 7899, type: 2
+
+ Waiting lock entry 4116@1:1 fires:
+ handle_client_file_setlock: start: 1, length: 1,
+ client: 4116, pid: 7899, type: 2
+
+ MDS state after lock is obtained:
+ held_locks -- start: 0, length: 2, client: 4116, pid: 7899, type: 2
+ start: 2, length: 1, client: 4110, pid: 40767, type: 2
+ waiting_locks -- start: 1, length: 1, client: 4116, pid: 7899, type: 2
+
+ Note that the waiting 4116@1:1 lock entry is merged with the existing
+ 4116@0:1 held lock to become a 4116@0:2 held lock. However, the now
+ handled 4116@1:1 waiting_locks entry remains.
+
+ When handling a lock request, the MDS calls adjust_locks() to merge
+ the new lock with available neighbours. If the new lock is merged,
+ then the waiting_locks entry is not located in the subsequent
+ remove_waiting() call because adjust_locks changed the new lock to
+ include the old locks.
+ This fix ensures that the waiting_locks entry is removed prior to
+ modification during merge.
+
+ Signed-off-by: David Disseldorp <ddiss@suse.de>
+ Reviewed-by: Greg Farnum <greg@inktank.com>
+ (cherry picked from commit 476e4902907dfadb3709ba820453299ececf990b)
+
+commit a0ac88272511d670b5c3756dda2d02c93c2e9776
+Author: Dan Mick <dan.mick@inktank.com>
+Date: Tue Aug 20 11:10:42 2013 -0700
+
+ mon/PGMap: OSD byte counts 4x too large (conversion to bytes overzealous)
+
+ Fixes: #6049
+ Signed-off-by: Dan Mick <dan.mick@inktank.com>
+ (cherry picked from commit eca53bbf583027397f0d5e050a76498585ecb059)
+
+commit 87b19c33ce29e2ca4fc49a2adeb12d3f14ca90a9
+Author: Alfredo Deza <alfredo.deza@inktank.com>
+Date: Fri Aug 23 08:56:07 2013 -0400
+
+ ceph-disk: specify the filetype when mounting
+
+ Signed-off-by: Alfredo Deza <alfredo.deza@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit f040020fb2a7801ebbed23439159755ff8a3edbd)
diff --git a/doc/changelog/v0.67.4.txt b/doc/changelog/v0.67.4.txt
new file mode 100644
index 00000000000..73b997ea304
--- /dev/null
+++ b/doc/changelog/v0.67.4.txt
@@ -0,0 +1,550 @@
+commit ad85b8bfafea6232d64cb7ba76a8b6e8252fa0c7
+Author: Gary Lowell <gary.lowell@inktank.com>
+Date: Thu Oct 3 22:41:31 2013 +0000
+
+ v0.67.4
+
+commit 5cd66d3b4bca92b402c95ab256fbc3f0329c446f
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Fri Sep 20 14:04:47 2013 -0700
+
+ rgw: fix keystone token expiration test
+
+ Fixes: #6360
+ The test was inverted, need expiration to be greater than
+ current time in order for token to be valid.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+
+commit e0203c61a3f45fdd6d3d3ece26fef6152bdc036d
+Author: David Zafman <david.zafman@inktank.com>
+Date: Wed Sep 11 16:55:06 2013 -0700
+
+ osd/OSD.cc: Use MIN() so that we don't exceed osd_recovery_max_active
+
+ Caused by 944f3b73531af791c90f0f061280160003545c63
+
+ Fixes: #6291
+
+ Backport: dumpling
+
+ Signed-off-by: David Zafman <david.zafman@inktank.com>
+ Reviewed-by: Samuel Just <sam.just@inktank.com>
+ (cherry picked from commit 139a714e13aa3c7f42091270b55dde8a17b3c4b8)
+
+ Conflicts:
+
+ src/osd/OSD.cc
+
+commit c376708358cedb5561fbb43e9b9e622df3ea7a58
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Wed Sep 25 22:08:24 2013 +0100
+
+ mon: OSDMonitor: do not write full_latest during trim
+
+ On commit 81983bab we patched OSDMonitor::update_from_paxos() such that we
+ write the latest full map version to 'full_latest' each time the latest
+ full map was built from the incremental versions.
+
+ This change however clashed with OSDMonitor::encode_trim_extra(), which
+ also wrote to 'full_latest' on each trim, writing instead the version of
+ the *oldest* full map. This duality of behaviors could lead the store
+ to an inconsistent state across the monitors (although there's no sign of
+ it actually imposing any issues besides rebuilding already existing full
+ maps on some monitors).
+
+ We now stop OSDMonitor::encode_trim_extra() from writing to 'full_latest'.
+ This function will still write out the oldest full map it has in the store,
+ but it will no longer write to full_latest, instead leaving it up to
+ OSDMonitor::update_from_paxos() to figure it out -- and it already does.
+
+ Fixes: #6378
+
+ Backport: dumpling
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit bd0f29a2c28cca496ec830eac932477ebf3182ba)
+
+commit de40d0b3e35ab0124cd3c4ebfcaa435ab8abfab9
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Oct 1 15:53:42 2013 -0700
+
+ crush: invalidate rmap on create (and thus decode)
+
+ If we have an existing CrushWrapper object and decode from a bufferlist,
+ reset build_rmaps so that they get rebuilt.
+
+ Remove the build_rmaps() call in decode that was useless on a redecode
+ (because have_rmaps == true in that case and it did nothing).
+
+ Fixes: #6442
+ Backport: dumpling, maybe cuttlefish
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 9b7a2ae329b6a511064dd3d6e549ba61f52cfd21)
+
+commit 32f5233288c47d95b87c0a9cab5f9c2ffcf15417
+Author: Dan Mick <dan.mick@inktank.com>
+Date: Mon Sep 30 14:58:11 2013 -0700
+
+ Invoke python with /usr/bin/env python instead of directly
+
+ Fixes: #6311
+ Signed-off-by: Dan Mick <dan.mick@inktank.com>
+ (cherry picked from commit b9000b314b9166845ff302d4a827a996775d9a14)
+
+commit 66aeca5a9079be398403bbff67bd5bf68c6fb111
+Author: Sage Weil <sage@inktank.com>
+Date: Wed Sep 25 10:10:21 2013 -0700
+
+ qa/workunits/mon/crush_ops.sh: fix test
+
+ Fix root.
+
+ Fixes: #6392
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit c8cae87e9e08468cc86145e0fd60c05d12826239)
+
+commit beb366302a125dd422c4f092b12eb541cb3bc788
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Sep 23 09:04:34 2013 -0700
+
+ Revert "ceph: parse CEPH_ARGS environment variable"
+
+ This reverts commit 67a95b9880c9bc6e858150352318d68d64ed74ad.
+
+ We now put CEPH_ARGS in the actual args we parse in python, which are passed
+ to rados piecemeal later. This lets you put things like --id ... in there
+ that need to be parsed before librados is initialized.
+ (cherry picked from commit 97f462be4829f0167ed3d65e6694dfc16f1f3243)
+
+commit b475ff9576f145d31c053213c699e13df76d2bcb
+Author: Benoît Knecht <benoit.knecht@fsfe.org>
+Date: Mon Sep 23 15:58:42 2013 +0200
+
+ Add CEPH_ARGS at the end of sys.argv
+
+ This allows, for instance, to pass a different client name to ceph by
+ exporting CEPH_ARGS="--id client_id".
+
+ Signed-off-by: Benoît Knecht <benoit.knecht@fsfe.org>
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 30abe3244c86cbbe1f5b005850c29c9c0eafcad4)
+
+commit 94548b4b67cca37366c7d8719209a6d2e7956811
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Sep 24 15:26:03 2013 -0700
+
+ mon/OSDMonitor: fix 'ceph osd crush reweight ...'
+
+ The adjust method returns a count of adjusted items.
+
+ Add a test.
+
+ Fixes: #6382
+ Backport: dumpling
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Dan Mick <dan.mick@inktank.com>
+ (cherry picked from commit 3de32562b55c6ece3a6ed783c36f8b9f21460339)
+
+commit 00ff7f5c20e13869d0694379739ba4e61d44b97c
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Tue Sep 10 00:20:41 2013 +0100
+
+ qa: workunits: mon: crush_ops: test 'ceph osd crush move'
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 3bc618b7b46496c5110edde0da9cae5d3e68e0e1)
+
+commit 0ff5b4a96833681e92cc41f019a569134474f4cf
+Author: Loic Dachary <loic@dachary.org>
+Date: Tue Sep 24 19:04:23 2013 +0200
+
+ osd: change warn_interval_multiplier to uint32_t
+
+ to prevent overflow in OpTracker::check_ops_in_flight when
+ multiplying warn_interval_multiplier *= 2
+
+ Backport: cuttlefish, dumpling
+
+ http://tracker.ceph.com/issues/6370 fixes #6370
+
+ Signed-off-by: Loic Dachary <loic@dachary.org>
+ (cherry picked from commit 1bce1f009bffd3e28025a08775fec189907a81db)
+
+commit fb15040b6cec6221baa550ddfffade823f784c4a
+Author: David Zafman <david.zafman@inktank.com>
+Date: Mon Sep 9 13:01:12 2013 -0700
+
+ crushtool: do not dump core with non-unique bucket IDs
+
+ Return -EEXIST on duplicate ID
+ BUG FIX: crush_add_bucket() mixes error returns and IDs
+ Add optional argument to return generated ID
+
+ Fixes: #6246
+
+ Signed-off-by: David Zafman <david.zafman@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 8c76f3a0f9cf100ea2c941dc2b61c470aa5033d7)
+
+commit 410db3f30c6eb54b807908c1f251ad4026e7d446
+Author: Joao Eduardo Luis <jecluis@gmail.com>
+Date: Fri Sep 20 17:06:30 2013 +0100
+
+ qa: workunits: cephtool: check if 'heap' commands are parseable
+
+ Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com>
+ (cherry picked from commit b1eeaddd5f214c1b0883b44fc8cae07c649be7c4)
+
+commit 062060a38bb26ff260cc51accc534413d726de49
+Author: Joao Eduardo Luis <jecluis@gmail.com>
+Date: Fri Sep 20 17:50:27 2013 +0100
+
+ osd: OSD: add 'heap' command to known osd commands array
+
+ Must have been forgotten during the cli rework.
+
+ Backport: dumpling
+
+ Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com>
+ (cherry picked from commit 296f2d0db31e9f5a59a3a62a1e95b6c440430fa3)
+
+commit 3f32f57b98e0224a1d30b2a81d7d260be0f53800
+Author: Joao Eduardo Luis <jecluis@gmail.com>
+Date: Fri Sep 20 16:43:27 2013 +0100
+
+ mds: MDS: pass only heap profiler commands instead of the whole cmd vector
+
+ The heap profiler doesn't care, nor should it, what our command name is.
+ It only cares about the commands it handles.
+
+ Backport: dumpling
+
+ Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com>
+ (cherry picked from commit 238fe272c6bdb62d4e57fd8555c0136de99c8129)
+
+commit 46dcc46617d8f35ab8433540b22343ddcbcc3716
+Author: Joao Eduardo Luis <jecluis@gmail.com>
+Date: Fri Sep 20 16:41:14 2013 +0100
+
+ perfglue/heap_profiler.cc: expect args as first element on cmd vector
+
+ We used to pass 'heap' as the first element of the cmd vector when
+ handling commands. We haven't been doing so for a while now, so we
+ needed to fix this.
+
+ Not expecting 'heap' also makes sense, considering that what we need to
+ know when we reach this function is what command we should handle, and
+ we should not care what the caller calls us when handling his business.
+
+ Fixes: #6361
+ Backport: dumpling
+
+ Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com>
+ (cherry picked from commit c98b910d49bd2b46ceafdc430044a31524c29f5b)
+
+commit 9dc5f15fbae22244ad1f62925e17c9d81e856e55
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Mon Sep 16 14:35:25 2013 -0700
+
+ rgw: destroy get_obj handle in copy_obj()
+
+ Fixes: #6176
+ Backport: dumpling
+ We take different code paths in copy_obj, make sure we close the handle
+ when we exit the function. Move the call to finish_get_obj() out of
+ copy_obj_data() as we don't create the handle there, so that should
+ makes code less confusing and less prone to errors.
+ Also, note that RGWRados::get_obj() also calls finish_get_obj(). For
+ everything to work in concert we need to pass a pointer to the handle
+ and not the handle itself. Therefore we needed to also change the call
+ to copy_obj_data().
+
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 9e98620e4325d15c88440a890b267131613e1aa1)
+
+commit 471233e98a9f64ad513a4a196b7661b80534cb00
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Mon Sep 9 23:14:11 2013 +0100
+
+ mon: MonCommands: expect a CephString as 1st arg for 'osd crush move'
+
+ Fixes: #6230
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 7d3799fde19138f957f26ec6be10a8a0000fc1f0)
+
+commit 2908225092bd2aa1b8afcb7848c1cdac5bd9e638
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Sep 23 16:23:33 2013 -0700
+
+ osd: revert 'osd max xattr size' limit
+
+ Set it to 0 (unlimited) for now.
+
+ Backport: dumpling
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit abb88d70643c3a76435b7a9d5b04ff29f7502361)
+
+commit b3d3b3747c1eef695138dac828e5fcb435309c7b
+Author: Greg Farnum <greg@inktank.com>
+Date: Wed Sep 11 16:24:32 2013 -0700
+
+ mds: be more careful about decoding LogEvents
+
+ We need to wrap the full decode section or we can abort the process
+ if there's an issue (which we may want to just skip by).
+
+ Signed-off-by: Greg Farnum <greg@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 73289b34b0be5b6612e38944794d59b5e789f841)
+
+commit 06c58132199ed22413b509dfa751321ccdb24225
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Tue Sep 17 17:58:20 2013 +0100
+
+ mon: OSDMonitor: multiple rebuilt full maps per transaction
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 0d20cae0be701c5b6151a26ee5e4fe24d89aa20a)
+
+commit 65bbcaf4b68790dae4506c1f5db237077e1ff0ae
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Sun Sep 15 21:03:50 2013 +0100
+
+ mon: OSDMonitor: update latest_full while rebuilding full maps
+
+ Not doing so will make the monitor rebuild the osdmap full versions, even
+ though they may have been rebuilt before, every time the monitor starts.
+
+ This mostly happens when the cluster is left in an unhealthy state for
+ a long period of time and incremental versions build up. Even though we
+ build the full maps on update_from_paxos(), not updating 'full_latest'
+ leads to the situation initially described.
+
+ Fixes: #6322
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 81983bab3630520d6c7ee9b7e4a747bc17b8c5c3)
+
+commit 9b9edb04581cca15e67c567332529f5b3f426743
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Sun Sep 15 21:00:55 2013 +0100
+
+ mon: OSDMonitor: smaller transactions when rebuilding full versions
+
+ Otherwise, for considerably sized rebuilds, the monitor will not only
+ consume vast amounts of memory, but it will also have troubles committing
+ the transaction. Anyway, it's also a good idea to adjust transactions to
+ the granularity we want, and to be fair we care that each rebuilt full map
+ gets to disk, even if subsequent full maps don't (those can be rebuilt
+ later).
+
+ Fixes: #6323
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 4ac1570c5cdcd6556dc291cc6d7878fd92d343ae)
+
+commit 298811f7a15541b9ec1015c416ad2aa075be5691
+Author: Joao Eduardo Luis <jecluis@gmail.com>
+Date: Wed Aug 28 15:51:01 2013 +0100
+
+ mon: OSDMonitor: check if pool is on unmanaged snaps mode on mk/rmsnap
+
+ Backport: dumpling
+ Fixes: #6047
+
+ Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com>
+ (cherry picked from commit fab79543c54c2e446d3f76520d7906645c6b0075)
+
+commit a992664435db9dde3745eb7f354cce3fc5400a47
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Thu Sep 12 14:32:17 2013 -0700
+
+ lru_map: don't use list::size()
+
+ replace list::size() with map::size(), which should have
+ a constant time complexity.
+
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 7c1d2ded8fa8061bf3f14932800998b963745dd1)
+
+commit 788546ea71c994ff35323747294ed9c177fe7020
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Thu Sep 12 14:30:19 2013 -0700
+
+ common/lru_map: rename tokens to entries
+
+ This code was originally used in a token cache, now
+ as a generic infrastructure rename token fields.
+
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 532e41a9985a16b35a6e49cdcba38af0ad166fa8)
+
+commit babeb00c42af760b3e7575166479e95365cfcc0a
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Sep 18 10:37:21 2013 -0700
+
+ rgw: use bufferlist::append() instead of bufferlist::push_back()
+
+ push_back() expects char *, whereas append can append a single char.
+ Appending a NULL char to push_back is cast as a NULL pointer which is
+ bad.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+ (cherry picked from commit 08fe028bad13096d482454a2f303158727c363ff)
+
+commit daf85c45dd4d158bc7c33a2fb784857bc7db35cd
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Sep 11 13:46:31 2013 -0700
+
+ rgw: NULL terminate buffer before parsing it
+
+ Fixes: #6175
+ Backport: dumpling
+ We get a buffer off the remote gateway which might
+ not be NULL terminated. The JSON parser needs the
+ buffer to be NULL terminated even though we provide
+ a buffer length as it calls strlen().
+
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit e7f7483192cddca1159aba439ce62b1e78669d51)
+
+commit c73040a5518971813b9ebaae1624c5bacef315d0
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Sep 11 22:30:12 2013 -0700
+
+ rgw: don't call list::size() in ObjectCache
+
+ Fixes: #6286
+ Use an external counter instead of calling list::size()
+
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 31e3a51e933429d286104fe077e98ea883437ad6)
+
+commit a855aba9d18936e9a060119e041518790cd4b831
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Tue Sep 10 12:18:55 2013 -0700
+
+ rgw: drain pending requests before completing write
+
+ Fixes: #6268
+ When doing aio write of objects (either regular or multipart parts) we
+ need to drain pending aio requests. Otherwise if gateway goes down then
+ object might end up corrupted.
+
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 626669afaa333d73707553a85f5c874e99e9cbd8)
+
+commit 670db7e80ddc9c26c43a4f66907a5996ce207c4d
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Fri Sep 6 22:33:38 2013 -0700
+
+ rgw: fix get cors, delete cors
+
+ Remove a couple of variables that overrode class member. Not
+ really clear how it was working before, might have been a bad
+ merge / rebase.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 13872785aeeddbe1b8dd97e49fd6a2d879514f8d)
+
+commit a304016fa01b02efd500135c00b9bf3407a9999c
+Merge: 408cd61 ac0a30f
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Sep 11 09:47:10 2013 -0700
+
+ Merge branch 'wip-6078-dumpling' into dumpling
+
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+
+commit ac0a30feb8c64a3b80d9c519a7b561213403afab
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Aug 28 21:25:20 2013 -0700
+
+ rgw: fix certain return status cases in CORS
+
+ Change return values in certain cases, reorder
+ checks, etc.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit 13b28cc3f1eb8ef42875b630c485ee0105cd244a
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Aug 28 21:24:36 2013 -0700
+
+ rgw: add COPY method to be handled by CORS
+
+ Was missing this http method.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit d45c87ea738807487e72c0719b0d3d459cbe19e9
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Tue Aug 27 19:38:45 2013 -0700
+
+ rgw: fix CORS rule check
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit 986fa92a7a1d88111ba28457160adfcfdaabc5d2
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Tue Aug 27 19:38:18 2013 -0700
+
+ rgw: don't handle CORS if rule not found (is NULL)
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit 71873aba6553492d3ad71596cefd7c841030a277
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Thu Aug 22 13:38:55 2013 -0700
+
+ rgw: tie CORS header response to all relevant operations
+
+ Have the CORS responses on all relevant operations. Also add headers
+ on failure cases.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit 94e7b594d85dbd26e58d823b41f418032e9f163f
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Thu Aug 22 10:00:53 2013 -0700
+
+ rgw: add a generic CORS response handling
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit c3385d8a102faf5379559bb98cf89637ceda1579
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Aug 21 17:22:46 2013 -0700
+
+ rgw: OPTIONS request doesn't need to read object info
+
+ This is a bucket-only operation, so we shouldn't look at the
+ object. Object may not exist and we might respond with Not
+ Exists response which is not what we want.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit a5fdd44e5d8ce4b8d82273d83e27aea19e63aa7c
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Aug 21 14:43:28 2013 -0700
+
+ rgw: remove use of s->bucket_cors
+
+ Some old code still tried to use s->bucket_cors, which was
+ abandoned in a cleanup work.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
diff --git a/doc/dev/cache-pool.rst b/doc/dev/cache-pool.rst
new file mode 100644
index 00000000000..4433d7114ea
--- /dev/null
+++ b/doc/dev/cache-pool.rst
@@ -0,0 +1,70 @@
+Cache pool
+==========
+
+Purpose
+-------
+
+Use a pool of fast storage devices (probably SSDs) and use it as a
+cache for an existing larger pool.
+
+We should be able to create and add a cache pool to an existing pool
+of data, and later remove it, without disrupting service or migrating
+data around.
+
+Use cases
+---------
+
+Read-write pool, writeback
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We have an existing data pool and put a fast cache pool "in front" of it. Writes will
+go to the cache pool and immediately ack. We flush them back to the data pool based on
+some policy.
+
+Read-only pool, weak consistency
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We have an existing data pool and add one or more read-only cache
+pools. We copy data to the cache pool(s) on read. Writes are
+forwarded to the original data pool. Stale data is expired from the
+cache pools based on some as-yet undetermined policy.
+
+This is likely only useful for specific applications with specific
+data access patterns. It may be a match for rgw, for example.
+
+
+Interface
+---------
+
+Set up a read/write cache pool foo-hot for pool foo::
+
+ ceph osd tier add foo foo-hot
+ ceph osd tier cache-mode foo-hot writeback
+ ceph osd tier cache-target-size foo-hot 10G
+ ceph osd tier cache-target-dirty foo-hot 1G
+
+Direct all traffic for foo to foo-hot::
+
+ ceph osd tier set-overlay foo foo-hot
+
+Drain the cache in preparation for turning it off::
+
+ ceph osd tier cache-mode foo-hot invalidate+forward
+ ceph osd tier cache-target-size foo-hot 0 # do not cache any new items
+
+When the cache pool is finally empty, disable it::
+
+ ceph osd tier remove-overlay foo
+ ceph osd tier remove foo foo-hot
+
+Read-only pools with lazy consistency::
+
+ ceph osd tier add foo foo-east
+ ceph osd tier cache-mode foo-east readonly
+ ceph osd tier add foo foo-west
+ ceph osd tier cache-mode foo-west readonly
+
+Set up a cold storage tier::
+
+ ceph osd tier add foo foo-cold
+
diff --git a/doc/dev/corpus.rst b/doc/dev/corpus.rst
index 92f480a01a3..64f71c12fdf 100644
--- a/doc/dev/corpus.rst
+++ b/doc/dev/corpus.rst
@@ -22,71 +22,71 @@ We can generate an object corpus for a particular version of ceph like so.
#. Checkout a clean repo (best not to do this where you normally work)::
- git clone ceph.git
- cd ceph
- git submodule update --init
+ git clone ceph.git
+ cd ceph
+ git submodule update --init
#. Build with flag to dump objects to /tmp/foo::
- rm -rf /tmp/foo ; mkdir /tmp/foo
- ./do_autogen.sh -e /tmp/foo
- make
+ rm -rf /tmp/foo ; mkdir /tmp/foo
+ ./do_autogen.sh -e /tmp/foo
+ make
#. Start via vstart::
- cd src
- MON=3 OSD=3 MDS=3 RGW=1 ./vstart.sh -n -x
+ cd src
+ MON=3 OSD=3 MDS=3 RGW=1 ./vstart.sh -n -x
#. Use a much functionality of the cluster as you can, to exercise as many object encoder methods as possible::
- ./rados -p rbd bench 10 write -b 123
- ./ceph osd out 0
- ./init-ceph stop osd.1
- for f in ../qa/workunits/cls/*.sh ; do PATH=".:$PATH" $f ; done
- ../qa/workunits/rados/test.sh
- ./ceph_test_librbd
- ./ceph_test_libcephfs
- ./init-ceph restart mds.a
+ ./rados -p rbd bench 10 write -b 123
+ ./ceph osd out 0
+ ./init-ceph stop osd.1
+ for f in ../qa/workunits/cls/*.sh ; do PATH=".:$PATH" $f ; done
+ ../qa/workunits/rados/test.sh
+ ./ceph_test_librbd
+ ./ceph_test_libcephfs
+ ./init-ceph restart mds.a
Do some more stuff with rgw if you know how.
#. Stop::
- ./stop.sh
+ ./stop.sh
#. Import the corpus (this will take a few minutes)::
- test/encoding/import.sh /tmp/foo `./ceph-dencoder version` ../ceph-object-corpus/archive
- test/encoding/import-generated.sh ../ceph-object-corpus/archive
+ test/encoding/import.sh /tmp/foo `./ceph-dencoder version` ../ceph-object-corpus/archive
+ test/encoding/import-generated.sh ../ceph-object-corpus/archive
#. Prune it! There will be a bazillion copies of various objects, and we only want a representative sample.::
- pushd ../ceph-object-corpus
- bin/prune-archive.sh
- popd
+ pushd ../ceph-object-corpus
+ bin/prune-archive.sh
+ popd
#. Verify the tests pass::
- make check-local
+ make check-local
#. Commit it to the corpus repo and push::
- pushd ../ceph-object-corpus
- git checkout -b wip-new
- git add archive/`../src/ceph-dencoder version`
- git commit -m `../src/ceph-dencoder version`
- git remote add cc ceph.com:/git/ceph-object-corpus.git
- git push cc wip-new
- popd
+ pushd ../ceph-object-corpus
+ git checkout -b wip-new
+ git add archive/`../src/ceph-dencoder version`
+ git commit -m `../src/ceph-dencoder version`
+ git remote add cc ceph.com:/git/ceph-object-corpus.git
+ git push cc wip-new
+ popd
#. Go test it out::
- cd my/regular/tree
- cd ceph-object-corpus
- git fetch origin
- git checkout wip-new
- cd ../src
- make check-local
+ cd my/regular/tree
+ cd ceph-object-corpus
+ git fetch origin
+ git checkout wip-new
+ cd ../src
+ make check-local
#. If everything looks good, update the submodule master branch, and commit the submodule in ceph.git.
diff --git a/doc/dev/generatedocs.rst b/doc/dev/generatedocs.rst
index 16a90792c5b..2a0b68aa8a8 100644
--- a/doc/dev/generatedocs.rst
+++ b/doc/dev/generatedocs.rst
@@ -20,7 +20,7 @@ on your local host. To install ``git``, execute::
To clone the Ceph repository, execute::
- git clone git://ceph/ceph.git
+ git clone git://github.com/ceph/ceph
You should have a full copy of the Ceph repository.
diff --git a/doc/dev/mon-bootstrap.rst b/doc/dev/mon-bootstrap.rst
index 9ce0070b791..0a4a9a2981e 100644
--- a/doc/dev/mon-bootstrap.rst
+++ b/doc/dev/mon-bootstrap.rst
@@ -42,7 +42,7 @@ with a command like::
When creating a new monitor cluster, the keyring should also contain a ``client.admin`` key that can be used
to administer the system::
- ceph-authtool /path/to/keyring --gen-key -n client.admin
+ ceph-authtool /path/to/keyring --gen-key -n client.admin --set-uid=0 --cap mon 'allow *' --cap osd 'allow *' --cap mds 'allow'
The resulting keyring is fed to ``ceph-mon --mkfs`` with the ``--keyring <keyring>`` command-line argument.
diff --git a/doc/dev/osd_internals/erasure_coding.rst b/doc/dev/osd_internals/erasure_coding.rst
index cc1efe4b4bf..0586c46c3bb 100644
--- a/doc/dev/osd_internals/erasure_coding.rst
+++ b/doc/dev/osd_internals/erasure_coding.rst
@@ -3,8 +3,8 @@ Erasure Coded Placement Groups
==============================
The documentation of the erasure coding implementation in Ceph was
-created in July 2013. It is included in Ceph even before erasure
-coding is available because it drives a number of architectural
+created in July 2013. It is included in Ceph even before erasure coded
+pools are available because it drives a number of architectural
changes. It is meant to be updated to reflect the `progress of these
architectural changes <http://tracker.ceph.com/issues/4929>`_, up to
the point where it becomes a reference of the erasure coding
@@ -14,8 +14,14 @@ Glossary
--------
*chunk*
- when the encoding function is called, it returns chunks of the
- same size.
+ when the encoding function is called, it returns chunks of the same
+ size: data chunks, which can be concatenated to reconstruct the original
+ object, and coding chunks, which can be used to rebuild a lost chunk.
+
+*chunk rank*
+ the index of a chunk when returned by the encoding function. The
+ rank of the first chunk is 0, the rank of the second chunk is 1
+ etc.
*stripe*
when an object is too large to be encoded with a single call,
@@ -23,9 +29,13 @@ Glossary
called a stripe.
*shard|strip*
- the file that holds all chunks of a same rank for a given object.
+ an ordered sequence of chunks of the same rank from the same
+ object. For a given placement group, each OSD contains shards of
+ the same rank. When dealing with objects that are encoded with a
+ single operation, *chunk* is sometimes used instead of *shard*
+ because the shard is made of a single chunk.
-Example:
+The definitions are illustrated as follows:
::
OSD 40 OSD 33
@@ -53,6 +63,6 @@ Table of content
.. toctree::
:maxdepth: 1
- High level design document <erasure_coding/pgbackend>
Developer notes <erasure_coding/developer_notes>
- Draft PGBackend.h header <erasure_coding/PGBackend-h>
+ Jerasure plugin <erasure_coding/jerasure>
+ High level design document <erasure_coding/pgbackend>
diff --git a/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst b/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst
deleted file mode 100644
index b39cdb0e88e..00000000000
--- a/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst
+++ /dev/null
@@ -1,156 +0,0 @@
-===========
-PGBackend.h
-===========
-
-Work in progress:
-::
-
- /**
- * PGBackend
- *
- * PGBackend defines an interface for logic handling IO and
- * replication on RADOS objects. The PGBackend implementation
- * is responsible for:
- *
- * 1) Handling client operations
- * 2) Handling object recovery
- * 3) Handling object access
- */
- class PGBackend {
- public:
- /// IO
-
- /// Perform write
- int perform_write(
- const vector<OSDOp> &ops, ///< [in] ops to perform
- Context *onreadable, ///< [in] called when readable on all reaplicas
- Context *onreadable, ///< [in] called when durable on all replicas
- ) = 0; ///< @return 0 or error
-
- /// Attempt to roll back a log entry
- int try_rollback(
- const pg_log_entry_t &entry, ///< [in] entry to roll back
- ObjectStore::Transaction *t ///< [out] transaction
- ) = 0; ///< @return 0 on success, -EINVAL if it can't be rolled back
-
- /// Perform async read, oncomplete is called when ops out_bls are filled in
- int perform_read(
- vector<OSDOp> &ops, ///< [in, out] ops
- Context *oncomplete ///< [out] called with r code
- ) = 0; ///< @return 0 or error
-
- /// Peering
-
- /**
- * have_enough_infos
- *
- * Allows PGBackend implementation to ensure that enough peers have
- * been contacted to satisfy its requirements.
- *
- * TODO: this interface should yield diagnostic info about which infos
- * are required
- */
- bool have_enough_infos(
- const map<epoch_t, pg_interval_t> &past_intervals, ///< [in] intervals
- const map<chunk_id_t, map<int, pg_info_t> > &peer_infos ///< [in] infos
- ) = 0; ///< @return true if we can continue peering
-
- /**
- * choose_acting
- *
- * Allows PGBackend implementation to select the acting set based on the
- * received infos
- *
- * @return False if the current acting set is inadequate, *req_acting will
- * be filled in with the requested new acting set. True if the
- * current acting set is adequate, *auth_log will be filled in
- * with the correct location of the authoritative log.
- */
- bool choose_acting(
- const map<int, pg_info_t> &peer_infos, ///< [in] received infos
- int *auth_log, ///< [out] osd with auth log
- vector<int> *req_acting ///< [out] requested acting set
- ) = 0;
-
- /// Scrub
-
- /// scan
- int scan(
- const hobject_t &start, ///< [in] scan objects >= start
- const hobject_t &up_to, ///< [in] scan objects < up_to
- vector<hobject_t> *out ///< [out] objects returned
- ) = 0; ///< @return 0 or error
-
- /// stat (TODO: ScrubMap::object needs to have PGBackend specific metadata)
- int scrub(
- const hobject_t &to_stat, ///< [in] object to stat
- bool deep, ///< [in] true if deep scrub
- ScrubMap::object *o ///< [out] result
- ) = 0; ///< @return 0 or error
-
- /**
- * compare_scrub_maps
- *
- * @param inconsistent [out] map of inconsistent pgs to pair<correct, incorrect>
- * @param errstr [out] stream of text about inconsistencies for user
- * perusal
- *
- * TODO: this interface doesn't actually make sense...
- */
- void compare_scrub_maps(
- const map<int, ScrubMap> &maps, ///< [in] maps to compare
- bool deep, ///< [in] true if scrub is deep
- map<hobject_t, pair<set<int>, set<int> > > *inconsistent,
- std:ostream *errstr
- ) = 0;
-
- /// Recovery
-
- /**
- * might_have_unrecoverable
- *
- * @param missing [in] missing,info gathered so far (must include acting)
- * @param intervals [in] past intervals
- * @param should_query [out] pair<int, cpg_t> shards to query
- */
- void might_have_unrecoverable(
- const map<chunk_id_t, map<int, pair<pg_info_t, pg_missing_t> > &missing,
- const map<epoch_t, pg_interval_t> &past_intervals,
- set<pair<int, cpg_t> > *should_query
- ) = 0;
-
- /**
- * might_have_unfound
- *
- * @param missing [in] missing,info gathered so far (must include acting)
- */
- bool recoverable(
- const map<chunk_id_t, map<int, pair<pg_info_t, pg_missing_t> > &missing,
- const hobject_t &hoid ///< [in] object to check
- ) = 0; ///< @return true if object can be recovered given missing
-
- /**
- * recover_object
- *
- * Triggers a recovery operation on the specified hobject_t
- * onreadable must be called before onwriteable
- *
- * @param missing [in] set of info, missing pairs for queried nodes
- */
- void recover_object(
- const hobject_t &hoid, ///< [in] object to recover
- const map<chunk_id_t, map<int, pair<pg_info_t, pg_missing_t> > &missing
- Context *onreadable, ///< [in] called when object can be read
- Context *onwriteable ///< [in] called when object can be written
- ) = 0;
-
- /// Backfill
-
- /// choose_backfill
- void choose_backfill(
- const map<chunk_id_t, map<int, pg_info_t> > &peer_infos ///< [in] infos
- const vector<int> &acting, ///< [in] acting set
- const vector<int> &up, ///< [in] up set
- set<int> *to_backfill ///< [out] osds to backfill
- ) = 0;
- };
diff --git a/doc/dev/osd_internals/erasure_coding/developer_notes.rst b/doc/dev/osd_internals/erasure_coding/developer_notes.rst
index d542fdb86e2..454f087fe53 100644
--- a/doc/dev/osd_internals/erasure_coding/developer_notes.rst
+++ b/doc/dev/osd_internals/erasure_coding/developer_notes.rst
@@ -1,35 +1,42 @@
-============
-Erasure Code
-============
+============================
+Erasure Code developer notes
+============================
Introduction
------------
-An erasure coded pool only supports full writes, appends and read. It
-does not support snapshots or clone. An ErasureCodedPGBackend is derived
-from PGBackend.
+Each chapter of this document explains an aspect of the implementation
+of the erasure code within Ceph. It is mostly based on examples being
+explained to demonstrate how things work. It is written as if the
+implementation is complete although it may not be the case. For
+instance the plugin system and the jerasure plugin are implemented but
+the erasure coded pool is not.
Reading and writing encoded chunks from and to OSDs
---------------------------------------------------
-An erasure coded pool stores each object as M+K chunks. It is divided
-into M data chunks and K parity chunks. The pool is configured to have
-a size of M+K so that each chunk is stored in an OSD in the acting
-set. The rank of the chunks is stored as an attribute of the object.
-An erasure coded pool is created to use five OSDs ( M+K = 5 ) and
-sustain the loss of two of them ( K = 2 ).
+An erasure coded pool stores each object as K+M chunks. It is divided
+into K data chunks and M coding chunks. The pool is configured to have
+a size of K+M so that each chunk is stored in an OSD in the acting
+set. The rank of the chunk is stored as `an attribute of the object
+<http://tracker.ceph.com/issues/5862>`_.
+
+For instance an erasure coded pool is created to use five OSDs ( K+M =
+5 ) and sustain the loss of two of them ( M = 2 ).
When the object *NYAN* containing *ABCDEFGHI* is written to it, the
erasure encoding function splits the content in three data chunks,
simply by dividing the content in three : the first contains *ABC*,
-the second *DEF* and the last *GHI*. The function also creates two
-parity chunks : the fourth with *YXY* and the fifth with *GQC*. Each
+the second *DEF* and the last *GHI*. The content will be padded if the
+content length is not a multiple of K. The function also creates two
+coding chunks : the fourth with *YXY* and the fifth with *QGC*. Each
chunk is stored in an OSD in the acting set. The chunks are stored in
objects that have the same name ( *NYAN* ) but reside on different
OSDs. The order in which the chunks were created must be preserved and
-is stored as an attribute of the object. The chunk *1* contains *ABC*
-and is stored on *OSD5*, the chunk *4* contains *XYY* and is stored on
-*OSD3*.
+is stored as an attribute of the object ( shard_t ), in addition to its
+name. Chunk *1* contains *ABC* and is stored on *OSD5* while chunk *4*
+contains *YXY* and is stored on *OSD3*.
+
::
+-------------------+
@@ -49,7 +56,7 @@ and is stored on *OSD5*, the chunk *4* contains *XYY* and is stored on
+--v---+ +--v---+ +--v---+ +--v---+ +--v---+
name | NYAN | | NYAN | | NYAN | | NYAN | | NYAN |
+------+ +------+ +------+ +------+ +------+
- attribute | 1 | | 2 | | 3 | | 4 | | 5 |
+ shard | 1 | | 2 | | 3 | | 4 | | 5 |
+------+ +------+ +------+ +------+ +------+
content | ABC | | DEF | | GHI | | YXY | | QGC |
+--+---+ +--+---+ +--+---+ +--+---+ +--+---+
@@ -78,10 +85,12 @@ When the object *NYAN* is read from the erasure coded pool, the
decoding function reads three chunks : chunk *1* containing *ABC*,
chunk *3* containing *GHI* and chunk *4* containing *YXY* and rebuild
the original content of the object *ABCDEFGHI*. The decoding function
-is informed that the chunks *2* and *5* are missing. The chunk *5*
-could not be read because the *OSD4* is *out*. The decoding function
-is called as soon as three chunks are read : *OSD2* was the slowest
-and its chunk was not taken into account.
+is informed that the chunks *2* and *5* are missing ( they are called
+*erasures* ). The chunk *5* could not be read because the *OSD4* is
+*out*. The decoding function can be called as soon as three chunks are
+read : *OSD2* was the slowest and its chunk was not taken into
+account.
+
::
+-------------------+
@@ -94,7 +103,7 @@ and its chunk was not taken into account.
|
+------+------+
| decode(3,2) |
- | erased 2,5 |
+ | erasures 2,5|
+-------------->| |
| +-------------+
| ^ ^
@@ -103,17 +112,17 @@ and its chunk was not taken into account.
+--+---+ +------+ +--+---+ +--+---+
name | NYAN | | NYAN | | NYAN | | NYAN |
+------+ +------+ +------+ +------+
- attribute | 1 | | 2 | | 3 | | 4 |
+ shard | 1 | | 2 | | 3 | | 4 |
+------+ +------+ +------+ +------+
content | ABC | | DEF | | GHI | | YXY |
+--+---+ +--+---+ +--+---+ +--+---+
- ^ ^ ^ ^
- | | | |
- | | +--+---+ |
- | | | OSD1 | |
+ ^ . ^ ^
+ | TOO . | |
+ | SLOW . +--+---+ |
+ | ^ | OSD1 | |
| | +------+ |
| | +------+ |
- | SLOW +-------| OSD2 | |
+ | +-------| OSD2 | |
| +------+ |
| +------+ |
| | OSD3 |-----+
@@ -128,11 +137,11 @@ and its chunk was not taken into account.
Interrupted full writes
-----------------------
-In an erasure coded pool the primary OSD is the first of the acting
-set and receives all write operations. It is responsible for encoding
-the payload into M+K chunks and send them to the OSDs in the acting
-set. It is also responsible for maintaining an authoritative version
-of the placement group logs.
+In an erasure coded pool the primary OSD in the up set receives all
+write operations. It is responsible for encoding the payload into K+M
+chunks and sending them to the other OSDs. It is also responsible
+for maintaining an authoritative version of the placement group logs.
+
::
primary
@@ -152,11 +161,18 @@ of the placement group logs.
| log |
| |
|+----+ |
- ||P1v1| 1,1 |
+ ||C1v1| 1,1 |
|+----+ |
+-----------+
-An erasure coded placement group has been created with M = 2 + K = 1 and is supported by three OSDs, two for M and one for K. The acting set of the placement group is made of *OSD 1* *OSD 2* and *OSD 3*. An object has been encoded and stored in the OSDs : the chunk D1v1 (i.e. Data chunk number 1 version 1) is on *OSD 1*, D2v1 on *OSD 2* and P1v1 (i.e. Parity chunk number 1 version 1) on *OSD 3*. The placement group logs on each OSD are in synch at epoch 1 version 1 (i.e. 1,1).
+An erasure coded placement group has been created with K = 2, M = 1
+and is supported by three OSDs, two for K and one for M. The acting
+set of the placement group is made of *OSD 1*, *OSD 2* and *OSD 3*. An
+object has been encoded and stored in the OSDs : the chunk D1v1
+(i.e. Data chunk number 1 version 1) is on *OSD 1*, D2v1 on *OSD 2*
+and C1v1 (i.e. Coding chunk number 1 version 1) on *OSD 3*. The
+placement group logs on each OSD are identical (i.e. 1,1).
+
::
primary
@@ -175,14 +191,30 @@ An erasure coded placement group has been created with M = 2 + K = 1 and is supp
| +-----------+
| +---OSD 3---+
| |+----+ log |
- +---------->|P1v2| 1,2 |
+ +---------->|C1v2| 1,2 |
|+----+ |
|+----+ |
- ||P1v1| 1,1 |
+ ||C1v1| 1,1 |
|+----+ |
+-----------+
-*OSD 1* is the primary and receives a WRITE FULL from a client, meaning the payload is to replace the content of the object entirely, it is not a partial write that would only overwrite part of it. The version two of the object is created to override the version one. *OSD 1* encodes the payload into three chunks : D1v2 (i.e. Data chunk number 1 version 2) will be on *OSD 1*, D2v2 on *OSD 2* and P1v2 (i.e. Parity chunk number 1 version 2) on *OSD 3*. Each chunk is sent to the target OSD, including the primary OSD which is responsible for storing chunks in addition to handling write operations and maintaining an authoritative version of the placement group logs. When an OSD receives the message instructing it to write the chunk, it also creates a new entry in the placement group logs to reflect the change. For instance, as soon as *OSD 3* stores *P1v2*, it adds the entry 1,2 ( i.e. epoch 1, version 2 ) to its logs. Because the OSDs work asynchronously, some chunks may still be in flight ( such as *D2v2* ) while others are acknowledged and on disk ( such as *P1v1* and *D1v1* ).
+*OSD 1* is the primary and receives a WRITE FULL from a client, which
+means the payload is to replace the object entirely instead of
+overwriting a portion of it. Version two of the object is created to
+override version one. *OSD 1* encodes the payload into three chunks :
+D1v2 (i.e. Data chunk number 1 version 2) will be on *OSD 1*, D2v2 on
+*OSD 2* and C1v2 (i.e. Coding chunk number 1 version 2) on *OSD
+3*. Each chunk is sent to the target OSD, including the primary OSD
+which is responsible for storing chunks in addition to handling write
+operations and maintaining an authoritative version of the placement
+group logs. When an OSD receives the message instructing it to write
+the chunk, it also creates a new entry in the placement group logs to
+reflect the change. For instance, as soon as *OSD 3* stores *C1v2*, it
+adds the entry 1,2 ( i.e. epoch 1, version 2 ) to its logs. Because
+the OSDs work asynchronously, some chunks may still be in flight (
+such as *D2v2* ) while others are acknowledged and on disk ( such as
+*C1v1* and *D1v1* ).
+
::
primary
@@ -204,14 +236,19 @@ An erasure coded placement group has been created with M = 2 + K = 1 and is supp
| +-----------+
| +---OSD 3---+
| |+----+ log |
- +---------->|P1v2| 1,2 |
+ +---------->|C1v2| 1,2 |
|+----+ |
|+----+ |
- ||P1v1| 1,1 |
+ ||C1v1| 1,1 |
|+----+ |
+-----------+
-If all goes well, the chunks are acknowledged on each OSD in the acting set and the *last_complete* pointer of the logs can move from *1,1* to *1,2* and the files used to store the chunks of the previous version of the object can be removed : *D1v1* on *OSD 1*, *D2v1* on *OSD 2* and *P1v1* on *OSD 3*.
+If all goes well, the chunks are acknowledged on each OSD in the
+acting set and the logs' *last_complete* pointer can move from
+*1,1* to *1,2* and the files used to store the chunks of the previous
+version of the object can be removed : *D1v1* on *OSD 1*, *D2v1* on
+*OSD 2* and *C1v1* on *OSD 3*.
+
::
+---OSD 1---+
@@ -226,10 +263,10 @@ If all goes well, the chunks are acknowledged on each OSD in the acting set and
+-----------+
+---OSD 3---+
|+----+ log |
- ||P1v2| 1,2 |
+ ||C1v2| 1,2 |
|+----+ |
|+----+ |
- ||P1V1| 1,1 |
+ ||C1V1| 1,1 |
|+----+ |
primary +-----------+
+---OSD 4---+
@@ -238,7 +275,16 @@ If all goes well, the chunks are acknowledged on each OSD in the acting set and
| |
+-----------+
-But accidents happen. If *OSD 1* goes down while *D2v2* is still in flight, the version 2 of the object is partially written : *OSD 3* has one chunk but does not have enough to recover. It lost two chunks : *D1v2* and *D2v2* but the erasure coding parameters M = 2 + K = 1 requires that at least two chunks are available to rebuild the third. *OSD 4* becomes the new primary and finds that the *last_complete* log entry ( i.e. all objects before this entry were known to be available on all OSDs in the previous acting set ) is *1,1* and will be the head of the new authoritative log.
+But accidents happen. If *OSD 1* goes down while *D2v2* is still in
+flight, the object's version 2 is partially written : *OSD 3* has
+one chunk but that is not enough to recover. It lost two chunks :
+*D1v2* and *D2v2* and the erasure coding parameters K = 2 + M = 1
+require that at least two chunks are available to rebuild the
+third. *OSD 4* becomes the new primary and finds that the
+*last_complete* log entry ( i.e. all objects before this entry were
+known to be available on all OSDs in the previous acting set ) is
+*1,1* and that will be the head of the new authoritative log.
+
::
+---OSD 2---+
@@ -248,7 +294,7 @@ But accidents happen. If *OSD 1* goes down while *D2v2* is still in flight, the
+-----------+
+---OSD 3---+
|+----+ log |
- ||P1V1| 1,1 |
+ ||C1V1| 1,1 |
|+----+ |
primary +-----------+
+---OSD 4---+
@@ -257,7 +303,10 @@ But accidents happen. If *OSD 1* goes down while *D2v2* is still in flight, the
| |
+-----------+
-The log entry *1,2* found on *OSD 3* is divergent from the new authoritative log provided by *OSD 4* : it is discarded and the file containing the *P1v2* chunk is removed.
+The log entry *1,2* found on *OSD 3* is divergent from the new
+authoritative log provided by *OSD 4* : it is discarded and the file
+containing the *C1v2* chunk is removed.
+
::
+---OSD 2---+
@@ -267,7 +316,7 @@ The log entry *1,2* found on *OSD 3* is divergent from the new authoritative log
+-----------+
+---OSD 3---+
|+----+ log |
- ||P1V1| 1,1 |
+ ||C1V1| 1,1 |
|+----+ |
primary +-----------+
+---OSD 4---+
@@ -276,12 +325,19 @@ The log entry *1,2* found on *OSD 3* is divergent from the new authoritative log
|+----+ |
+-----------+
-The *D1v1* chunk is rebuilt with the *repair* function of the erasure coding library during scrubbing and stored on the new primary *OSD 4*.
+The *D1v1* chunk is rebuilt with the *decode* function of the erasure
+coding library during scrubbing and stored on the new primary *OSD 4*.
Interrupted append
------------------
-An object is coded in stripes as described above. In the case of a full write, and assuming the object size is not too large to encode it in memory, there is a single stripe. When appending to an existing object, the stripe size is retrieved from the attributes of the object and if the total size of the object is a multiple of the stripe size and the payload of the append message is lower or equal to the strip size, the following applies. It applies, for instance, when *rgw* writes an object with sequence of append instead of a single write.
+An object is coded in stripes, either because it is too big or because
+it is created with multiple write operations instead of a single full
+write. When appending to an existing object, the stripe size is
+retrieved from the attributes of the object. It applies, for instance,
+when *rgw* writes an object with a sequence of appends instead of a
+single full write.
+
::
primary
@@ -299,13 +355,29 @@ An object is coded in stripes as described above. In the case of a full write, a
| +-----------+
| +---OSD 3---+
| |+-s3-+ log |
- +---------->|S1P1| 1,2 |
+ +---------->|S1C1| 1,2 |
||----| |
- ||S2P1| 1,1 |
+ ||S2C1| 1,1 |
|+----+ |
+-----------+
-*OSD 1* is the primary and receives an APPEND from a client, meaning the payload is to be appended at the end of the object. *OSD 1* encodes the payload into three chunks : S2D1 (i.e. Stripe two data chunk number 1 ) will be in s1 ( shard 1 ) on *OSD 1*, S2D2 in s2 on *OSD 2* and S2P1 (i.e. Stripe two parity chunk number 1 ) in s3 on *OSD 3*. Each chunk is sent to the target OSD, including the primary OSD which is responsible for storing chunks in addition to handling write operations and maintaining an authoritative version of the placement group logs. When an OSD receives the message instructing it to write the chunk, it also creates a new entry in the placement group logs to reflect the change. For instance, as soon as *OSD 3* stores *S2P1*, it adds the entry 1,2 ( i.e. epoch 1, version 2 ) to its logs. The log entry also carries the nature of the operation: in this case 1,2 is an APPEND where 1,1 was a CREATE. Because the OSDs work asynchronously, some chunks may still be in flight ( such as *S2D2* ) while others are acknowledged and on disk ( such as *S2D1* and *S2P1* ).
+*OSD 1* is the primary and receives an APPEND from a client, meaning
+the payload is to be appended to the end of the object. *OSD 1*
+encodes the payload into three chunks : S2D1 (i.e. Stripe two data
+chunk number 1 ) will be in s1 ( shard 1 ) on *OSD 1*, S2D2 in s2 on
+*OSD 2* and S2C1 (i.e. Stripe two coding chunk number 1 ) in s3 on
+*OSD 3*. Each chunk is sent to the target OSD, including the primary
+OSD which is responsible for storing chunks in addition to handling
+write operations and maintaining an authoritative version of the
+placement group logs. When an OSD receives the message instructing it
+to write the chunk, it also creates a new entry in the placement group
+logs to reflect the change. For instance, as soon as *OSD 3* stores
+*S2C1*, it adds the entry 1,2 ( i.e. epoch 1, version 2 ) to its
+logs. The log entry also carries the nature of the operation: in this
+case 1,2 is an APPEND where 1,1 was a CREATE. Because the OSDs work
+asynchronously, some chunks may still be in flight ( such as *S2D2* )
+while others are acknowledged and on disk (such as *S2D1* and *S2C1*).
+
::
+---OSD 1---+
@@ -320,9 +392,9 @@ An object is coded in stripes as described above. In the case of a full write, a
+-----------+
+---OSD 3---+
|+-s3-+ log |
- ||S1P1| 1,2 |
+ ||S1C1| 1,2 |
||----| |
- ||S2P1| 1,1 |
+ ||S2C1| 1,1 |
|+----+ |
primary +-----------+
+---OSD 4---+
@@ -331,7 +403,16 @@ An object is coded in stripes as described above. In the case of a full write, a
| |
+-----------+
-If *OSD 1* goes down while *S2D2* is still in flight, the payload is partially appended : s3 ( shard 3) in *OSD 3* has one chunk but does not have enough to recover because s1 and s2 don't have it. It lost two chunks : *S2D1* and *S2D2* but the erasure coding parameters M = 2 + K = 1 requires that at least two chunks are available to rebuild the third. *OSD 4* becomes the new primary and finds that the *last_complete* log entry ( i.e. all objects before this entry were known to be available on all OSDs in the previous acting set ) is *1,1* and will be the head of the new authoritative log.
+If *OSD 1* goes down while *S2D2* is still in flight, the payload is
+partially appended : s3 (shard 3) in *OSD 3* has one chunk but does
+not have enough to recover. Two chunks were lost (*S2D1* and *S2D2*) but
+the erasure coding parameters K = 2 + M = 1 require that at least two
+chunks are available to rebuild the third. *OSD 4* becomes the new
+primary and finds that the *last_complete* log entry ( i.e. all
+objects before this entry were known to be available on all OSDs in
+the previous acting set ) is *1,1* and will be the head of the new
+authoritative log.
+
::
+---OSD 2---+
@@ -341,7 +422,7 @@ If *OSD 1* goes down while *S2D2* is still in flight, the payload is partially a
+-----------+
+---OSD 3---+
|+-s3-+ log |
- ||S1P1| 1,1 |
+ ||S1C1| 1,1 |
|+----+ |
primary +-----------+
+---OSD 4---+
@@ -350,26 +431,28 @@ If *OSD 1* goes down while *S2D2* is still in flight, the payload is partially a
| |
+-----------+
-The log entry *1,2* found on *OSD 3* is divergent from the new authoritative log provided by *OSD 4* : it is discarded and the file containing the *S2P1* chunk is truncated to the nearest multiple of the stripe size.
+The log entry *1,2* found on *OSD 3* is divergent from the new
+authoritative log provided by *OSD 4* : it is discarded and the file
+containing the *S2C1* chunk is truncated to the nearest multiple of
+the stripe size.
Erasure code library
--------------------
-See also `the corresponding tracker issue <http://tracker.ceph.com/issues/5878>`_
-
Using `Reed-Solomon <https://en.wikipedia.org/wiki/Reed_Solomon>`_,
-with parameters M+K object O is encoded by dividing it into chunks O1,
-O2, ... OM and computing parity chunks P1, P2, ... PK. Any M chunks
-out of the available M+K chunks can be used to obtain the original
-object. If data chunk O2 or parity chunk P2 are lost, they can be
-repaired using any M chunks out of the M+K chunks. If more than K
+with parameters K+M, object O is encoded by dividing it into chunks O1,
+O2, ... OK and computing coding chunks P1, P2, ... PM. Any K chunks
+out of the available K+M chunks can be used to obtain the original
+object. If data chunk O2 or coding chunk P2 are lost, they can be
+repaired using any K chunks out of the K+M chunks. If more than M
chunks are lost, it is not possible to recover the object.
Reading the original content of object O could be a simple
-concatenation of O1, O2, ... OM, if using `systematic codes
-<http://en.wikipedia.org/wiki/Systematic_code>`_. Otherwise the
-chunks must be given to the erasure code library to retrieve the
-content of the object.
+concatenation of O1, O2, ... OK, because the plugins are using
+`systematic codes
+<http://en.wikipedia.org/wiki/Systematic_code>`_. Otherwise the chunks
+must be given to the erasure code library *decode* method to retrieve
+the content of the object.
Reed-Solomon is significantly more expensive to encode than fountain
codes with the current `jerasure implementation
@@ -381,14 +464,17 @@ the difference becomes negligible. The difference is even more
important when an object is divided in hundreds or more chunks, but
Ceph will typically be used with less than 32 chunks.
-Performances depend on the parameters to the Reed-Solomon functions
-but they are also influenced by the buffer sizes used when calling
-the encoding functions: smaller buffers will mean more calls and more
-overhead.
+Performance depends on the parameters to the encoding functions and
+is also influenced by the packet sizes used when calling the encoding
+functions ( for Cauchy or Liberation for instance ): smaller packets
+mean more calls and more overhead.
Although Reed-Solomon is provided as a default, Ceph uses it via an
-abstract API designed to allow each pool to choose the plugin that
-implements it.
+`abstract API <https://github.com/ceph/ceph/blob/08a97ae45f4df58a6a8ea8a6400934d860cf5eb4/src/osd/ErasureCodeInterface.h>`_ designed to
+allow each pool to choose the plugin that implements it using
+`key=value pairs when creating the pool
+<https://github.com/ceph/ceph/blob/08a97ae45f4df58a6a8ea8a6400934d860cf5eb4/src/mon/MonCommands.h#L483>`_.
+
::
ceph osd pool create <pool> \
@@ -396,86 +482,67 @@ implements it.
erasure-code-plugin=<plugin>
The *<plugin>* is dynamically loaded from *<dir>* (defaults to
-*/usr/lib/ceph/erasure-code* ) and expected to implement the
-*void __erasure_code_init(char *plugin_name)* function
-which is responsible for registering an object derived from
-*ErasureCodePlugin* in the registry singleton :
+*/usr/lib/ceph/erasure-code* ) and expected to implement the *int
+__erasure_code_init(char *plugin_name)* function which is responsible
+for registering an object derived from *ErasureCodePlugin* in the
+registry. The `ErasureCodePluginExample <https://github.com/ceph/ceph/blob/08a97ae45f4df58a6a8ea8a6400934d860cf5eb4/src/test/osd/ErasureCodePluginExample.cc#L32>`_ plugin reads:
+
::
- registry.plugins[plugin_name] = new ErasureCodePluginExample();
+ ErasureCodePluginRegistry &instance =
+ ErasureCodePluginRegistry::instance();
+ instance.add(plugin_name, new ErasureCodePluginExample());
The *ErasureCodePlugin* derived object must provide a factory method
from which the concrete implementation of the *ErasureCodeInterface*
-object can be generated:
+object can be generated. The `ErasureCodePluginExample plugin <https://github.com/ceph/ceph/blob/08a97ae45f4df58a6a8ea8a6400934d860cf5eb4/src/test/osd/ErasureCodePluginExample.cc#L22>`_ reads:
+
::
- virtual int factory(ErasureCodeInterfaceRef *erasure_code,
- const map<std::string,std::string> &parameters) {
+ virtual int factory(const map<std::string,std::string> &parameters,
+ ErasureCodeInterfaceRef *erasure_code) {
*erasure_code = ErasureCodeInterfaceRef(new ErasureCodeExample(parameters));
return 0;
- }
+ }
+
+The *parameters* argument is the list of *key=value* pairs that were
+set when the pool was created. Each *key* must be prefixed with
+*erasure-code* to avoid name collisions:
-The *parameters* is the list of *key=value* pairs that were set when the pool
-was created. Each *key* must be prefixed with erasure-code to avoid name collisions
::
- ceph osd pool create <pool> \
+ ceph osd pool create poolname 123 \
erasure-code-directory=<dir> \ # mandatory
erasure-code-plugin=jerasure \ # mandatory
erasure-code-m=10 \ # optional and plugin dependant
erasure-code-k=3 \ # optional and plugin dependant
- erasure-code-algorithm=Reed-Solomon \ # optional and plugin dependant
-
-Erasure code library abstract API
----------------------------------
-
- .. doxygenfile:: ErasureCodeInterface.h
-
-Erasure code jerasure plugin
-----------------------------
-
-The parameters interpreted by the jerasure plugin are:
-::
-
- ceph osd pool create <pool> \
- erasure-code-directory=<dir> \ # plugin directory absolute path
- erasure-code-plugin=jerasure \ # plugin name (only jerasure)
- erasure-code-m=<m> \ # data chunks (default 10)
- erasure-code-k=<k> \ # parity chunks (default 3)
- erasure-code-algorithm=Reed-Solomon \ # algorithm (only Reed-Solomon)
-
+ erasure-code-technique=reed_sol_van \ # optional and plugin dependant
Scrubbing
---------
+See also `Refactor scrub to use PGBackend methods <http://tracker.ceph.com/issues/5861>`_
The simplest form of scrubbing is to check with each OSD holding a
-chunk if it exists locally. If more thank K chunks are missing the
-object is marked as lost. If up to K chunks are missing they are
+chunk if it exists locally. If more than M chunks are missing the
+object is marked as lost. If up to M chunks are missing they are
repaired and written to the relevant OSDs.
-From time to time it may make sense to attempt to read and object,
+From time to time it may make sense to attempt to read an object,
using all of its chunks. If the decode function fails, the object is
lost.
Bit flips happen. Not often, but it is possible. Here is `an article
from 2011 <http://www.linux-mag.com/id/8794/>`_ also search for "bit
rot" and "bit error rate". To detect corrupted chunks, a checksum
-(CRC23C for instance) should be added as an attribute of the file
-containing the chunk so that deep scrubbing can check that the chunk
-is valid by recomputing the content of the chunk and compare it with
-the signature. BTRFS and ZFS have a CRC32C check built-in on a per
-block basis.
+(CRC32C for instance) must be added as an attribute of the file
+containing the chunk ( or shard ) so that deep scrubbing can check
+that the chunk is valid by recomputing the content of the chunk and
+comparing it with the signature. BTRFS and ZFS have a CRC32C check
+built-in on a per block basis.
Notes
-----
-This document is a description of how erasure coding could be
-implemented, it does not reflect the current state of the code
-base. Possible optimizations are mentionned where relevant but the
-first implementation should not include any of them: they are
-presented to show that there is a path toward optimization starting
-from simple minded implementation.
-
If the objects are large, it may be impractical to encode and decode
them in memory. However, when using *RBD* a 1TB device is divided in
many individual 4MB objects and *RGW* does the same.
@@ -483,68 +550,3 @@ many individual 4MB objects and *RGW* does the same.
Encoding and decoding is implemented in the OSD. Although it could be
implemented client side for read write, the OSD must be able to encode
and decode on its own when scrubbing.
-
-If a partial read is required, an optimization could be to only fetch
-the chunk that contains the data instead of always fetching all
-chunks. For instance if *H* is required in the example above, chunk 3
-is read if available. Reading 3 chunks is a fallback in case chunk 3 is
-not available.
-
-Partial reads and writes
-------------------------
-
-If an object is large, reading or writing all of it when changing only
-a few bytes is expensive. It is more efficient to only read or write a
-subset of the object. When a client writes on an existing object, it
-can provide the offset and the length of the write as well as the payload with the `CEPH_OSD_OP_WRITE <https://github.com/ceph/ceph/blob/962b64a83037ff79855c5261325de0cd1541f582/src/osd/ReplicatedPG.cc#L2542>`_ operation. It is refered to as *partial write* and is different from the `CEPH_OSD_OP_WRITEFULL operation <https://github.com/ceph/ceph/blob/962b64a83037ff79855c5261325de0cd1541f582/src/osd/ReplicatedPG.cc#L2552>`_ which writes the entire object at once.
-
-When using replicas for partial writes or reads, the primary OSD
-translates them into read(2) and write(2) POSIX system calls. When
-writing, it then forwards the CEPH_OSD_OP_WRITE message to the
-replicas and waits for them to acknowledge they are done.
-
-When reading erasure coded objects, at least K chunks must be read and
-decoded to extract the desired bytes. If a `systematic code
-<https://en.wikipedia.org/wiki/Systematic_code>`_ is used ( i.e. the
-data chunks are readable by simple concatenation ) read can be
-optimized to use the chunk containing the desired bytes and rely on
-the erasure decoding function only if a chunk is missing.
-
-When writing an erasure coded object, changing even one byte requires
-that it is encoded again in full.
-
-If Ceph is only used thru the radosgw or librbd, objects will mostly
-have the same size. The radosgw user may upload a 1GB object, it will
-be divided into smaller 4MB objects behind the scene ( or whatever is
-set with rgw obj stripe size ). If a KVM is attached a 10GB RBD block
-device, it will also be divided into smaller 4BM objects ( or whatever
-size is given to the --stripe-unit argument when creating the RBD
-block ). In both cases, writing one byte at the beginning will only
-require to encode the first object and not all of them.
-
-Objects can be further divided into stripes to reduce the overhead of
-partial writes. For instance:
-::
-
- +-----------------------+
- |+---------------------+|
- || stripe 0 ||
- || [0,N) ||
- |+---------------------+|
- |+---------------------+|
- || stripe 1 ||
- || [N,N*2) ||
- |+---------------------+|
- |+---------------------+|
- || stripe 3 [N*2,len) ||
- |+---------------------+|
- +-----------------------+
- object of size len
-
-Each stripe is encoded independantly and the same OSDs are used for
-all of them. For instance, if stripe 0 is encoded into 3 chunks on
-OSDs 5, 8 and 9, stripe 1 is also encoded into 3 chunks on the same
-OSDs. The size of a stripe is stored as an attribute of the object.
-When writing one byte at offset N, instead of re-encoding the whole
-object it is enough to re-encode the stripe that contains it.
-
diff --git a/doc/dev/osd_internals/erasure_coding/jerasure.rst b/doc/dev/osd_internals/erasure_coding/jerasure.rst
new file mode 100644
index 00000000000..312eac52e5d
--- /dev/null
+++ b/doc/dev/osd_internals/erasure_coding/jerasure.rst
@@ -0,0 +1,22 @@
+===============
+jerasure plugin
+===============
+
+Introduction
+------------
+
+The parameters interpreted by the jerasure plugin are:
+
+::
+
+ ceph osd pool create <pool> \
+ erasure-code-directory=<dir> \ # plugin directory absolute path
+ erasure-code-plugin=jerasure \ # plugin name (only jerasure)
+ erasure-code-k=<k> \ # data chunks (default 2)
+ erasure-code-m=<m> \ # coding chunks (default 2)
+ erasure-code-technique=<technique> \ # coding technique
+
+The coding techniques can be chosen among *reed_sol_van*,
+*reed_sol_r6_op*, *cauchy_orig*, *cauchy_good*, *liberation*,
+*blaum_roth* and *liber8tion*.
+
diff --git a/doc/dev/osd_internals/erasure_coding/pgbackend.rst b/doc/dev/osd_internals/erasure_coding/pgbackend.rst
index 9e3fcb2bf86..43415ba4f7e 100644
--- a/doc/dev/osd_internals/erasure_coding/pgbackend.rst
+++ b/doc/dev/osd_internals/erasure_coding/pgbackend.rst
@@ -2,14 +2,13 @@
PG Backend Proposal
===================
-See also `PGBackend.h <../PGBackend-h>`_
-
Motivation
----------
-The purpose of the PG Backend interface is to abstract over the
-differences between replication and erasure coding as failure recovery
-mechanisms.
+The purpose of the `PG Backend interface
+<https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h>`_
+is to abstract over the differences between replication and erasure
+coding as failure recovery mechanisms.
Much of the existing PG logic, particularly that for dealing with
peering, will be common to each. With both schemes, a log of recent
@@ -34,12 +33,12 @@ and erasure coding which PGBackend must abstract over:
positions are not interchangeable. In particular, it might make
sense for a single OSD to hold more than 1 PG copy for different
acting set positions.
-5. Selection of a pgtemp for backfill may difer between replicated
+5. Selection of a pgtemp for backfill may differ between replicated
and erasure coded backends.
6. The set of necessary osds from a particular interval required to
- to continue peering may difer between replicated and erasure
+ continue peering may differ between replicated and erasure
coded backends.
-7. The selection of the authoritative log may difer between replicated
+7. The selection of the authoritative log may differ between replicated
and erasure coded backends.
Client Writes
@@ -78,11 +77,15 @@ Core Changes:
- Current code should be adapted to use and rollback as appropriate
APPEND, DELETE, (SET|RM)ATTR log entries.
- The filestore needs to be able to deal with multiply versioned
- hobjects. This probably means adapting the filestore internally to
- use a vhobject which is basically a pair<version_t, hobject_t>. The
- version needs to be included in the on-disk filename. An interface
- needs to be added to get all versions of a particular hobject_t or
- the most recently versioned instance of a particular hobject_t.
+ hobjects. This means adapting the filestore internally to
+ use a `ghobject <https://github.com/ceph/ceph/blob/aba6efda13eb6ab4b96930e9cc2dbddebbe03f26/src/common/hobject.h#L193>`_
+ which is basically a tuple<hobject_t, gen_t,
+ shard_t>. The gen_t + shard_t need to be included in the on-disk
+ filename. gen_t is a unique object identifier to make sure there
+ are no name collisions when object N is created +
+ deleted + created again. An interface needs to be added to get all
+ versions of a particular hobject_t or the most recently versioned
+ instance of a particular hobject_t.
PGBackend Interfaces:
@@ -111,14 +114,14 @@ divergent objects. Thus, we must choose the *oldest* last_update from
the last interval which went active in order to minimize the number of
divergent objects.
-The dificulty is that the current code assumes that as long as it has
+The difficulty is that the current code assumes that as long as it has
an info from at least 1 osd from the prior interval, it can complete
peering. In order to ensure that we do not end up with an
-unrecoverably divergent object, an M+K erasure coded PG must hear from at
-least M of the replicas of the last interval to serve writes. This ensures
-that we will select a last_update old enough to roll back at least M
+unrecoverably divergent object, a K+M erasure coded PG must hear from at
+least K of the replicas of the last interval to serve writes. This ensures
+that we will select a last_update old enough to roll back at least K
replicas. If a replica with an older last_update comes along later,
-we will be able to provide at least M chunks of any divergent object.
+we will be able to provide at least K chunks of any divergent object.
Core Changes:
@@ -158,7 +161,7 @@ Client Reads
------------
Reads with the replicated strategy can always be satisfied
-syncronously out of the primary osd. With an erasure coded strategy,
+synchronously out of the primary osd. With an erasure coded strategy,
the primary will need to request data from some number of replicas in
order to satisfy a read. The perform_read() interface for PGBackend
therefore will be async.
@@ -178,7 +181,7 @@ acting set have different pieces of the erasure coding scheme and are
not interchangeable. Worse, crush might cause chunk 2 to be written
to an osd which happens already to contain an (old) copy of chunk 4.
This means that the OSD and PG messages need to work in terms of a
-type like pair<chunk_id_t, pg_t> in order to distinguish different pg
+type like pair<shard_t, pg_t> in order to distinguish different pg
chunks on a single OSD.
Because the mapping of object name to object in the filestore must
@@ -188,14 +191,14 @@ include the chunk id in the object key.
Core changes:
-- The filestore `vhobject_t needs to also include a chunk id
- <http://tracker.ceph.com/issues/5862>`_ making it more like
- tuple<hobject_t, version_t, chunk_id_t>.
-- coll_t needs to include a chunk_id_t.
+- The filestore `ghobject_t needs to also include a chunk id
+ <https://github.com/ceph/ceph/blob/aba6efda13eb6ab4b96930e9cc2dbddebbe03f26/src/common/hobject.h#L193>`_ making it more like
+ tuple<hobject_t, gen_t, shard_t>.
+- coll_t needs to include a shard_t.
- The `OSD pg_map and similar pg mappings need to work in terms of a
cpg_t <http://tracker.ceph.com/issues/5863>`_ (essentially
- pair<pg_t, chunk_id_t>). Similarly, pg->pg messages need to include
- a chunk_id_t
+ pair<pg_t, shard_t>). Similarly, pg->pg messages need to include
+ a shard_t
- For client->PG messages, the OSD will need a way to know which PG
chunk should get the message since the OSD may contain both a
primary and non-primary chunk for the same pg
@@ -254,7 +257,7 @@ Core changes:
- Ensure that crush behaves as above for INDEP.
-`Recovery <http://tracker.ceph.com/issues/5857>`_
+Recovery
--------
The logic for recovering an object depends on the backend. With
@@ -267,24 +270,27 @@ and push out the replacement chunks concurrently.
Another difference is that objects in erasure coded pg may be
unrecoverable without being unfound. The "unfound" concept
should probably then be renamed to unrecoverable. Also, the
-PGBackend impementation will have to be able to direct the search
+PGBackend implementation will have to be able to direct the search
for pg replicas with unrecoverable object chunks and to be able
to determine whether a particular object is recoverable.
+
Core changes:
- s/unfound/unrecoverable
PGBackend interfaces:
-- might_have_unrecoverable()
-- recoverable()
-- recover_object()
+- `on_local_recover_start <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L46>`_
+- `on_local_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L52>`_
+- `on_global_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L64>`_
+- `on_peer_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L69>`_
+- `begin_peer_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L76>`_
-`Backfill <http://tracker.ceph.com/issues/5856>`_
+Backfill
--------
-For the most part, backfill itself should behave similarly between
+See `Issue #5856`_. For the most part, backfill itself should behave similarly between
replicated and erasure coded pools with a few exceptions:
1. We probably want to be able to backfill multiple osds concurrently
@@ -311,3 +317,5 @@ PGBackend interfaces:
- choose_backfill(): allows the implementation to determine which osds
should be backfilled in a particular interval.
+
+.. _Issue #5856: http://tracker.ceph.com/issues/5856
diff --git a/doc/dev/osd_internals/snaps.rst b/doc/dev/osd_internals/snaps.rst
index a1e9020ba02..63b1bc0503c 100644
--- a/doc/dev/osd_internals/snaps.rst
+++ b/doc/dev/osd_internals/snaps.rst
@@ -70,18 +70,19 @@ See ReplicatedPG::SnapTrimmer, SnapMapper
This trimming is performed asynchronously by the snap_trim_wq while the
pg is clean and not scrubbing.
- 1. The next snap in PG::snaptrimq is selected for trimming
- 2. We determine the next object for trimming out of PG::snap_mapper.
+ #. The next snap in PG::snaptrimq is selected for trimming
+ #. We determine the next object for trimming out of PG::snap_mapper.
For each object, we create a log entry and repop updating the
object info and the snap set (including adjusting the overlaps).
- 3. We also locally update our *SnapMapper* instance with the object's
+ #. We also locally update our *SnapMapper* instance with the object's
new snaps.
- 4. The log entry containing the modification of the object also
+ #. The log entry containing the modification of the object also
contains the new set of snaps, which the replica uses to update
its own *SnapMapper* instance.
- 6. The primary shares the info with the replica, which persists
+ #. The primary shares the info with the replica, which persists
the new set of purged_snaps along with the rest of the info.
+
Recovery
--------
Because the trim operations are implemented using repops and log entries,
diff --git a/doc/dev/release-process.rst b/doc/dev/release-process.rst
index 656d5ca76c5..b48934831bb 100644
--- a/doc/dev/release-process.rst
+++ b/doc/dev/release-process.rst
@@ -26,7 +26,7 @@ Prior to building, it's necessary to update the pbuilder seed tarballs::
The release key should be present::
pub 4096R/17ED316D 2012-05-20
- uid Ceph Release Key <sage@newdream.net>
+ uid Ceph Release Key <sage@newdream.net>
3. Set up build area
@@ -48,24 +48,25 @@ Checkout the submodules::
4. Update Build version numbers
================================
-Substitute the ceph release number where indicated below by the string 0.xx::
+Substitute the ceph release number where indicated below by the string ``0.xx``.
Edit configure.ac and update the version number. Example diff::
- -AC_INIT([ceph], [0.54], [ceph-devel@vger.kernel.org])
- +AC_INIT([ceph], [0.55], [ceph-devel@vger.kernel.org])
+ -AC_INIT([ceph], [0.54], [ceph-devel@vger.kernel.org])
+ +AC_INIT([ceph], [0.55], [ceph-devel@vger.kernel.org])
Update the version number in the debian change log::
- DEBEMAIL user@host dch -v 0.xx-1
+ DEBEMAIL user@host dch -v 0.xx-1
Commit the changes::
- git commit -a
+ git commit -a
Tag the release::
- ../ceph-build/tag-release v0.xx
+ ../ceph-build/tag-release v0.xx
+
5. Create Makefiles
===================
@@ -74,7 +75,7 @@ The actual configure options used to build packages are in the
``ceph.spec.in`` and ``debian/rules`` files. At this point we just
need to create a Makefile.::
- ./do_autogen.sh
+ ./do_autogen.sh
6. Run the release scripts
@@ -84,7 +85,8 @@ This creates tarballs and copies them, with other needed files to
the build hosts listed in deb_hosts and rpm_hosts, runs a local build
script, then rsyncs the results back to the specified release directory.::
- ../ceph-build/do_release.sh /tmp/release
+ ../ceph-build/do_release.sh /tmp/release
+
7. Create RPM Repo
==================
@@ -92,35 +94,38 @@ script, then rsyncs the results back tot the specified release directory.::
Copy the rpms to the destination repo, creates the yum repository
rpm and indexes.::
- ../ceph-build/push_to_rpm_repo.sh /tmp/release /tmp/rpm-repo 0.xx
+ ../ceph-build/push_to_rpm_repo.sh /tmp/release /tmp/rpm-repo 0.xx
+
8. Create debian repo
=====================
The key-id used below is the id of the ceph release key from step 2::
- mkdir /tmp/debian-repo
- ../ceph-build/gen_reprepro_conf.sh /tmp/debian-repo key-id
- ../ceph-build/push_to_deb_repo.sh /tmp/release /tmp/debian-repo 0.xx main
+ mkdir /tmp/debian-repo
+ ../ceph-build/gen_reprepro_conf.sh /tmp/debian-repo key-id
+ ../ceph-build/push_to_deb_repo.sh /tmp/release /tmp/debian-repo 0.xx main
+
9. Push repos to ceph.org
==========================
For a development release::
- rcp ceph-0.xx.tar.bz2 ceph-0.xx.tar.gz \
- ceph_site@ceph.com:ceph.com/downloads/.
- rsync -av /tmp/rpm-repo/0.xx/ ceph_site@ceph.com:ceph.com/rpm-testing
- rsync -av /tmp/debian-repo/ ceph_site@ceph.com:ceph.com/debian-testing
+ rcp ceph-0.xx.tar.bz2 ceph-0.xx.tar.gz \
+ ceph_site@ceph.com:ceph.com/downloads/.
+ rsync -av /tmp/rpm-repo/0.xx/ ceph_site@ceph.com:ceph.com/rpm-testing
+ rsync -av /tmp/debian-repo/ ceph_site@ceph.com:ceph.com/debian-testing
For a stable release, replace {CODENAME} with the release codename (e.g., ``argonaut`` or ``bobtail``)::
- rcp ceph-0.xx.tar.bz2 \
- ceph_site@ceph.com:ceph.com/downloads/ceph-0.xx{CODENAME}.tar.bz2
- rcp ceph-0.xx.tar.gz \
- ceph_site@ceph.com:ceph.com/downloads/ceph-0.xx{CODENAME}.tar.gz
- rsync -av /tmp/rpm-repo/0.xx/ ceph_site@ceph.com:ceph.com/rpm-{CODENAME}
- rsync -auv /tmp/debian-repo/ ceph_site@ceph.com:ceph.com/debian-{CODENAME}
+ rcp ceph-0.xx.tar.bz2 \
+ ceph_site@ceph.com:ceph.com/downloads/ceph-0.xx{CODENAME}.tar.bz2
+ rcp ceph-0.xx.tar.gz \
+ ceph_site@ceph.com:ceph.com/downloads/ceph-0.xx{CODENAME}.tar.gz
+ rsync -av /tmp/rpm-repo/0.xx/ ceph_site@ceph.com:ceph.com/rpm-{CODENAME}
+ rsync -auv /tmp/debian-repo/ ceph_site@ceph.com:ceph.com/debian-{CODENAME}
+
10. Update Git
==============
@@ -148,15 +153,16 @@ Similarly, for a development release, for both ``teuthology.git`` and ``ceph-qa-
Stable release
--------------
-For ``ceph.git``:
+For ``ceph.git``::
+
+ git push origin stable
- git push origin stable
Point release
-------------
-Just push the new tag:
+Just push the new tag::
- git push origin v0.xx
+ git push origin v0.xx
diff --git a/doc/dev/repo-lab-access.rst b/doc/dev/repo-lab-access.rst
index 706f02e395c..2d1b328325a 100644
--- a/doc/dev/repo-lab-access.rst
+++ b/doc/dev/repo-lab-access.rst
@@ -4,11 +4,11 @@ Notes on Ceph repositories and test lab
Special branches
----------------
-* ``master'': current tip (integration branch)
-* ``next'': pending release (feature frozen, bugfixes only)
-* ``last'': last/previous release
-* ``dumpling'', ``cuttlefish'', ``bobtail'', ``argonaut'', etc.: stable release branches
-* ``dumpling-next'': backports for stable release, pending testing
+* ``master``: current tip (integration branch)
+* ``next``: pending release (feature frozen, bugfixes only)
+* ``last``: last/previous release
+* ``dumpling``, ``cuttlefish``, ``bobtail``, ``argonaut``, etc.: stable release branches
+* ``dumpling-next``: backports for stable release, pending testing
Rules
-----
@@ -18,7 +18,7 @@ The source repos are all on github.
* Any branch pushed to ceph.git will kick off builds that will either
run unit tests or generate packages for gitbuilder.ceph.com. Try
not to generate unnecessary load. For private, unreviewed work,
- only push to branches named ``wip-*''. This avoids colliding with
+ only push to branches named ``wip-*``. This avoids colliding with
any special branches.
* Nothing should ever reach a special branch unless it has been
@@ -34,7 +34,7 @@ The source repos are all on github.
Reviewed-by: directly to the commit so that it is also visible when
the patch is cherry-picked for backports.
-* All backports should use ``git cherry-pick -x'' to capture which
+* All backports should use ``git cherry-pick -x`` to capture which
commit they are cherry-picking from.
@@ -63,21 +63,21 @@ Locking machines
* All tests pull their builds from gitbuilder.ceph.com.
* Anybody can lock machines with ``teuthology-lock --lock-many NUM
- --machine-type TYPE''.
+ --machine-type TYPE``.
-* Machines are locked as ``whoami''@``hostname -s''. --owner to
+* Machines are locked as ``whoami``\ @\ ``hostname -s``. --owner to
choose otherwise.
-* Automated tests current run on the ``plana''; please avoid locking
+* Automated tests currently run on the ``plana``; please avoid locking
these for personal use.
-* To unlock, please use ``teuthology-nuke -t list.yaml -r -u'', which
+* To unlock, please use ``teuthology-nuke -t list.yaml -r -u``, which
will reboot and clean up any leftover test state before unlocking
- (or fail to unlock). It looks for a ``targets::'' section in the
+ (or fail to unlock). It looks for a ``targets::`` section in the
yaml, so the regular job yaml will work. You can get a list of all
- locked machines with ``teuthology-lock --list-targets''.
+ locked machines with ``teuthology-lock --list-targets``.
-* ``teuthology-lock -a --brief'' or ``teuthology-lock --summary'' to
+* ``teuthology-lock -a --brief`` or ``teuthology-lock --summary`` to
see what is locked and by whom.
* Be conscientious about scheduling entire qa runs. Coordinate
@@ -85,4 +85,4 @@ Locking machines
ceph-qa-suite.git and teuthology.git.
* Results for scheduled runs appear in /a/$jobname on the teuthology
- machine. ``ls -alt | head'' to find them.
+ machine. ``ls -alt | head`` to find them.
diff --git a/doc/dev/versions.rst b/doc/dev/versions.rst
new file mode 100644
index 00000000000..bf5ee252cd5
--- /dev/null
+++ b/doc/dev/versions.rst
@@ -0,0 +1,42 @@
+==================
+Public OSD Version
+==================
+
+We maintain two versions on disk: an eversion_t pg_log.head and a
+version_t info.user_version. Each object is tagged with both the pg
+version and user_version it was last modified with. The PG version is
+modified by manipulating OpContext::at_version and then persisting it
+to the pg log as transactions, and is incremented in all the places it
+used to be. The user_version is modified by manipulating the new
+OpContext::user_at_version and is also persisted via the pg log
+transactions.
+user_at_version is modified only in ReplicatedPG::prepare_transaction
+when the op was a "user modify" (a non-watch write), and the durable
+user_version is updated according to the following rules:
+1) set user_at_version to the maximum of ctx->new_obs.oi.user_version+1
+and info.last_user_version+1.
+2) set user_at_version to the maximum of itself and
+ctx->at_version.version.
+3) ctx->new_obs.oi.user_version = ctx->user_at_version (to change the
+object's user_version)
+
+This set of update semantics means that for traditional pools the
+user_version will be equal to the past reassert_version, while for
+caching pools the object and PG user-version will be able to cross
+pools without making a total mess of things.
+In order to support old clients, we keep the old reassert_version but
+rename it to "bad_replay_version"; we fill it in as before: for writes
+it is set to the at_version (and is the proper replay version); for
+watches it is set to our user version; for ENOENT replies it is set to
+the replay version's epoch but the user_version's version. We also now
+fill in the version_t portion of the bad_replay_version on read ops as
+well as write ops, which should be fine for all old clients.
+
+For new clients, we prevent them from reading bad_replay_version and
+add two proper members: user_version and replay_version; user_version
+is filled in on every operation (reads included) while replay_version
+is filled in for writes.
+
+The objclass function get_current_version() now always returns the
+pg->info.last_user_version, which means it is guaranteed to contain
+the version of the last user update in the PG (including on reads!).
diff --git a/doc/index.rst b/doc/index.rst
index 8bf5340b2f6..4068be599e5 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -90,6 +90,7 @@ about Ceph, see our `Architecture`_ section.
:maxdepth: 1
:hidden:
+ start/intro
start/index
install/index
rados/index
diff --git a/doc/install/index.rst b/doc/install/index.rst
index 347b6ae9ac2..3be09c5d0df 100644
--- a/doc/install/index.rst
+++ b/doc/install/index.rst
@@ -1,50 +1,54 @@
-==============
- Installation
-==============
-
-The Ceph Object Store is the foundation of all Ceph clusters, and it consists
-primarily of two types of daemons: Object Storage Daemons (OSDs) and monitors.
-The Ceph Object Store is based upon the concept of
-:abbr:`RADOS (Reliable Autonomic Distributed Object Store)`, which eliminates
-single points of failure and delivers infinite scalability. For details on
-the architecture of Ceph and RADOS, refer to `Ceph Architecture`_. All Ceph
-deployments have OSDs and monitors, so you should prepare your Ceph cluster
-by focusing first on the object storage cluster.
+=======================
+ Installation (Manual)
+=======================
.. raw:: html
- <table cellpadding="10"><colgroup><col width="33%"><col width="33%"><col width="33%"></colgroup><tbody valign="top"><tr><td><h3>Recommendations</h3>
-
-To begin using Ceph in production, you should review our hardware
-recommendations and operating system recommendations. Many of the
-frequently-asked questions in our mailing list involve hardware-related
-questions and how to install Ceph on various distributions.
+ <table><colgroup><col width="50%"><col width="50%"></colgroup><tbody valign="top"><tr><td><h3>Advanced Package Tool (APT)</h3>
+
+If you are deploying a Ceph cluster on Debian or Ubuntu distributions,
+use the instructions below to install packages manually.
.. toctree::
:maxdepth: 2
- Hardware Recommendations <hardware-recommendations>
- OS Recommendations <os-recommendations>
-
-.. raw:: html
+ Installing Debian/Ubuntu Packages <debian>
+ Installing on Calxeda Hardware <calxeda>
+ Installing QEMU <qemu-deb>
+ Installing libvirt <libvirt-deb>
- </td><td><h3>Installation</h3>
+.. raw:: html
-If you are deploying a Ceph cluster (that is, not developing Ceph),
-install Ceph using our stable release packages. For testing, you
-may install development release and testing packages.
+ </td><td><h3>Red Hat Package Manager (RPM) / Yellowdog Updater, Modified (YUM) </h3>
+
+If you are deploying a Ceph cluster on Red Hat (rhel6), CentOS (el6), Fedora
+17-19 (f17-f19), OpenSUSE 12 (opensuse12), and SLES (sles11) distributions, use
+the instructions below to install packages manually.
.. toctree::
:maxdepth: 2
- Installing Debian/Ubuntu Packages <debian>
Installing RPM Packages <rpm>
- Installing on Calxeda <calxeda>
+ Installing YUM Priorities <yum-priorities>
+ Installing QEMU <qemu-rpm>
+ Installing libvirt <libvirt-rpm>
+
+.. raw:: html
+
+ </td></tr><tr><td><h3>Upgrading Ceph</h3>
+
+If you are upgrading Ceph from a previous release, please read the upgrade
+documentation to ensure that you follow the proper upgrade sequence.
+
+.. toctree::
+ :maxdepth: 2
+
Upgrading Ceph <upgrading-ceph>
+
-.. raw:: html
+.. raw:: html
- </td><td><h3>Building Ceph from Source</h3>
+ </td><td><h3>Building Ceph</h3>
You can build Ceph from source by downloading a release or cloning the ``ceph``
repository at github. If you intend to build Ceph from source, please see the
@@ -63,9 +67,10 @@ will save you time.
Build a Package <build-packages>
Contributing Code <contributing>
+See the `Development`_ section for additional development details.
.. raw:: html
</td></tr></tbody></table>
-
-.. _Ceph Architecture: ../architecture/
+
+.. _Development: ../../dev \ No newline at end of file
diff --git a/doc/install/libvirt-deb.rst b/doc/install/libvirt-deb.rst
new file mode 100644
index 00000000000..9365e46c747
--- /dev/null
+++ b/doc/install/libvirt-deb.rst
@@ -0,0 +1,43 @@
+====================
+ Installing libvirt
+====================
+
+
+Prerequisites
+=============
+
+- `Install`_ and `configure`_ a Ceph Storage Cluster
+- `Install and configure`_ QEMU/KVM
+
+
+Installing ``libvirt`` on Ubuntu 12.04 Precise
+==============================================
+
+``libvirt`` packages are incorporated into the Ubuntu 12.04 precise
+distribution. To install ``libvirt`` on precise, execute the following::
+
+ sudo apt-get update && sudo apt-get install libvirt-bin
+
+
+Installing ``libvirt`` on Earlier Versions of Ubuntu
+====================================================
+
+For Ubuntu distributions 11.10 oneiric and earlier, you must build ``libvirt``
+from source. Clone the ``libvirt`` repository, and use `AutoGen`_ to generate
+the build. Then, execute ``make`` and ``make install`` to complete the
+installation. For example::
+
+ git clone git://libvirt.org/libvirt.git
+ cd libvirt
+ ./autogen.sh
+ make
+ sudo make install
+
+See `libvirt Installation`_ for details.
+
+
+.. _libvirt Installation: http://www.libvirt.org/compiling.html
+.. _AutoGen: http://www.gnu.org/software/autogen/
+.. _Install: ../index
+.. _configure: ../../rados/configuration
+.. _Install and configure: ../../rbd/qemu-rbd
diff --git a/doc/install/libvirt-rpm.rst b/doc/install/libvirt-rpm.rst
new file mode 100644
index 00000000000..a94c6e8ae12
--- /dev/null
+++ b/doc/install/libvirt-rpm.rst
@@ -0,0 +1,19 @@
+====================
+ Installing libvirt
+====================
+
+To use ``libvirt`` with a Ceph Storage Cluster, you must
+have a running Ceph Storage Cluster. You must also install QEMU.
+See `Installing QEMU`_ for details.
+
+
+``libvirt`` packages are incorporated into the recent CentOS/RHEL distributions.
+To install ``libvirt``, execute the following::
+
+ sudo yum install libvirt
+
+See `libvirt Installation`_ for details.
+
+
+.. _libvirt Installation: http://www.libvirt.org/compiling.html
+.. _Installing QEMU: ../qemu-rpm \ No newline at end of file
diff --git a/doc/install/qemu-deb.rst b/doc/install/qemu-deb.rst
new file mode 100644
index 00000000000..29abeafa3bc
--- /dev/null
+++ b/doc/install/qemu-deb.rst
@@ -0,0 +1,26 @@
+=================
+ Installing QEMU
+=================
+
+
+
+Installing QEMU (12.04 Precise and later)
+=========================================
+
+QEMU packages are incorporated into Ubuntu 12.04 Precise Pangolin and later
+versions. To install QEMU, execute the following::
+
+ sudo apt-get install qemu
+
+Installing QEMU (11.10 Oneiric and earlier)
+===========================================
+
+For Ubuntu distributions 11.10 Oneiric and earlier, you must install
+the 0.15 version of QEMU or later. To build QEMU from source, use the
+following procedure::
+
+ cd {your-development-directory}
+ git clone git://git.qemu.org/qemu.git
+ cd qemu
+ ./configure --enable-rbd
+ make; make install
diff --git a/doc/install/qemu-rpm.rst b/doc/install/qemu-rpm.rst
new file mode 100644
index 00000000000..67da2c3714c
--- /dev/null
+++ b/doc/install/qemu-rpm.rst
@@ -0,0 +1,56 @@
+=================
+ Installing QEMU
+=================
+
+To install QEMU with ``yum``, you must ensure that you have
+``yum-plugin-priorities`` installed. See `Installing YUM Priorities`_
+for details.
+
+To install QEMU, execute the following:
+
+#. Create a ``/etc/yum.repos.d/ceph-qemu.repo`` file with the following
+ contents::
+
+ [ceph-qemu]
+ name=Ceph Packages for QEMU
+ baseurl=http://ceph.com/packages/ceph-extras/rpm/centos6.3/$basearch
+ enabled=1
+ priority=2
+ gpgcheck=1
+ type=rpm-md
+ gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc
+
+ [ceph-qemu-noarch]
+ name=Ceph QEMU noarch
+ baseurl=http://ceph.com/packages/ceph-extras/rpm/centos6.3/noarch
+ enabled=1
+ priority=2
+ gpgcheck=1
+ type=rpm-md
+ gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc
+
+ [ceph-qemu-source]
+ name=Ceph QEMU Sources
+ baseurl=http://ceph.com/packages/ceph-extras/rpm/centos6.3/SRPMS
+ enabled=1
+ priority=2
+ gpgcheck=1
+ type=rpm-md
+ gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc
+
+#. Update your repositories. ::
+
+ sudo yum update
+
+#. Install QEMU for Ceph. ::
+
+ sudo yum install qemu-kvm qemu-kvm-tools qemu-img
+
+#. Install additional QEMU packages (optional)::
+
+ sudo yum install qemu-guest-agent qemu-guest-agent-win32
+
+See `QEMU and Block Devices`_ for usage.
+
+.. _QEMU and Block Devices: ../../rbd/qemu-rbd
+.. _Installing YUM Priorities: ../yum-priorities \ No newline at end of file
diff --git a/doc/install/rpm.rst b/doc/install/rpm.rst
index 72934cc8d5e..9e8cdcd003c 100644
--- a/doc/install/rpm.rst
+++ b/doc/install/rpm.rst
@@ -7,6 +7,7 @@ development release packages (for the latest features), or development
testing packages (for development and QA only). Do not add multiple
package sources at the same time.
+
Install Release Key
===================
@@ -139,144 +140,54 @@ You can download the RPMs directly from::
-Installing Ceph Deploy
-======================
-
-Once you have added either release or development packages to ``yum``, you
-can install ``ceph-deploy``. ::
-
- sudo yum install ceph-deploy python-pushy
-
-
-
-Installing Ceph Packages
-========================
-
-Once you have added either release or development packages to ``yum``, you
-can install Ceph packages. You can also use ``ceph-deploy`` to install Ceph
-packages. ::
-
- sudo yum install ceph
-
-
-
-Installing Ceph Object Storage
-==============================
-
-:term:`Ceph Object Storage` runs on Apache and FastCGI in conjunction with the
-:term:`Ceph Storage Cluster`.
-
-#. Install Apache and FastCGI. ::
-
- rpm -ivh fcgi-2.4.0-10.el6.x86_64.rpm
- rpm -ivh mod_fastcgi-2.4.6-2.el6.rf.x86_64.rpm
-
-
-#. Install the Ceph Object Storage daemon. ::
+Adding Ceph to YUM
+==================
- yum install ceph-radosgw
+You may also add Ceph to the ``/etc/yum.repos.d`` directory. Create a
+``ceph.repo`` file. In the example below, replace ``{ceph-stable}`` with
+a stable release of Ceph (e.g., ``cuttlefish``, ``dumpling``, etc.) and
+``{distro}`` with your Linux distribution (e.g., ``el6``, ``rhel6``, etc.). ::
+ [ceph]
+ name=Ceph packages for $basearch
+ baseurl=http://ceph.com/rpm-{ceph-stable}/{distro}/$basearch
+ enabled=1
+ gpgcheck=1
+ type=rpm-md
+ gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc
-#. Add the following lines to your Ceph configuration file.
+ [ceph-noarch]
+ name=Ceph noarch packages
+ baseurl=http://ceph.com/rpm-{ceph-stable}/{distro}/noarch
+ enabled=1
+ gpgcheck=1
+ type=rpm-md
+ gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc
-.. code-block:: ini
+ [ceph-source]
+ name=Ceph source packages
+ baseurl=http://ceph.com/rpm-{ceph-stable}/{distro}/SRPMS
+ enabled=0
+ gpgcheck=1
+ type=rpm-md
+ gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc
- [client.radosgw.gateway]
- host = {fqdn}
- keyring = /etc/ceph/keyring.radosgw.gateway
- rgw socket path = /tmp/radosgw.sock
- log file = /var/log/ceph/radosgw.log
- rgw print continue = false
-
-.. note:: Replace ``{fqdn}`` with the output from ``hostname``. This is
- important. Debian systems use the simple hostname, but on CentOS 6/RHEL 6
- you must use the fully qualified domain name.
-
-#. Create a data directory. ::
-
- mkdir -p /var/lib/ceph/radosgw/ceph-radosgw.gateway
-
-
-#. Change ``httpd ServerName`` in ``/etc/httpd/conf/httpd.conf``. ::
-
- ServerName {FQDN}
-
-
-#. Create an Apache httpd virtual host in ``/etc/httpd/conf.d/rgw.conf``.
-
-.. code-block:: ini
-
- FastCgiExternalServer /var/www/s3gw.fcgi -socket /tmp/radosgw.sock
- <VirtualHost *:80>
- ServerName <FQDN of the host>
- ServerAdmin root@localhost
- DocumentRoot /var/www
- RewriteEngine On
- RewriteRule ^/([a-zA-Z0-9-_.]*)([/]?.*) /s3gw.fcgi?page=$1&params=$2&%{QUERY_STRING} [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L]
- <IfModule mod_fastcgi.c>
- <Directory /var/www>
- Options +ExecCGI
- AllowOverride All
- SetHandler fastcgi-script
- Order allow,deny
- Allow from all
- AuthBasicAuthoritative Off
- </Directory>
- </IfModule>
- AllowEncodedSlashes On
- ErrorLog /var/log/httpd/error.log
- CustomLog /var/log/httpd/access.log combined
- ServerSignature Off
- </VirtualHost>
-
-#. Turn off ``fastcgiwrapper`` in ``/etc/httpd/conf.d/fastcgi.conf`` by
- commenting out the following line::
-
- #FastCgiWrapper On
-
-
-#. Add a ``fastcgi`` script. ::
-
- #!/bin/sh
- exec /usr/bin/radosgw -c /etc/ceph/ceph.conf -n client.radosgw.gateway
-
-
-#. Make ``s3gw.fcgi`` executable::
-
- chmod +x /var/www/rgw/s3gw.fcgi
-
-
-#. Create a user key. ::
-
- ceph-authtool -C -n client.radosgw.gateway --gen-key /etc/ceph/keyring.radosgw.gateway
- ceph-authtool -n client.radosgw.gateway --cap mon 'allow rw' --cap osd 'allow rwx' /etc/ceph/keyring.radosgw.gateway
- ceph auth add client.radosgw.gateway --in-file=/etc/ceph/keyring.radosgw.gateway
-
-
-#. Please make sure ``/etc/ceph/keyring.radosgw.gateway`` file and
- ``/var/log/ceph/radosgw.log`` are accessible by the ``apache`` user. ::
-
- sudo chown apache:apache /etc/ceph/keyring.radosgw.gateway
- sudo chown apache:apache /var/log/ceph/radosgw.log
-
-.. note:: This is important. The user is ``root`` for Debian.
+Installing Ceph Deploy
+======================
-#. Create ``.rgw.buckets`` and add it to the Ceph Object Storage daemon. ::
+Once you have added either release or development packages, or added a
+``ceph.repo`` file to ``/etc/yum.repos.d``, you can install ``ceph-deploy``. ::
- rados mkpool .rgw.buckets
- radosgw-admin pool add --pool .rgw.buckets
+ sudo yum install ceph-deploy python-pushy
-#. Configure Apache and the Ceph Object Storage daemon to start on boot. ::
- chkconfig httpd on
- chkconfig ceph-radosgw on
+Installing Ceph Packages
+========================
-#. Start the services. ::
+Once you have added either release or development packages, or added a
+``ceph.repo`` file to ``/etc/yum.repos.d``, you can install Ceph packages. ::
- /etc/init.d/httpd start
- /etc/init.d/ceph-radosgw start
-
-See `Ceph Object Storage`_ for additional details.
+ sudo yum install ceph
-.. _Ceph Object Storage: ../../radosgw
+.. note:: You can also use ``ceph-deploy`` to install Ceph packages.
diff --git a/doc/install/yum-priorities.rst b/doc/install/yum-priorities.rst
new file mode 100644
index 00000000000..e4adb72b7dd
--- /dev/null
+++ b/doc/install/yum-priorities.rst
@@ -0,0 +1,20 @@
+===========================
+ Installing YUM Priorities
+===========================
+
+Ceph builds packages for Apache and FastCGI (for 100-continue support) and
+QEMU (for ``rbd`` support). You must set priorities in your ``.repo``
+files to ensure that ``yum`` installs the Ceph packages instead of the
+standard packages. The ``priorities`` setting requires you to install
+and enable ``yum-plugin-priorities``.
+
+#. Install ``yum-plugin-priorities``. ::
+
+ sudo yum install yum-plugin-priorities
+
+#. Ensure ``/etc/yum/pluginconf.d/priorities.conf`` exists.
+
+#. Ensure ``priorities.conf`` enables the plugin. ::
+
+ [main]
+ enabled = 1
diff --git a/doc/man/8/ceph-conf.rst b/doc/man/8/ceph-conf.rst
index 7b4f83a9282..48d63d8fb07 100644
--- a/doc/man/8/ceph-conf.rst
+++ b/doc/man/8/ceph-conf.rst
@@ -84,4 +84,3 @@ See also
========
:doc:`ceph <ceph>`\(8),
-:doc:`mkcephfs <mkcephfs>`\(8)
diff --git a/doc/man/8/ceph-osd.rst b/doc/man/8/ceph-osd.rst
index 451ca6d956b..9a08bef7e04 100644
--- a/doc/man/8/ceph-osd.rst
+++ b/doc/man/8/ceph-osd.rst
@@ -46,8 +46,7 @@ Options
.. option:: --mkfs
- Create an empty object repository. Normally invoked by
- :doc:`mkcephfs <mkcephfs>`\(8). This also initializes the journal
+ Create an empty object repository. This also initializes the journal
(if one is defined).
.. option:: --mkkey
diff --git a/doc/man/8/ceph-post-file.rst b/doc/man/8/ceph-post-file.rst
index 5625843eaa6..f16fbd8a63c 100644
--- a/doc/man/8/ceph-post-file.rst
+++ b/doc/man/8/ceph-post-file.rst
@@ -1,6 +1,6 @@
-========================================
+==================================================
ceph-post-file -- post files for ceph developers
-========================================
+==================================================
.. program:: ceph-post-file
diff --git a/doc/man/8/ceph.rst b/doc/man/8/ceph.rst
index 634c82c433e..661d93ad5b7 100644
--- a/doc/man/8/ceph.rst
+++ b/doc/man/8/ceph.rst
@@ -95,4 +95,3 @@ See also
========
:doc:`ceph <ceph>`\(8),
-:doc:`mkcephfs <mkcephfs>`\(8)
diff --git a/doc/man/8/crushtool.rst b/doc/man/8/crushtool.rst
index 187352a5c04..97303cc4bfa 100644
--- a/doc/man/8/crushtool.rst
+++ b/doc/man/8/crushtool.rst
@@ -7,8 +7,8 @@
Synopsis
========
-| **crushtool** ( -d *map* | -c *map.txt* | --build *numosds*
- *layer1* *...* ) [ -o *outfile* [ --clobber ]]
+| **crushtool** ( -d *map* | -c *map.txt* | --build --num_osds *numosds*
+ *layer1* *...* ) [ -o *outfile* ]
Description
@@ -41,9 +41,11 @@ The tool has four modes of operation.
will create a relatively generic map with the given layer
structure. See below for examples.
-.. option:: --test ...
+.. option:: --test
+
will perform a dry run of a CRUSH mapping for a range of input object
- names, see crushtool --help for more information.
+ names, see crushtool --help for more information.
+
Options
=======
@@ -52,10 +54,7 @@ Options
will specify the output file.
-.. option:: --clobber
-
- will allow the tool to overwrite an existing outfile (it will normally refuse).
-
+
Building a map
==============
@@ -83,22 +82,26 @@ preceding layer.
Example
=======
-Suppose we have 128 devices, each grouped into shelves with 4 devices
-each, and 8 shelves per rack. We could create a three level hierarchy
-with::
+Suppose we have two rows with two racks each and 20 nodes per rack. Suppose
+each node contains 4 storage devices for Ceph OSD Daemons. This configuration
+allows us to deploy 320 Ceph OSD Daemons. Let's assume a 42U rack with 2U nodes,
+leaving an extra 2U for a rack switch.
+
+To reflect our hierarchy of devices, nodes, racks and rows, we would execute
+the following::
- crushtool --build 128 shelf uniform 4 rack straw 8 root straw 0 -o map
+ crushtool -o crushmap --build --num_osds 320 node straw 4 rack straw 20 row straw 2
To adjust the default (generic) mapping rules, we can run::
# decompile
- crushtool -d map -o map.txt
+ crushtool -d crushmap -o map.txt
# edit
vi map.txt
# recompile
- crushtool -c map.txt -o map
+ crushtool -c map.txt -o crushmap
Availability
@@ -114,4 +117,3 @@ See also
:doc:`ceph <ceph>`\(8),
:doc:`osdmaptool <osdmaptool>`\(8),
-:doc:`mkcephfs <mkcephfs>`\(8)
diff --git a/doc/man/8/mkcephfs.rst b/doc/man/8/mkcephfs.rst
deleted file mode 100644
index 054a8deae1d..00000000000
--- a/doc/man/8/mkcephfs.rst
+++ /dev/null
@@ -1,123 +0,0 @@
-=======================================
- mkcephfs -- create a ceph file system
-=======================================
-
-.. program:: mkcephfs
-
-Synopsis
-========
-
-| **mkcephfs** -c *ceph.conf* [ --mkfs ] [ -a, --all-hosts [ -k
- */path/to/admin.keyring* ] ]
-
-
-Description
-===========
-
-**mkcephfs** is used to create an empty Ceph file system, possibly
-spanning multiple hosts. The ceph.conf file describes the composition
-of the entire Ceph cluster, including which hosts are participating,
-which daemons run where, and which paths are used to store file system
-data or metadata.
-
-The mkcephfs tool can be used in two ways. If -a is used, it will use
-ssh and scp to connect to remote hosts on your behalf and do the setup
-of the entire cluster. This is the easiest solution, but can also be
-inconvenient (if you don't have ssh to connect without prompting for
-passwords) or slow (if you have a large cluster).
-
-Alternatively, you can run each setup phase manually. First, you need
-to prepare a monmap that will be shared by each node::
-
- # prepare
- master# mkdir /tmp/foo
- master# mkcephfs -c /etc/ceph/ceph.conf \
- --prepare-monmap -d /tmp/foo
-
-Share the ``/tmp/foo`` directory with other nodes in whatever way is
-convenient for you. On each OSD and MDS node::
-
- osdnode# mkcephfs --init-local-daemons osd -d /tmp/foo
- mdsnode# mkcephfs --init-local-daemons mds -d /tmp/foo
-
-Collect the contents of the /tmp/foo directories back onto a single
-node, and then::
-
- master# mkcephfs --prepare-mon -d /tmp/foo
-
-Finally, distribute ``/tmp/foo`` to all monitor nodes and, on each of
-those nodes::
-
- monnode# mkcephfs --init-local-daemons mon -d /tmp/foo
-
-
-Options
-=======
-
-.. option:: -a, --allhosts
-
- Performs the necessary initialization steps on all hosts in the
- cluster, executing commands via SSH.
-
-.. option:: -c ceph.conf, --conf=ceph.conf
-
- Use the given conf file instead of the default ``/etc/ceph/ceph.conf``.
-
-.. option:: -k /path/to/keyring
-
- When ``-a`` is used, we can specify a location to copy the
- client.admin keyring, which is used to administer the cluster. The
- default is ``/etc/ceph/keyring`` (or whatever is specified in the
- config file).
-
-.. option:: --mkfs
-
- Create and mount the file systems specified in the ceph.conf for
- OSD data storage using mkfs.$type. The ``devs`` option in ceph.conf
- must specify the device(s) and the ``osd mkfs type`` option must
- specify the file system type (normally one of btrfs, xfs, or ext4).
-
-.. option:: --no-copy-conf
-
- By default, mkcephfs with -a will copy the new configuration to
- /etc/ceph/ceph.conf on each node in the cluster. This option
- disables that behavior.
-
-Subcommands
-===========
-
-The sub-commands performed during cluster setup can be run individually with
-
-.. option:: --prepare-monmap -d dir -c ceph.conf
-
- Create an initial monmap with a random fsid/uuid and store it and
- the ceph.conf in dir.
-
-.. option:: --init-local-daemons type -d dir
-
- Initialize any daemons of type type on the local host using the
- monmap in dir. For types osd and mds, the resulting authentication
- keys will be placed in dir. For type mon, the initial data files
- generated by --prepare-mon (below) are expected in dir.
-
-.. option:: --prepare-mon -d dir
-
- Prepare the initial monitor data based on the monmap, OSD, and MDS
- authentication keys collected in dir, and put the result in dir.
-
-
-Availability
-============
-
-**mkcephfs** is part of the Ceph distributed file system. Please refer
-to the Ceph documentation at http://ceph.com/docs for more
-information.
-
-
-See also
-========
-
-:doc:`ceph <ceph>`\(8),
-:doc:`monmaptool <monmaptool>`\(8),
-:doc:`osdmaptool <osdmaptool>`\(8),
-:doc:`crushtool <crushtool>`\(8)
diff --git a/doc/man/8/monmaptool.rst b/doc/man/8/monmaptool.rst
index 8415ba4136a..42af2faea8b 100644
--- a/doc/man/8/monmaptool.rst
+++ b/doc/man/8/monmaptool.rst
@@ -103,4 +103,3 @@ See also
:doc:`ceph <ceph>`\(8),
:doc:`crushtool <crushtool>`\(8),
-:doc:`mkcephfs <mkcephfs>`\(8)
diff --git a/doc/man/8/osdmaptool.rst b/doc/man/8/osdmaptool.rst
index 27ffb6eba72..c815e485685 100644
--- a/doc/man/8/osdmaptool.rst
+++ b/doc/man/8/osdmaptool.rst
@@ -73,4 +73,3 @@ See also
:doc:`ceph <ceph>`\(8),
:doc:`crushtool <crushtool>`\(8),
-:doc:`mkcephfs <mkcephfs>`\(8)
diff --git a/doc/man/8/rbd.rst b/doc/man/8/rbd.rst
index f50d93eb04c..2d78748f5f2 100644
--- a/doc/man/8/rbd.rst
+++ b/doc/man/8/rbd.rst
@@ -113,6 +113,10 @@ Parameters
Make json or xml formatted output more human-readable.
+.. option:: --read-only
+
+ Set device readonly when mapping image.
+
Commands
========
diff --git a/doc/rados/configuration/ceph-conf.rst b/doc/rados/configuration/ceph-conf.rst
index fc316478034..e0633483120 100644
--- a/doc/rados/configuration/ceph-conf.rst
+++ b/doc/rados/configuration/ceph-conf.rst
@@ -504,12 +504,8 @@ See `General Settings`_, `OSD Settings`_, `Monitor Settings`_, `MDS Settings`_,
.. _RGW Settings: ../../../radosgw/config-ref/
.. _Log Settings: ../log-and-debug-ref
-When deploying the Ceph configuration file, ensure that you use the cluster name
-in your command line syntax. For example::
- ssh myserver01 sudo tee /etc/ceph/openstack.conf < /etc/ceph/openstack.conf
-
-When creating default directories or files, you should also use the cluster
+When creating default directories or files, you should use the cluster
name at the appropriate places in the path. For example::
sudo mkdir /var/lib/ceph/osd/openstack-0
@@ -520,10 +516,10 @@ name at the appropriate places in the path. For example::
have monitors using port 6789, use a different port for your other cluster(s).
To invoke a cluster other than the default ``ceph`` cluster, use the
-``--cluster=clustername`` option with the ``ceph`` command. For example::
-
- ceph --cluster=openstack health
+``-c {filename}.conf`` option with the ``ceph`` command. For example::
+ ceph -c {cluster-name}.conf health
+ ceph -c openstack.conf health
.. _Hardware Recommendations: ../../../install/hardware-recommendations
diff --git a/doc/rados/configuration/journal-ref.rst b/doc/rados/configuration/journal-ref.rst
index b7344544b9a..97300f4a57f 100644
--- a/doc/rados/configuration/journal-ref.rst
+++ b/doc/rados/configuration/journal-ref.rst
@@ -27,6 +27,7 @@ Ceph OSDs use a journal for two reasons: speed and consistency.
Ceph OSD Daemons support the following journal settings:
+
``journal dio``
:Description: Enables direct i/o to the journal. Requires ``journal block
@@ -37,14 +38,17 @@ Ceph OSD Daemons support the following journal settings:
:Default: ``true``
+
``journal aio``
+.. versionchanged:: 0.61 Cuttlefish
+
:Description: Enables using ``libaio`` for asynchronous writes to the journal.
Requires ``journal dio`` set to ``true``.
:Type: Boolean
:Required: No.
-:Default: ``false``
+:Default: Version 0.61 and later, ``true``. Version 0.60 and earlier, ``false``.
``journal block align``
diff --git a/doc/rados/deployment/ceph-deploy-osd.rst b/doc/rados/deployment/ceph-deploy-osd.rst
index 9b27ac41094..d504b7eebbf 100644
--- a/doc/rados/deployment/ceph-deploy-osd.rst
+++ b/doc/rados/deployment/ceph-deploy-osd.rst
@@ -8,10 +8,6 @@ write data to the disk and to journals. So you need to provide a disk for the
OSD and a path to the journal partition (i.e., this is the most common
configuration, but you may configure your system to your own needs).
-By default, ``ceph-deploy`` will create an OSD with the XFS filesystem. You may
-override the filesystem type by providing a ``--fs-type FS_TYPE`` argument,
-where ``FS_TYPE`` is an alternate filesystem such as ``ext4`` or ``btrfs``.
-
In Ceph v0.60 and later releases, Ceph supports ``dm-crypt`` on disk encryption.
You may specify the ``--dm-crypt`` argument when preparing an OSD to tell
``ceph-deploy`` that you want to use encryption. You may also specify the
diff --git a/doc/rados/man/index.rst b/doc/rados/man/index.rst
index d7bbe9d3acb..e4f0f23bfa2 100644
--- a/doc/rados/man/index.rst
+++ b/doc/rados/man/index.rst
@@ -21,3 +21,9 @@
../../man/8/monmaptool.rst
../../man/8/osdmaptool.rst
../../man/8/rados.rst
+
+
+.. toctree::
+ :hidden:
+
+ ../../man/8/ceph-post-file.rst \ No newline at end of file
diff --git a/doc/rados/operations/add-or-rm-mons.rst b/doc/rados/operations/add-or-rm-mons.rst
index 0a15781c6ea..e3bac1fca09 100644
--- a/doc/rados/operations/add-or-rm-mons.rst
+++ b/doc/rados/operations/add-or-rm-mons.rst
@@ -32,7 +32,7 @@ version of Linux installed (typically Ubuntu 12.04 precise).
Add your monitor host to a rack in your cluster, connect it to the network
and ensure that it has network connectivity.
-.. _Hardware Recommendations: ../../install/hardware-recommendations
+.. _Hardware Recommendations: ../../../start/hardware-recommendations
Install the Required Software
-----------------------------
@@ -42,17 +42,9 @@ manually. See `Installing Debian/Ubuntu Packages`_ for details.
You should configure SSH to a user with password-less authentication
and root permissions.
-.. _Installing Debian/Ubuntu Packages: ../../install/debian
+.. _Installing Debian/Ubuntu Packages: ../../../install/debian
-For clusters deployed with Chef, create a `chef user`_, `configure
-SSH keys`_, `install Ruby`_ and `install the Chef client`_ on your host. See
-`Installing Chef`_ for details.
-.. _chef user: ../../install/chef#createuser
-.. _configure SSH keys: ../../install/chef#genkeys
-.. _install the Chef client: ../../install/chef#installchef
-.. _Installing Chef: ../../install/chef
-.. _install Ruby: ../../install/chef#installruby
.. _Adding a Monitor (Manual):
@@ -164,7 +156,7 @@ a cluster that has placement groups that are persistently not ``active + clean``
ceph mon dump
ssh {mon-host}
-#. Stop the ``ceph-mon'' daemon and extract a copy of the monap file. ::
+#. Stop the ``ceph-mon`` daemon and extract a copy of the monmap file. ::
service ceph stop mon || stop ceph-mon-all
ceph-mon -i {mon-id} --extract-monmap {map-path}
diff --git a/doc/rados/operations/authentication.rst b/doc/rados/operations/authentication.rst
index 0b71d08b0c4..d9995da8fb8 100644
--- a/doc/rados/operations/authentication.rst
+++ b/doc/rados/operations/authentication.rst
@@ -126,18 +126,15 @@ you may skip the steps related to generating keys.
auth service required = cephx
auth client required = cephx
-#. Or, enable ``cephx`` authentication for versions ``0.50`` and below by
+#. Or, enable ``cephx`` authentication for Ceph versions ``0.50`` and below by
setting the following option in the ``[global]`` section of your `Ceph
- configuration`_ file::
+ configuration`_ file. **NOTE:** Deprecated as of version ``0.51``. ::
auth supported = cephx
-.. deprecated:: 0.51
-#. Start or restart the Ceph cluster. ::
+#. Start or restart the Ceph cluster. See `Operating a Cluster`_ for details.
- sudo service ceph -a start
- sudo service ceph -a restart
.. _disable-cephx:
@@ -157,6 +154,7 @@ during setup and/or troubleshooting to temporarily disable authentication.
auth cluster required = none
auth service required = none
auth client required = none
+ auth supported = none
#. Or, disable ``cephx`` authentication for versions ``0.50`` and below
(deprecated as of version 0.51) by setting the following option in the
@@ -164,10 +162,8 @@ during setup and/or troubleshooting to temporarily disable authentication.
auth supported = none
-#. Start or restart the Ceph cluster. ::
+#. Start or restart the Ceph cluster. See `Operating a Cluster`_ for details.
- sudo service ceph -a start
- sudo service ceph -a restart
Daemon Keyrings
@@ -422,3 +418,4 @@ of the enhanced authentication.
.. _Ceph configuration: ../../configuration/ceph-conf
.. _Cephx Configuration Reference: ../../configuration/auth-config-ref
+.. _Operating a Cluster: ../operating \ No newline at end of file
diff --git a/doc/rados/operations/operating.rst b/doc/rados/operations/operating.rst
index 591704217d0..8c62ed5cdbf 100644
--- a/doc/rados/operations/operating.rst
+++ b/doc/rados/operations/operating.rst
@@ -7,29 +7,32 @@
Running Ceph with Upstart
=========================
-When deploying Ceph Cuttlefish and beyond with ``ceph-deploy``, you may start
-and stop Ceph daemons or the entire cluster using the event-based `Upstart`_.
-Upstart does not require you to define daemon instances in the Ceph configuration
-file (although, they are still required for ``sysvinit`` should you choose to
-use it).
+When deploying Ceph Cuttlefish and beyond with ``ceph-deploy`` on Debian/Ubuntu
+distributions, you may start and stop Ceph daemons on a :term:`Ceph Node` using
+the event-based `Upstart`_. Upstart does not require you to define daemon
+instances in the Ceph configuration file.
-To list the Ceph Upstart jobs and instances, execute::
+To list the Ceph Upstart jobs and instances on a node, execute::
sudo initctl list | grep ceph
See `initctl`_ for additional details.
-Starting a Cluster
-------------------
-To start the cluster, execute the following::
+Starting all Daemons
+--------------------
+
+To start all daemons on a Ceph Node (irrespective of type), execute the
+following::
sudo start ceph-all
-Stopping a Cluster
-------------------
-To stop the cluster, execute the following::
+Stopping all Daemons
+--------------------
+
+To stop all daemons on a Ceph Node (irrespective of type), execute the
+following::
sudo stop ceph-all
@@ -37,7 +40,8 @@ To stop the cluster, execute the following::
Starting all Daemons by Type
----------------------------
-To start all daemons of a particular type, execute one of the following::
+To start all daemons of a particular type on a Ceph Node, execute one of the
+following::
sudo start ceph-osd-all
sudo start ceph-mon-all
@@ -47,7 +51,8 @@ To start all daemons of a particular type, execute one of the following::
Stopping all Daemons by Type
----------------------------
-To stop all daemons of a particular type, execute one of the following::
+To stop all daemons of a particular type on a Ceph Node, execute one of the
+following::
sudo stop ceph-osd-all
sudo stop ceph-mon-all
@@ -57,7 +62,8 @@ To stop all daemons of a particular type, execute one of the following::
Starting a Daemon
-----------------
-To start a specific daemon instance, execute one of the following::
+To start a specific daemon instance on a Ceph Node, execute one of the
+following::
sudo start ceph-osd id={id}
sudo start ceph-mon id={hostname}
@@ -73,7 +79,8 @@ For example::
Stopping a Daemon
-----------------
-To stop a specific daemon instance, execute one of the following::
+To stop a specific daemon instance on a Ceph Node, execute one of the
+following::
sudo stop ceph-osd id={id}
sudo stop ceph-mon id={hostname}
@@ -86,29 +93,20 @@ For example::
sudo start ceph-mds id=ceph-server
-
.. index:: Ceph service; sysvinit; operating a cluster
-Running Ceph as a Service
-=========================
-
-When you deploy Ceph Argonaut or Bobtail with ``mkcephfs``, use the
-service or traditional sysvinit.
-
-The ``ceph`` service provides functionality to **start**, **restart**, and
-**stop** your Ceph cluster. Each time you execute ``ceph`` processes, you
-must specify at least one option and one command. You may also specify a daemon
-type or a daemon instance. For most newer Debian/Ubuntu distributions, you may
-use the following syntax::
+Running Ceph
+============
- sudo service ceph [options] [commands] [daemons]
+Each time you **start**, **restart**, or **stop** Ceph daemons (or your
+entire cluster) you must specify at least one option and one command. You may
+also specify a daemon type or a daemon instance. ::
-For older distributions, you may wish to use the ``/etc/init.d/ceph`` path::
+ {commandline} [options] [commands] [daemons]
- sudo /etc/init.d/ceph [options] [commands] [daemons]
-The ``ceph`` service options include:
+The ``ceph`` options include:
+-----------------+----------+-------------------------------------------------+
| Option | Shortcut | Description |
@@ -127,7 +125,7 @@ The ``ceph`` service options include:
| ``--conf`` | ``-c`` | Use an alternate configuration file. |
+-----------------+----------+-------------------------------------------------+
-The ``ceph`` service commands include:
+The ``ceph`` commands include:
+------------------+------------------------------------------------------------+
| Command | Description |
@@ -145,83 +143,213 @@ The ``ceph`` service commands include:
| ``cleanalllogs`` | Cleans out **everything** in the log directory. |
+------------------+------------------------------------------------------------+
-For subsystem operations, the ``ceph`` service can target specific daemon types by
-adding a particular daemon type for the ``[daemons]`` option. Daemon types include:
+For subsystem operations, the ``ceph`` service can target specific daemon types
+by adding a particular daemon type for the ``[daemons]`` option. Daemon types
+include:
- ``mon``
- ``osd``
- ``mds``
-The ``ceph`` service's ``[daemons]`` setting may also target a specific instance.
-To start a Ceph daemon on the local :term:`Ceph Node`, use the following syntax::
- sudo /etc/init.d/ceph start osd.0
-
-To start a Ceph daemon on another node, use the following syntax::
+Running Ceph with sysvinit
+--------------------------
- sudo /etc/init.d/ceph -a start osd.0
-
-Where ``osd.0`` is the first OSD in the cluster.
+Using traditional ``sysvinit`` is the recommended way to run Ceph with CentOS,
+Red Hat, Fedora, and SLES distributions. You may also use it for older
+distributions of Debian/Ubuntu.
-Starting a Cluster
-------------------
+Starting all Daemons
+~~~~~~~~~~~~~~~~~~~~
To start your Ceph cluster, execute ``ceph`` with the ``start`` command.
-The usage may differ based upon your Linux distribution. For example, for most
-newer Debian/Ubuntu distributions, you may use the following syntax::
-
- sudo service ceph [options] [start|restart] [daemonType|daemonID]
-
-For older distributions, you may wish to use the ``/etc/init.d/ceph`` path::
+Use the following syntax::
sudo /etc/init.d/ceph [options] [start|restart] [daemonType|daemonID]
The following example illustrates a typical use case::
- sudo service ceph -a start
sudo /etc/init.d/ceph -a start
Once you execute with ``-a`` (i.e., execute on all nodes), Ceph should begin
-operating. You may also specify a particular daemon instance to constrain the
-command to a single instance. To start a Ceph daemon on the local Ceph Node,
-use the following syntax::
+operating.
+
+
+Stopping all Daemons
+~~~~~~~~~~~~~~~~~~~~
+
+To stop your Ceph cluster, execute ``ceph`` with the ``stop`` command.
+Use the following syntax::
+
+ sudo /etc/init.d/ceph [options] stop [daemonType|daemonID]
+
+The following example illustrates a typical use case::
+
+ sudo /etc/init.d/ceph -a stop
+
+Once you execute with ``-a`` (i.e., execute on all nodes), Ceph should stop
+operating.
+
+Starting all Daemons by Type
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To start all Ceph daemons of a particular type on the local Ceph Node, use the
+following syntax::
+
+ sudo /etc/init.d/ceph start {daemon-type}
+ sudo /etc/init.d/ceph start osd
+
+To start all Ceph daemons of a particular type on all nodes, use the
+following syntax::
+
+ sudo /etc/init.d/ceph -a start {daemon-type}
+ sudo /etc/init.d/ceph -a start osd
+
+
+Stopping all Daemons by Type
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To stop all Ceph daemons of a particular type on the local Ceph Node, use the
+following syntax::
+
+ sudo /etc/init.d/ceph stop {daemon-type}
+ sudo /etc/init.d/ceph stop osd
+
+To stop all Ceph daemons of a particular type on all nodes, use the
+following syntax::
+
+ sudo /etc/init.d/ceph -a stop {daemon-type}
+ sudo /etc/init.d/ceph -a stop osd
+
+
+Starting a Daemon
+~~~~~~~~~~~~~~~~~
+
+To start a Ceph daemon on the local Ceph Node, use the following syntax::
+
+ sudo /etc/init.d/ceph start {daemon-type}.{instance}
sudo /etc/init.d/ceph start osd.0
To start a Ceph daemon on another node, use the following syntax::
+ sudo /etc/init.d/ceph -a start {daemon-type}.{instance}
sudo /etc/init.d/ceph -a start osd.0
-Stopping a Cluster
-------------------
+Stopping a Daemon
+~~~~~~~~~~~~~~~~~
+
+To stop a Ceph daemon on the local Ceph Node, use the following syntax::
+
+ sudo /etc/init.d/ceph stop {daemon-type}.{instance}
+ sudo /etc/init.d/ceph stop osd.0
+
+To stop a Ceph daemon on another node, use the following syntax::
+
+ sudo /etc/init.d/ceph -a stop {daemon-type}.{instance}
+ sudo /etc/init.d/ceph -a stop osd.0
+
+
+Running Ceph as a Service
+-------------------------
+
+When you deploy Ceph Argonaut or Bobtail with ``mkcephfs``, you operate
+Ceph as a service (you may also use sysvinit).
+
+
+Starting all Daemons
+~~~~~~~~~~~~~~~~~~~~
+
+To start your Ceph cluster, execute ``ceph`` with the ``start`` command.
+Use the following syntax::
+
+ sudo service ceph [options] [start|restart] [daemonType|daemonID]
+
+The following example illustrates a typical use case::
+
+ sudo service ceph -a start
+
+Once you execute with ``-a`` (i.e., execute on all nodes), Ceph should begin
+operating.
+
+
+Stopping all Daemons
+~~~~~~~~~~~~~~~~~~~~
To stop your Ceph cluster, execute ``ceph`` with the ``stop`` command.
-The usage may differ based upon your Linux distribution. For example, for most
-newer Debian/Ubuntu distributions, you may use the following syntax::
+Use the following syntax::
sudo service ceph [options] stop [daemonType|daemonID]
For example::
- sudo service ceph -a stop
-
-For older distributions, you may wish to use the ``/etc/init.d/ceph`` path::
-
- sudo /etc/init.d/ceph -a stop
+ sudo service ceph -a stop
Once you execute with ``-a`` (i.e., execute on all nodes), Ceph should shut
-down. You may also specify a particular daemon instance to constrain the
-command to a single instance. To stop a Ceph daemon on the local Ceph Node,
-use the following syntax::
+down.
- sudo /etc/init.d/ceph stop osd.0
+
+Starting all Daemons by Type
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To start all Ceph daemons of a particular type on the local Ceph Node, use the
+following syntax::
+
+ sudo service ceph start {daemon-type}
+ sudo service ceph start osd
+
+To start all Ceph daemons of a particular type on all nodes, use the following
+syntax::
+
+ sudo service ceph -a start {daemon-type}
+ sudo service ceph -a start osd
+
+
+Stopping all Daemons by Type
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To stop all Ceph daemons of a particular type on the local Ceph Node, use the
+following syntax::
+
+ sudo service ceph stop {daemon-type}
+ sudo service ceph stop osd
+
+To stop all Ceph daemons of a particular type on all nodes, use the following
+syntax::
+
+ sudo service ceph -a stop {daemon-type}
+ sudo service ceph -a stop osd
+
+
+Starting a Daemon
+~~~~~~~~~~~~~~~~~
+
+To start a Ceph daemon on the local Ceph Node, use the following syntax::
+
+ sudo service ceph start {daemon-type}.{instance}
+ sudo service ceph start osd.0
+
+To start a Ceph daemon on another node, use the following syntax::
+
+ sudo service ceph -a start {daemon-type}.{instance}
+ sudo service ceph -a start osd.0
+
+
+Stopping a Daemon
+~~~~~~~~~~~~~~~~~
+
+To stop a Ceph daemon on the local Ceph Node, use the following syntax::
+
+ sudo service ceph stop {daemon-type}.{instance}
+ sudo service ceph stop osd.0
To stop a Ceph daemon on another node, use the following syntax::
- sudo /etc/init.d/ceph -a stop osd.0
+ sudo service ceph -a stop {daemon-type}.{instance}
+ sudo service ceph -a stop osd.0
diff --git a/doc/rados/operations/pools.rst b/doc/rados/operations/pools.rst
index 514a09eeb9a..62623b491ba 100644
--- a/doc/rados/operations/pools.rst
+++ b/doc/rados/operations/pools.rst
@@ -89,12 +89,6 @@ you have many pools with many placement groups (e.g., 50 pools with 100
placement groups each). The point of diminishing returns depends upon the power
of the OSD host.
-.. important:: Increasing the number of placement groups in a pool after you
- create the pool is still an experimental feature in Bobtail (v 0.56). We
- recommend defining a reasonable number of placement groups and maintaining
- that number until Ceph's placement group splitting and merging
- functionality matures.
-
See `Placement Groups`_ for details on calculating an appropriate number of
placement groups for your pool.
diff --git a/doc/rados/troubleshooting/log-and-debug.rst b/doc/rados/troubleshooting/log-and-debug.rst
index 2f2e5e4abc0..7d1ea43d1db 100644
--- a/doc/rados/troubleshooting/log-and-debug.rst
+++ b/doc/rados/troubleshooting/log-and-debug.rst
@@ -243,7 +243,7 @@ to their default level or to a level suitable for normal operations.
+--------------------+-----------+--------------+
| ``rgw`` | 1 | 5 |
+--------------------+-----------+--------------+
-| ``hadoop`` | 1 | 5 |
+| ``javaclient`` | 1 | 5 |
+--------------------+-----------+--------------+
| ``asok`` | 1 | 5 |
+--------------------+-----------+--------------+
diff --git a/doc/radosgw/adminops.rst b/doc/radosgw/adminops.rst
index 50d9870775a..460292be0cd 100644
--- a/doc/radosgw/adminops.rst
+++ b/doc/radosgw/adminops.rst
@@ -13,6 +13,8 @@ Get Usage
Request bandwidth usage information.
+:caps: usage=read
+
Syntax
~~~~~~
@@ -161,6 +163,8 @@ Trim Usage
Remove usage information. With no dates specified, removes all usage
information.
+:caps: usage=write
+
Syntax
~~~~~~
@@ -214,6 +218,8 @@ Get User Info
Get user information. If no user is specified returns the list of all users along with suspension
information.
+:caps: users=read
+
Syntax
~~~~~~
@@ -308,6 +314,7 @@ generated key is added to the keyring without replacing an existing key pair.
If ``access-key`` is specified and refers to an existing key owned by the user
then it will be modified.
+:caps: users=write
Syntax
~~~~~~
@@ -501,6 +508,8 @@ Modify User
Modify a user.
+:caps: users=write
+
Syntax
~~~~~~
@@ -686,6 +695,8 @@ Remove User
Remove an existing user.
+:caps: users=write
+
Syntax
~~~~~~
@@ -733,6 +744,8 @@ granted permissions by specifying ``access``. As with user creation if
``subuser`` is specified without ``secret``, then a secret key will
be automatically generated.
+:caps: users=write
+
Syntax
~~~~~~
@@ -840,6 +853,8 @@ Modify Subuser
Modify an existing subuser
+:caps: users=write
+
Syntax
~~~~~~
@@ -943,6 +958,8 @@ Remove Subuser
Remove an existing subuser
+:caps: users=write
+
Syntax
~~~~~~
@@ -1000,6 +1017,8 @@ type as the key created. Note that when creating a swift key, specifying the opt
``access-key`` will have no effect. Additionally, only one swift key may be held by
each user or subuser.
+:caps: users=write
+
Syntax
~~~~~~
@@ -1116,6 +1135,8 @@ Remove Key
Remove an existing key.
+:caps: users=write
+
Syntax
~~~~~~
@@ -1175,6 +1196,8 @@ without ``bucket`` then all buckets beloning to the user will be returned. If
``bucket`` alone is specified, information for that particular bucket will be
retrieved.
+:caps: buckets=read
+
Syntax
~~~~~~
@@ -1286,6 +1309,8 @@ Check Bucket Index
Check the index of an existing bucket. NOTE: to check multipart object
accounting with ``check-objects``, ``fix`` must be set to True.
+:caps: buckets=write
+
Syntax
~~~~~~
@@ -1340,6 +1365,8 @@ Remove Bucket
Delete an existing bucket.
+:caps: buckets=write
+
Syntax
~~~~~~
@@ -1391,6 +1418,8 @@ Unlink Bucket
Unlink a bucket from a specified user. Primarily useful for changing
bucket ownership.
+:caps: buckets=write
+
Syntax
~~~~~~
@@ -1436,6 +1465,8 @@ Link Bucket
Link a bucket to a specified user, unlinking the bucket from
any previous user.
+:caps: buckets=write
+
Syntax
~~~~~~
@@ -1530,6 +1561,8 @@ Remove Object
Remove an existing object. NOTE: Does not require owner to be non-suspended.
+:caps: buckets=write
+
Syntax
~~~~~~
@@ -1580,6 +1613,8 @@ Get Bucket or Object Policy
Read the policy of an object or bucket.
+:caps: buckets=read
+
Syntax
~~~~~~
@@ -1630,6 +1665,8 @@ Add A User Capability
Add an administrative capability to a specified user.
+:caps: users=write
+
Syntax
~~~~~~
@@ -1705,6 +1742,8 @@ Remove A User Capability
Remove an administrative capability from a specified user.
+:caps: users=write
+
Syntax
~~~~~~
diff --git a/doc/radosgw/config-ref.rst b/doc/radosgw/config-ref.rst
index 790fafdc854..ea650f0683a 100644
--- a/doc/radosgw/config-ref.rst
+++ b/doc/radosgw/config-ref.rst
@@ -3,9 +3,9 @@
======================================
The following settings may be added to the Ceph configuration file (i.e., usually
-``ceph.conf``) under the ``[client.radosgw.gateway]`` section. The settings may
-contain default values. If you do not specify each setting in the Ceph
-configuration file, the default value will be set automatically.
+``ceph.conf``) under the ``[client.radosgw.{instance-name}]`` section. The
+settings may contain default values. If you do not specify each setting in the
+Ceph configuration file, the default value will be set automatically.
``rgw data``
@@ -91,70 +91,222 @@ configuration file, the default value will be set automatically.
:Default: None
+``rgw print continue``
-``rgw swift url``
+:Description: Enable ``100-continue`` if it is operational.
+:Type: Boolean
+:Default: ``true``
+
+
+``rgw remote addr param``
+
+:Description: The remote address parameter. For example, the HTTP field
+ containing the remote address, or the ``X-Forwarded-For``
+ address if a reverse proxy is operational.
-:Description: The URL for the Ceph Object Gateway Swift API.
:Type: String
-:Default: None
-
+:Default: ``REMOTE_ADDR``
-``rgw swift url prefix``
-:Description: The URL prefix for the Swift API.
-:Default: ``swift``
-:Example: http://fqdn.com/swift
+``rgw op thread timeout``
+
+:Description: The timeout in seconds for open threads.
+:Type: Integer
+:Default: 600
-``rgw swift auth url``
+``rgw op thread suicide timeout``
+
+:Description: The time ``timeout`` in seconds before a Ceph Object Gateway
+ process dies. Disabled if set to ``0``.
-:Description: Default URL for verifying v1 auth tokens (if not using internal
- Swift auth).
+:Type: Integer
+:Default: ``0``
-:Type: String
-:Default: None
+``rgw thread pool size``
+
+:Description: The size of the thread pool.
+:Type: Integer
+:Default: 100 threads.
-``rgw swifth auth entry``
-:Description: The entry point for a Swift auth URL.
-:Type: String
-:Default: ``auth``
+``rgw num control oids``
+:Description: The number of notification objects used for cache synchronization
+ between different ``rgw`` instances.
-``rgw keystone url``
+:Type: Integer
+:Default: ``8``
+
+
+``rgw init timeout``
+
+:Description: The number of seconds before Ceph Object Gateway gives up on
+ initialization.
+
+:Type: Integer
+:Default: ``30``
+
+
+``rgw mime types file``
+
+:Description: The path and location of the MIME types. Used for Swift
+ auto-detection of object types.
-:Description: The URL for the Keystone server.
:Type: String
-:Default: None
+:Default: ``/etc/mime.types``
-``rgw keystone admin token``
+``rgw gc max objs``
+
+:Description: The maximum number of objects that may be handled by
+ garbage collection in one garbage collection processing cycle.
+
+:Type: Integer
+:Default: ``32``
+
+
+``rgw gc obj min wait``
+
+:Description: The minimum wait time before the object may be removed
+ and handled by garbage collection processing.
+
+:Type: Integer
+:Default: ``2 * 3600``
+
+
+``rgw gc processor max time``
+
+:Description: The maximum time between the beginning of two consecutive garbage
+ collection processing cycles.
+
+:Type: Integer
+:Default: ``3600``
+
+
+``rgw gc processor period``
+
+:Description: The cycle time for garbage collection processing.
+:Type: Integer
+:Default: ``3600``
+
+
+``rgw s3 success create obj status``
+
+:Description: The alternate success status response for ``create-obj``.
+:Type: Integer
+:Default: ``0``
+
+
+``rgw resolve cname``
+
+:Description: Whether ``rgw`` should use the DNS CNAME record of the request
+ hostname field (if hostname is not equal to ``rgw dns name``).
+
+:Type: Boolean
+:Default: ``false``
+
+
+``rgw object stripe size``
+
+:Description: The size of an object stripe for Ceph Object Gateway objects.
+ See `Architecture`_ for details on striping.
+
+:Type: Integer
+:Default: ``4 << 20``
+
+
+``rgw extended http attrs``
+
+:Description: Add a new set of attributes that could be set on an object. These
+ extra attributes can be set through HTTP header fields when
+ putting the objects. If set, these attributes will return as HTTP
+ fields when doing GET/HEAD on the object.
-:Description: The Keystone admin token (shared secret).
:Type: String
:Default: None
+:Example: "content_foo, content_bar"
-``rgw keystone accepted roles``
+``rgw exit timeout secs``
-:Description: The roles requires to serve requests.
-:Type: String
-:Default: ``Member, admin``
+:Description: Number of seconds to wait for a process before exiting
+ unconditionally.
+:Type: Integer
+:Default: ``120``
-``rgw keystone token cache size``
-:Description: The maximum number of entries in each Keystone token cache.
+``rgw get obj window size``
+
+:Description: The window size in bytes for a single object request.
:Type: Integer
-:Default: ``10000``
+:Default: ``16 << 20``
-``rgw keystone revocation interval``
+``rgw get obj max req size``
+
+:Description: The maximum request size of a single get operation sent to the
+ Ceph Storage Cluster.
-:Description: The number of seconds between token revocation checks.
:Type: Integer
-:Default: ``15 * 60``
+:Default: ``4 << 20``
+
+
+``rgw relaxed s3 bucket names``
+
+:Description: Enables relaxed S3 bucket names rules for US region buckets.
+:Type: Boolean
+:Default: ``false``
+
+
+``rgw list buckets max chunk``
+
+:Description: The maximum number of buckets to retrieve in a single operation
+ when listing user buckets.
+
+:Type: Integer
+:Default: ``1000``
+
+
+``rgw num zone opstate shards``
+
+:Description: The maximum number of shards for keeping inter-region copy
+ progress information.
+
+:Type: Integer
+:Default: ``128``
+
+
+``rgw opstate ratelimit sec``
+
+:Description: The minimum time between opstate updates on a single upload.
+ ``0`` disables the ratelimit.
+
+:Type: Integer
+:Default: ``30``
+
+
+``rgw curl wait timeout ms``
+
+:Description: The timeout in milliseconds for certain ``curl`` calls.
+:Type: Integer
+:Default: ``1000``
+
+
+``rgw copy obj progress``
+
+:Description: Enables output of object progress during long copy operations.
+:Type: Boolean
+:Default: ``true``
+
+
+``rgw copy obj progress every bytes``
+
+:Description: The minimum bytes between copy progress output.
+:Type: Integer
+:Default: ``1024 * 1024``
``rgw admin entry``
@@ -162,69 +314,337 @@ configuration file, the default value will be set automatically.
:Description: The entry point for an admin request URL.
:Type: String
:Default: ``admin``
-
-``rgw enforce swift acls``
-:Description: Enforces the Swift Access Control List (ACL) settings.
-:Type: Boolean
-:Default: ``true``
+Regions
+=======
+
+In Ceph v0.67 and beyond, Ceph Object Gateway supports federated deployments and
+a global namespace via the notion of regions. A region defines the geographic
+location of one or more Ceph Object Gateway instances within one or more zones.
+
+
+Configuring regions differs from typical configuration procedures, because not
+all of the settings end up in a Ceph configuration file. In Ceph v0.67 and
+beyond, you can list regions, get a region configuration and set a region
+configuration.
+
+
+List Regions
+------------
+
+A Ceph cluster contains a list of regions. To list the regions, execute::
+
+ sudo radosgw-admin regions list
+
+The ``radosgw-admin`` returns a JSON formatted list of regions.
+
+.. code-block:: javascript
+
+ { "default_info": { "default_region": "default"},
+ "regions": [
+ "default"]}
+
+
+Get a Region Map
+----------------
+
+To list the details of each region, execute::
+
+ sudo radosgw-admin region-map get
-``rgw swift token expiration``
+.. note:: If you receive a ``failed to read region map`` error, run
+ ``sudo radosgw-admin region-map update`` first.
-:Description: The time in seconds for expiring a Swift token.
-:Type: Integer
-:Default: ``24 * 3600``
+Get a Region
+------------
-``rgw print continue``
+To view the configuration of a region, execute::
-:Description: Enable ``100-continue`` if it is operational.
-:Type: Boolean
-:Default: ``true``
+ radosgw-admin region get [--rgw-region=<region>]
+The ``default`` region looks like this:
-``rgw remote addr param``
+.. code-block:: javascript
-:Description: The remote address parameter. For example, the HTTP field
- containing the remote address, or the ``X-Forwarded-For``
- address if a reverse proxy is operational.
+ {"name": "default",
+ "api_name": "",
+ "is_master": "true",
+ "endpoints": [],
+ "master_zone": "",
+ "zones": [
+ {"name": "default",
+ "endpoints": [],
+ "log_meta": "false",
+ "log_data": "false"}
+ ],
+ "placement_targets": [
+ {"name": "default-placement",
+ "tags": [] }],
+ "default_placement": "default-placement"}
+
+Set a Region
+------------
+
+Defining a region consists of creating a JSON object, specifying at least the
+required settings:
+
+#. ``name``: The name of the region. Required.
+
+#. ``api_name``: The API name for the region. Optional.
+
+#. ``is_master``: Determines if the region is the master region. Required.
+ **note:** You can only have one master region.
+
+#. ``endpoints``: A list of all the endpoints in the region. For example,
+ you may use multiple domain names to refer to the same region. Remember to
+ escape the forward slashes (``\/``). You may also specify a
+ port (``fqdn:port``) for each endpoint. Optional.
+
+#. ``master_zone``: The master zone for the region. Optional. Uses the default
+ zone if not specified. **note:** You can only have one master zone per
+ region.
+
+#. ``zones``: A list of all zones within the region. Each zone has a
+ name (required), a list of endpoints (optional), and whether or not the
+ gateway will log metadata and data operations (false by default).
+
+#. ``placement_targets``: A list of placement targets (optional). Each
+ placement target contains a name (required) and a list of tags (optional).
+
+#. ``default_placement``: The set of default placement pools for the object
+ index and object data. Set to ``default-placement`` by default.
+
+To set a region, create a JSON object consisting of the required fields, save
+the object to a file (e.g., ``region.json``); then, execute the following
+command::
+
+
+ sudo radosgw-admin region set --infile region.json
+
+Where ``region.json`` is the JSON file you created.
+
+.. important:: The ``default`` region ``is_master`` setting is ``true`` by
+ default. If you create a new region and want to make it the master region,
+ you must either set the ``default`` region ``is_master`` setting to
+ ``false``, or delete the ``default`` region.
+
+Finally, update the map. ::
+
+ sudo radosgw-admin region-map update
+
+
+Set a Region Map
+----------------
+
+Setting a region map consists of creating a JSON object consisting of one or more
+regions, and setting the ``master_region`` for the cluster. Each region in the
+region map consists of a key/value pair, where the ``key`` setting is equivalent to
+the ``name`` setting for an individual region configuration, and the ``val`` is
+a JSON object consisting of an individual region configuration.
+
+You may only have one region with ``is_master`` equal to ``true``, and it must be
+specified as the ``master_region`` at the end of the region map. The following
+JSON object is an example of a default region map.
+
+
+.. code-block:: javascript
+
+ { "regions": [
+ { "key": "default",
+ "val": { "name": "default",
+ "api_name": "",
+ "is_master": "true",
+ "endpoints": [],
+ "master_zone": "",
+ "zones": [
+ { "name": "default",
+ "endpoints": [],
+ "log_meta": "false",
+ "log_data": "false"}],
+ "placement_targets": [
+ { "name": "default-placement",
+ "tags": []}],
+ "default_placement": "default-placement"
+ }
+ }
+ ],
+ "master_region": "default"
+ }
+
+To set a region map, execute the following::
+
+ sudo radosgw-admin region-map set --infile regionmap.json
+
+Where ``regionmap.json`` is the JSON file you created. Ensure that you have
+zones created for the ones specified in the region map. Finally, update the map.
+::
+
+ sudo radosgw-admin region-map update
+
+
+Zones
+=====
+
+In Ceph v0.67 and beyond, Ceph Object Gateway supports the notion of zones. A
+zone defines a logical group consisting of one or more Ceph Object Gateway
+instances.
+
+Configuring zones differs from typical configuration procedures, because not
+all of the settings end up in a Ceph configuration file. In Ceph v0.67 and
+beyond, you can list zones, get a zone configuration and set a zone
+configuration.
+
+
+List Zones
+----------
+
+To list the zones in a cluster, execute::
+
+ sudo radosgw-admin zone list
+
+
+Get a Zone
+----------
+
+To get the configuration of a zone, execute::
+
+ sudo radosgw-admin zone get [--rgw-zone=<zone>]
+
+The ``default`` zone looks like this:
+
+.. code-block:: javascript
+
+ { "domain_root": ".rgw",
+ "control_pool": ".rgw.control",
+ "gc_pool": ".rgw.gc",
+ "log_pool": ".log",
+ "intent_log_pool": ".intent-log",
+ "usage_log_pool": ".usage",
+ "user_keys_pool": ".users",
+ "user_email_pool": ".users.email",
+ "user_swift_pool": ".users.swift",
+ "user_uid_pool": ".users.uid",
+ "system_key": { "access_key": "", "secret_key": ""},
+ "placement_pools": [
+ { "key": "default-placement",
+ "val": { "index_pool": ".rgw.buckets.index",
+ "data_pool": ".rgw.buckets"}
+ }
+ ]
+ }
+
+
+Set a Zone
+----------
+
+Configuring a zone involves specifying a series of Ceph Object Gateway pools.
+For consistency, we recommend using a pool prefix that is
+the same as the zone name. See `Pools`_ for details of configuring pools.
+
+To set a zone, create a JSON object consisting of the pools, save
+the object to a file (e.g., ``zone.json``); then, execute the following
+command, replacing ``{zone-name}`` with the name of the zone::
+
+ sudo radosgw-admin zone set --rgw-zone={zone-name} --infile zone.json
+Where ``zone.json`` is the JSON file you created.
+
+
+Region/Zone Settings
+====================
+
+You may include the following settings in your Ceph configuration
+file under each ``[client.radosgw.{instance-name}]`` instance.
+
+
+.. versionadded:: v.67
+
+``rgw zone``
+
+:Description: The name of the zone for the gateway instance.
:Type: String
-:Default: ``REMOTE_ADDR``
+:Default: None
-``rgw op thread timeout``
-
-:Description: The timeout in seconds for open threads.
-:Type: Integer
-:Default: 600
-
+.. versionadded:: v.67
-``rgw op thread suicide timeout``
-
-:Description: The time ``timeout`` in seconds before a Ceph Object Gateway
- process dies. Disbled if set to ``0``.
+``rgw region``
-:Type: Integer
-:Default: ``0``
+:Description: The name of the region for the gateway instance.
+:Type: String
+:Default: None
-``rgw thread pool size``
+.. versionadded:: v.67
-:Description: The size of the thread pool.
-:Type: Integer
-:Default: 100 threads.
+``rgw default region info oid``
+:Description: The OID for storing the default region. We do not recommend
+ changing this setting.
+
+:Type: String
+:Default: ``default.region``
-``rgw num control oids``
-:Description: The number of notification objects used for cache synchronization
- between different ``rgw`` instances.
-:Type: Integer
-:Default: ``8``
+Pools
+=====
+
+Ceph zones map to a series of Ceph Storage Cluster pools.
+
+.. topic:: Manually Created Pools vs. Generated Pools
+
+ If you provide write capabilities to the user key for your Ceph Object
+ Gateway, the gateway has the ability to create pools automatically. This
+ is convenient, but the Ceph Object Storage Cluster uses the default
+ values for the number of placement groups (which may not be ideal) or the
+ values you specified in your Ceph configuration file. If you allow the
+ Ceph Object Gateway to create pools automatically, ensure that you have
+ reasonable defaults for the number of placement groups. See
+ `Pool Configuration`_ for details. See `Cluster Pools`_ for details on
+ creating pools.
+
+The default pools for the Ceph Object Gateway's default zone include:
+
+- ``.rgw``
+- ``.rgw.control``
+- ``.rgw.gc``
+- ``.log``
+- ``.intent-log``
+- ``.usage``
+- ``.users``
+- ``.users.email``
+- ``.users.swift``
+- ``.users.uid``
+
+You have significant discretion in determining how you want a zone to access
+pools. You can create pools on a per zone basis, or use the same pools for
+multiple zones. As a best practice, we recommend having a separate set of pools
+for your master zone and your secondary zones in each region. When creating
+pools for a specific zone, consider prepending the region name and zone name to
+the default pool names. For example:
+
+- ``.region1-zone1.rgw``
+- ``.region1-zone1.rgw.control``
+- ``.region1-zone1.rgw.gc``
+- ``.region1-zone1.log``
+- ``.region1-zone1.intent-log``
+- ``.region1-zone1.usage``
+- ``.region1-zone1.users``
+- ``.region1-zone1.users.email``
+- ``.region1-zone1.users.swift``
+- ``.region1-zone1.users.uid``
+
+
+Ceph Object Gateways store data for the bucket index (``index_pool``) and bucket
+data (``data_pool``) in placement pools. These may overlap--i.e., you may use
+the same pool for the index and the data. The index pool for default
+placement is ``.rgw.buckets.index`` and the data pool for default placement
+is ``.rgw.buckets``. See `Zones`_ for details on specifying pools in a zone
+configuration.
.. deprecated:: v.67
@@ -243,11 +663,12 @@ configuration file, the default value will be set automatically.
.. versionadded:: v.67
-``rgw zone``
+``rgw region root pool``
-:Description: The name of the zone for the gateway instance.
+:Description: The pool for storing all region-specific information.
:Type: String
-:Default: None
+:Default: ``.rgw.root``
+
.. versionadded:: v.67
@@ -259,30 +680,56 @@ configuration file, the default value will be set automatically.
:Default: ``.rgw.root``
-.. versionadded:: v.67
+Swift Settings
+==============
-``rgw region``
+``rgw enforce swift acls``
-:Description: The name of the region for the gateway instance.
+:Description: Enforces the Swift Access Control List (ACL) settings.
+:Type: Boolean
+:Default: ``true``
+
+
+``rgw swift token expiration``
+
+:Description: The time in seconds for expiring a Swift token.
+:Type: Integer
+:Default: ``24 * 3600``
+
+
+``rgw swift url``
+
+:Description: The URL for the Ceph Object Gateway Swift API.
:Type: String
:Default: None
+
-.. versionadded:: v.67
+``rgw swift url prefix``
-``rgw region root pool``
+:Description: The URL prefix for the Swift API.
+:Default: ``swift``
+:Example: http://fqdn.com/swift
+
-:Description: The pool for storing all region-specific information.
-:Type: String
-:Default: ``.rgw.root``
+``rgw swift auth url``
+:Description: Default URL for verifying v1 auth tokens (if not using internal
+ Swift auth).
-.. versionadded:: v.67
+:Type: String
+:Default: None
-``rgw default region info oid``
-:Description: The OID for storing the default region.
+``rgw swift auth entry``
+
+:Description: The entry point for a Swift auth URL.
:Type: String
-:Default: ``default.region``
+:Default: ``auth``
+
+
+
+Logging Settings
+================
``rgw log nonexistent bucket``
@@ -401,134 +848,41 @@ configuration file, the default value will be set automatically.
:Default: ``false``
-``rgw init timeout``
-
-:Description: The number of seconds before Ceph Object Gateway gives up on
- initialization.
+``rgw data log window``
+:Description: The data log entries window in seconds.
:Type: Integer
:Default: ``30``
-``rgw mime types file``
-
-:Description: The path and location of the MIME types. Used for Swift
- auto-detection of object types.
-
-:Type: String
-:Default: ``/etc/mime.types``
-
-
-``rgw gc max objs``
-
-:Description: The maximum number of objects that may be handled by
- garbage collection in one garbage collection processing cycle.
-
-:Type: Integer
-:Default: ``32``
-
-
-``rgw gc obj min wait``
-
-:Description: The minimum wait time before the object may be removed
- and handled by garbage collection processing.
-
-:Type: Integer
-:Default: ``2 * 3600``
-
-
-``rgw gc processor max time``
-
-:Description: The maximum time between the beginning of two consecutive garbage
- collection processing cycles.
-
-:Type: Integer
-:Default: ``3600``
-
-
-``rgw gc processor period``
-
-:Description: The cycle time for garbage collection processing.
-:Type: Integer
-:Default: ``3600``
-
-
-``rgw s3 success create obj status``
+``rgw data log changes size``
-:Description: The alternate success status response for ``create-obj``.
+:Description: The number of in-memory entries to hold for the data changes log.
:Type: Integer
-:Default: ``0``
-
-
-``rgw resolve cname``
-
-:Description: Whether ``rgw`` should use DNS CNAME record of the request
- hostname field (if hostname is not equal to ``rgw dns name``).
-
-:Type: Boolean
-:Default: ``false``
+:Default: ``1000``
-``rgw object stripe size``
+``rgw data log num shards``
-:Description: The size of an object stripe for Ceph Object Gateway objects.
- See `Architecture`_ for details on striping.
+:Description: The number of shards (objects) on which to keep the
+ data changes log.
:Type: Integer
-:Default: ``4 << 20``
-
+:Default: ``128``
-``rgw extended http attrs``
-:Description: Add new set of attributes that could be set on an object. These
- extra attributes can be set through HTTP header fields when
- putting the objects. If set, these attributes will return as HTTP
- fields when doing GET/HEAD on the object.
+``rgw data log obj prefix``
+:Description: The object name prefix for the data log.
:Type: String
-:Default: None
-:Example: "content_foo, content_bar"
-
-
-``rgw exit timeout secs``
-
-:Description: Number of seconds to wait for a process before exiting
- unconditionally.
-
-:Type: Integer
-:Default: ``120``
-
-
-``rgw get obj window size``
-
-:Description: The window size in bytes for a single object request.
-:Type: Integer
-:Default: ``16 << 20``
-
-
-``rgw get obj max req size``
-
-:Description: The maximum request size of a single get operation sent to the
- Ceph Storage Cluster.
-
-:Type: Integer
-:Default: ``4 << 20``
-
-
-``rgw relaxed s3 bucket names``
-
-:Description: Enables relaxed S3 bucket names rules for US region buckets.
-:Type: Boolean
-:Default: ``false``
-
+:Default: ``data_log``
-``rgw list buckets max chunk``
-:Description: The maximum number of buckets to retrieve in a single operation
- when listing user buckets.
+``rgw replica log obj prefix``
-:Type: Integer
-:Default: ``1000``
+:Description: The object name prefix for the replica log.
+:Type: String
+:Default: ``replica_log``
``rgw md log max shards``
@@ -538,80 +892,47 @@ configuration file, the default value will be set automatically.
:Default: ``64``
-``rgw num zone opstate shards``
-
-:Description: The maximum number of shards for keeping inter-region copy
- progress information.
-
-:Type: Integer
-:Default: ``128``
+Keystone Settings
+=================
-``rgw opstate ratelimit sec``
-:Description: The minimum time between opstate updates on a single upload.
- ``0`` disables the ratelimit.
-
-:Type: Integer
-:Default: ``30``
-
-
-``rgw curl wait timeout ms``
-
-:Description: The timeout in milliseconds for certain ``curl`` calls.
-:Type: Integer
-:Default: ``1000``
-
-
-``rgw copy obj progress``
+``rgw keystone url``
-:Description: Enables output of object progress during long copy operations.
-:Type: Boolean
-:Default: ``true``
+:Description: The URL for the Keystone server.
+:Type: String
+:Default: None
-``rgw copy obj progress every bytes``
+``rgw keystone admin token``
-:Description: The minimum bytes between copy progress output.
-:Type: Integer
-:Default: ``1024 * 1024``
+:Description: The Keystone admin token (shared secret).
+:Type: String
+:Default: None
-``rgw data log window``
+``rgw keystone accepted roles``
-:Description: The data log entries window in seconds.
-:Type: Integer
-:Default: ``30``
+:Description: The roles required to serve requests.
+:Type: String
+:Default: ``Member, admin``
-``rgw data log changes size``
+``rgw keystone token cache size``
-:Description: The number of in-memory entries to hold for the data changes log.
+:Description: The maximum number of entries in each Keystone token cache.
:Type: Integer
-:Default: ``1000``
-
+:Default: ``10000``
-``rgw data log num shards``
-:Description: The number of shards (objects) on which to keep the
- data changes log.
+``rgw keystone revocation interval``
+:Description: The number of seconds between token revocation checks.
:Type: Integer
-:Default: ``128``
-
-
-``rgw data log obj prefix``
-
-:Description: The object name prefix for the data log.
-:Type: String
-:Default: ``data_log``
-
-
-``rgw replica log obj prefix``
+:Default: ``15 * 60``
-:Description: The object name prefix for the replica log.
-:Type: String
-:Default: ``replica log``
-.. _Architecture: ../../architecture#data-striping \ No newline at end of file
+.. _Architecture: ../../architecture#data-striping
+.. _Pool Configuration: ../../rados/configuration/pool-pg-config-ref/
+.. _Cluster Pools: ../../rados/operations/pools \ No newline at end of file
diff --git a/doc/radosgw/config.rst b/doc/radosgw/config.rst
index 7dd9f9a93dd..caa3dac15e1 100644
--- a/doc/radosgw/config.rst
+++ b/doc/radosgw/config.rst
@@ -333,7 +333,7 @@ subuser and a Swift access key.
::
- sudo radosgw-admin key create --subuser=johndoe:swift --key-type=swift
+ sudo radosgw-admin key create --subuser=johndoe:swift --key-type=swift --gen-secret
.. code-block:: javascript
@@ -387,6 +387,7 @@ The following configuration options are available for Keystone integration::
rgw keystone accepted roles = {accepted user roles}
rgw keystone token cache size = {number of tokens to cache}
rgw keystone revocation interval = {number of seconds before checking revoked tickets}
+ rgw s3 auth use keystone = true
nss db path = {path to nss db}
A Ceph Object Gateway user is mapped into a Keystone ``tenant``. A Keystone user
@@ -449,4 +450,4 @@ on client the machine(s).
.. _Pool Configuration: ../../rados/configuration/pool-pg-config-ref/
.. _Pools: ../../rados/operations/pools
.. _Cephx Guide: ../../rados/operations/authentication/#cephx-guide
-.. _Operating a Cluster: ../../rados/rados/operations/operating \ No newline at end of file
+.. _Operating a Cluster: ../../rados/rados/operations/operating
diff --git a/doc/radosgw/manual-install.rst b/doc/radosgw/manual-install.rst
index aad4fec08ce..6b9b7e59d1f 100644
--- a/doc/radosgw/manual-install.rst
+++ b/doc/radosgw/manual-install.rst
@@ -2,15 +2,14 @@
Install Apache, FastCGI and Gateway
=====================================
-.. note:: If you deploy Ceph with Chef cookbooks, you may skip this section.
-
Install Packages
-----------------
+================
To install Ceph Object Gateway, you must install Apache and FastCGI first. ::
sudo apt-get update && sudo apt-get install apache2 libapache2-mod-fastcgi
+
100-Continue Support
--------------------
@@ -23,8 +22,10 @@ FastCGI packages modified for Ceph here:
- `Apache Oneiric`_
- `Apache Precise`_
+- `Apache Quantal for ARM (Calxeda)`_
- `FastCGI Oneric`_
- `FastCGI Precise`_
+- `FastCGI Quantal for ARM (Calxeda)`_
You may also clone Ceph's Apache and FastCGI git repositories::
@@ -33,8 +34,10 @@ You may also clone Ceph's Apache and FastCGI git repositories::
.. _Apache Oneiric: http://gitbuilder.ceph.com/apache2-deb-oneiric-x86_64-basic/
.. _Apache Precise: http://gitbuilder.ceph.com/apache2-deb-precise-x86_64-basic/
+.. _Apache Quantal for ARM (Calxeda): http://gitbuilder.ceph.com/apache2-deb-quantal-arm7l-basic/
.. _FastCGI Oneric: http://gitbuilder.ceph.com/libapache-mod-fastcgi-deb-oneiric-x86_64-basic/
.. _FastCGI Precise: http://gitbuilder.ceph.com/libapache-mod-fastcgi-deb-precise-x86_64-basic/
+.. _FastCGI Quantal for ARM (Calxeda): http://gitbuilder.ceph.com/libapache-mod-fastcgi-deb-quantal-arm7l-basic/
.. _RFC 2616, Section 8: http://www.w3.org/Protocols/rfc2616/rfc2616-sec8.html
.. important:: If you do NOT use a modified fastcgi as described above,
@@ -43,17 +46,19 @@ You may also clone Ceph's Apache and FastCGI git repositories::
rgw print continue = false
+
Apache Configuration
---------------------
+====================
Enable the URL rewrite modules for Apache and FastCGI. For example::
sudo a2enmod rewrite
sudo a2enmod fastcgi
-By default, the ``/etc/apache2/httpd.conf`` file is blank. Add a line for the
-``ServerName`` and provide the fully qualified domain name of the host where
-you will install RADOS GW. For example::
+By default, the ``/etc/apache2/httpd.conf`` or ``/etc/apache2/apache2.conf``
+file is blank. Add a line for the ``ServerName`` and provide the fully
+qualified domain name of the host where you will install the Ceph Object
+Gateway. For example::
ServerName {fqdn}
@@ -61,9 +66,9 @@ Restart Apache so that the foregoing changes take effect. ::
sudo service apache2 restart
-Then, install Ceph Object Gateway. For example::
+Then, install Ceph Object Gateway and its sync agent. For example::
- sudo apt-get install radosgw
+ sudo apt-get install radosgw radosgw-agent
Enable SSL
@@ -79,6 +84,49 @@ Once you enable SSL, you should generate an SSL certificate. ::
sudo mkdir /etc/apache2/ssl
sudo openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout /etc/apache2/ssl/apache.key -out /etc/apache2/ssl/apache.crt
+
+.. note:: The foregoing example uses self-signed certificates. Some client
+ APIs check for a trusted certificate authority. So you may need to obtain
+ an SSL certificate from a trusted authority to use those client APIs.
+
Then, restart Apache. ::
service apache2 restart
+
+
+Add Wildcard to DNS
+===================
+
+To use Ceph with S3-style subdomains (e.g., ``bucket-name.domain-name.com``),
+you need to add a wildcard to the DNS record of the DNS server you use with the
+``radosgw`` daemon.
+
+.. tip:: The address of the DNS must also be specified in the Ceph
+ configuration file with the ``rgw dns name = {hostname}`` setting.
+
+For ``dnsmasq``, consider adding the following ``address`` setting with a dot
+(.) prepended to the host name::
+
+ address=/.{hostname-or-fqdn}/{host-ip-address}
+ address=/.ceph-node/192.168.0.1
+
+For ``bind``, consider adding a wildcard to the DNS record::
+
+ $TTL 604800
+ @ IN SOA ceph-node. root.ceph-node. (
+ 2 ; Serial
+ 604800 ; Refresh
+ 86400 ; Retry
+ 2419200 ; Expire
+ 604800 ) ; Negative Cache TTL
+ ;
+ @ IN NS ceph-node.
+ @ IN A 192.168.122.113
+ * IN CNAME @
+
+Restart your DNS server and ping your server with a subdomain to
+ensure that your Ceph Object Store ``radosgw`` daemon can process
+the subdomain requests. ::
+
+ ping mybucket.{fqdn}
+ ping mybucket.ceph-node
diff --git a/doc/rbd/libvirt.rst b/doc/rbd/libvirt.rst
index cc8dc9bd189..4813c3258d0 100644
--- a/doc/rbd/libvirt.rst
+++ b/doc/rbd/libvirt.rst
@@ -40,46 +40,11 @@ The most common ``libvirt`` use case involves providing Ceph block devices to
cloud solutions like OpenStack or CloudStack. The cloud solution uses
``libvirt`` to interact with QEMU/KVM, and QEMU/KVM interacts with Ceph block
devices via ``librbd``. See `Block Devices and OpenStack`_ and `Block Devices
-and CloudStack`_ for details.
+and CloudStack`_ for details. See `Installation`_ for installation details.
You can also use Ceph block devices with ``libvirt``, ``virsh`` and the
``libvirt`` API. See `libvirt Virtualization API`_ for details.
-Prerequisites
-=============
-
-- `Install`_ and `configure`_ a Ceph cluster
-- `Install and configure`_ QEMU/KVM
-
-
-Installing ``libvirt`` on Ubuntu 12.04 Precise
-==============================================
-
-``libvirt`` packages are incorporated into the Ubuntu 12.04 precise
-distribution. To install ``libvirt`` on precise, execute the following::
-
- sudo apt-get update && sudo apt-get install libvirt-bin
-
-
-Installing ``libvirt`` on Earlier Versions of Ubuntu
-====================================================
-
-For Ubuntu distributions 11.10 oneiric and earlier, you must build ``libvirt``
-from source. Clone the ``libvirt`` repository, and use `AutoGen`_ to generate
-the build. Then, execute ``make`` and ``make install`` to complete the
-installation. For example::
-
- git clone git://libvirt.org/libvirt.git
- cd libvirt
- ./autogen.sh
- make
- sudo make install
-
-See `libvirt Installation`_ for details.
-
-
-Using Ceph with Virtual Machines
-================================
To create VMs that use Ceph block devices, use the procedures in the following
sections. In the exemplary embodiment, we've used ``libvirt-pool`` for the pool
@@ -89,7 +54,7 @@ when executing commands in the subsequent procedures.
Configuring Ceph
-----------------
+================
To configure Ceph for use with ``libvirt``, perform the following steps:
@@ -132,7 +97,7 @@ To configure Ceph for use with ``libvirt``, perform the following steps:
Preparing the VM Manager
-------------------------
+========================
You may use ``libvirt`` without a VM manager, but you may find it simpler to
create your first domain with ``virt-manager``.
@@ -150,7 +115,7 @@ create your first domain with ``virt-manager``.
Creating a VM
--------------
+=============
To create a VM with ``virt-manager``, perform the following steps:
@@ -182,7 +147,7 @@ To create a VM with ``virt-manager``, perform the following steps:
Configuring the VM
-------------------
+==================
When configuring the VM for use with Ceph, it is important to use ``virsh``
where appropriate. Additionally, ``virsh`` commands often require root
@@ -290,7 +255,7 @@ commands, refer to `Virsh Command Reference`_.
Summary
--------
+=======
Once you have configured the VM for use with Ceph, you can start the VM.
To verify that the VM and Ceph are communicating, you may perform the
@@ -320,13 +285,8 @@ If everything looks okay, you may begin using the Ceph block device
within your VM.
-
-.. _AutoGen: http://www.gnu.org/software/autogen/
-.. _libvirt Installation: http://www.libvirt.org/compiling.html
+.. _Installation: ../../install
.. _libvirt Virtualization API: http://www.libvirt.org
-.. _Install: ../../install
-.. _configure: ../../rados/configuration
-.. _Install and configure: ../qemu-rbd
.. _Block Devices and OpenStack: ../rbd-openstack
.. _Block Devices and CloudStack: ../rbd-cloudstack
.. _Create a pool: ../../rados/operations/pools#create-a-pool
diff --git a/doc/rbd/qemu-rbd.rst b/doc/rbd/qemu-rbd.rst
index 9d366f3ea8d..e0b55dee257 100644
--- a/doc/rbd/qemu-rbd.rst
+++ b/doc/rbd/qemu-rbd.rst
@@ -27,33 +27,12 @@ image each time it spins up a new virtual machine.
Ceph Block Devices can integrate with the QEMU virtual machine. For details on
QEMU, see `QEMU Open Source Processor Emulator`_. For QEMU documentation, see
-`QEMU Manual`_.
+`QEMU Manual`_. For installation details, see `Installation`_.
.. important:: To use Ceph Block Devices with QEMU, you must have access to a
running Ceph cluster.
-Installing QEMU (12.04 Precise and later)
-=========================================
-
-QEMU packages are incorporated into Ubuntu 12.04 Precise Pangolin and later
-versions. To install QEMU, execute the following::
-
- sudo apt-get install qemu
-
-Installing QEMU (11.10 Oneric and earlier)
-==========================================
-
-For Ubuntu distributions 11.10 Oneiric and earlier, you must install
-the 0.15 version of QEMU or later. To build QEMU from source, use the
-following procedure::
-
- cd {your-development-directory}
- git clone git://git.qemu.org/qemu.git
- cd qemu
- ./configure --enable-rbd
- make; make install
-
Creating Images with QEMU
=========================
@@ -199,4 +178,5 @@ QEMU command line settings override the Ceph configuration file settings.
.. _QEMU Open Source Processor Emulator: http://wiki.qemu.org/Main_Page
.. _QEMU Manual: http://wiki.qemu.org/Manual
.. _RBD Cache: ../rbd-config-ref/
-.. _Snapshots: ../rbd-snapshot/ \ No newline at end of file
+.. _Snapshots: ../rbd-snapshot/
+.. _Installation: ../../install \ No newline at end of file
diff --git a/doc/rbd/rbd-openstack.rst b/doc/rbd/rbd-openstack.rst
index 660757639aa..80dd43ce406 100644
--- a/doc/rbd/rbd-openstack.rst
+++ b/doc/rbd/rbd-openstack.rst
@@ -127,7 +127,7 @@ Hosts running ``nova-compute`` do not need the keyring. Instead, they
store the secret key in libvirt. Create a temporary copy of the secret
key on the hosts running ``nova-compute``::
- ssh {your-compute-host} client.volumes.key <`ceph auth get-key client.volumes`
+ ceph auth get-key client.volumes | ssh {your-compute-host} tee client.volumes.key
Then, on the compute hosts, add the secret key to libvirt and remove the
temporary copy of the key::
diff --git a/doc/release-notes.rst b/doc/release-notes.rst
index bc043fd037a..2b566baa0ea 100644
--- a/doc/release-notes.rst
+++ b/doc/release-notes.rst
@@ -2,6 +2,246 @@
Release Notes
===============
+v0.70
+-----
+
+Upgrading
+~~~~~~~~~
+
+* librados::Rados::pool_create_async() and librados::Rados::pool_delete_async()
+ don't drop a reference to the completion object on error; the caller needs to
+ take care of that. This has never really worked correctly and we were leaking
+ an object.
+
+* 'ceph osd crush set <id> <weight> <loc..>' no longer adds the osd to the
+ specified location, as that's a job for 'ceph osd crush add'. It will
+ however continue to work just the same as long as the osd already exists
+ in the crush map.
+
+Notable Changes
+~~~~~~~~~~~~~~~
+
+* mon: a few 'ceph mon add' races fixed (command is now idempotent) (Joao Luis)
+* crush: fix name caching
+* rgw: fix a few minor memory leaks (Yehuda Sadeh)
+* ceph: improve parsing of CEPH_ARGS (Benoit Knecht)
+* mon: avoid rewriting full osdmaps on restart (Joao Luis)
+* crc32c: fix optimized crc32c code (it now detects arch support properly)
+* mon: fix 'ceph osd crush reweight ...' (Joao Luis)
+* osd: revert xattr size limit (fixes large rgw uploads)
+* mds: fix heap profiler commands (Joao Luis)
+* rgw: fix inefficient use of std::list::size() (Yehuda Sadeh)
+
+
+v0.69
+-----
+
+Upgrading
+~~~~~~~~~
+
+* Users of the librados C++ API should replace uses of get_version()
+ with get_version64() as the old method only returns a 32-bit value
+ for a 64-bit field. The existing 32-bit get_version() method is now
+ deprecated.
+
+* The OSDs are now more picky about request payloads matching their
+ declared size. A write operation across N bytes that includes M
+ bytes of data will now be rejected. No known clients do this, but
+ because the server-side behavior has changed it is possible that
+ an application misusing the interface may now get errors.
+
+* The OSD now enforces that class write methods cannot both mutate an
+ object and return data. The rbd.assign_bid method, the lone
+ offender, has been removed. This breaks compatibility with
+ pre-bobtail librbd clients by preventing them from creating new
+ images.
+
+* librados now returns on commit instead of ack for synchronous calls.
+ This is a bit safer in the case where both OSDs and the client crash, and
+ is probably how it should have been acting from the beginning. Users are
+ unlikely to notice but it could result in lower performance in some
+ circumstances. Those who care should switch to using the async interfaces,
+ which let you specify safety semantics precisely.
+
+* The C++ librados AioComplete::get_version() method was incorrectly
+ returning an int (usually 32-bits). To avoid breaking library
+ compatibility, a get_version64() method is added that returns the
+ full-width value. The old method is deprecated and will be removed
+ in a future release. Users of the C++ librados API that make use of
+ the get_version() method should modify their code to avoid getting a
+ value that is truncated from 64 to 32 bits.
+
+
+Notable Changes
+~~~~~~~~~~~~~~~
+
+* build cleanly under clang (Christophe Courtaut)
+* common: migrate SharedPtrRegistry to use boost::shared_ptr<> (Loic Dachary)
+* doc: erasure coding design notes (Loic Dachary)
+* improved intel-optimized crc32c support (~8x faster on my laptop!)
+* librados: get_version64() method for C++ API
+* mds: fix locking deadlock (David Disseldorp)
+* mon, osd: initial CLI for configuring tiering
+* mon: allow cap strings with . to be unquoted
+* mon: continue to discover peer addr info during election phase
+* mon: fix 'osd crush move ...' command for buckets (Joao Luis)
+* mon: warn when mon data stores grow very large (Joao Luis)
+* objecter, librados: redirect requests based on cache tier config
+* osd, librados: add new COPY_FROM rados operation
+* osd, librados: add new COPY_GET rados operations (used by COPY_FROM)
+* osd: add 'osd heartbeat min healthy ratio' configurable (was hard-coded at 33%)
+* osd: add option to disable pg log debug code (which burns CPU)
+* osd: allow cap strings with . to be unquoted
+* osd: fix version value returned by various operations (Greg Farnum)
+* osd: infrastructure to copy objects from other OSDs
+* osd: use fdatasync(2) instead of fsync(2) to improve performance (Sam Just)
+* rgw: fix major CPU utilization bug with internal caching (Yehuda Sadeh, Mark Nelson)
+* rgw: fix ordering of write operations (preventing data loss on crash) (Yehuda Sadeh)
+* rgw: fix ordering of writes for multipart upload (Yehuda Sadeh)
+* rgw: fix various CORS bugs (Yehuda Sadeh)
+* rgw: improve help output (Christophe Courtaut)
+* rgw: validate S3 tokens against keystone (Roald J. van Loon)
+* rgw: wildcard support for keystone roles (Christophe Courtaut)
+* sysvinit radosgw: fix status return code (Danny Al-Gaaf)
+* sysvinit rbdmap: fix error 'service rbdmap stop' (Laurent Barbe)
+
+v0.68
+-----
+
+Upgrading
+~~~~~~~~~
+
+* 'ceph osd crush set <id> <weight> <loc..>' no longer adds the osd to the
+ specified location, as that's a job for 'ceph osd crush add'. It will
+ however continue to work just the same as long as the osd already exists
+ in the crush map.
+
+* The OSD now enforces that class write methods cannot both mutate an
+ object and return data. The rbd.assign_bid method, the lone
+ offender, has been removed. This breaks compatibility with
+ pre-bobtail librbd clients by preventing them from creating new
+ images.
+
+* librados now returns on commit instead of ack for synchronous calls.
+ This is a bit safer in the case where both OSDs and the client crash, and
+ is probably how it should have been acting from the beginning. Users are
+ unlikely to notice but it could result in lower performance in some
+ circumstances. Those who care should switch to using the async interfaces,
+ which let you specify safety semantics precisely.
+
+* The C++ librados AioComplete::get_version() method was incorrectly
+ returning an int (usually 32-bits). To avoid breaking library
+ compatibility, a get_version64() method is added that returns the
+ full-width value. The old method is deprecated and will be removed
+ in a future release. Users of the C++ librados API that make use of
+ the get_version() method should modify their code to avoid getting a
+ value that is truncated from 64 to 32 bits.
+
+
+
+Notable Changes
+~~~~~~~~~~~~~~~
+
+* ceph-fuse: fix problem with readahead vs truncate race (Yan, Zheng)
+* ceph-post-file: new command to easily share logs or other files with ceph devs
+* ceph: parse CEPH_ARGS env variable
+* librados: fix async aio completion wakeup
+* librados: hello_world example (Greg Farnum)
+* librados: sync calls now return on commit (instead of ack) (Greg Farnum)
+* mds: fix mds rejoin with legacy parent backpointer xattrs (Alexandre Oliva)
+* mds: fix rare restart/failure race during fs creation
+* mds: notify clients about deleted files (so they can release from their cache) (Yan, Zheng)
+* mds: several bug fixes with clustered mds (Yan, Zheng)
+* mon: allow logging level of cluster log (/var/log/ceph/ceph.log) to be adjusted
+* mon: do not expose uncommitted state from 'osd crush {add,set} ...' (Joao Luis)
+* mon: fix byte counts (off by factor of 4) (Dan Mick, Joao Luis)
+* mon: fix paxos corner case
+* mon: modify 'auth add' semantics to make a bit more sense (Joao Luis)
+* mon: new 'osd perf' command to dump recent performance information (Samuel Just)
+* mon: new and improved 'ceph -s' or 'ceph status' command (more info, easier to read)
+* monc: fix small memory leak
+* new wireshark patches pulled into the tree (Kevin Jones)
+* objecter: fix possible hang when cluster is unpaused (Josh Durgin)
+* osd: 'osd recover clone overlap limit' option to limit cloning during recovery (Samuel Just)
+* osd: cls_hello OSD class example
+* osd: experimental support for ZFS (zfsonlinux.org) (Yan, Zheng)
+* osd: instrument peering states (David Zafman)
+* osd: properly enforce RD/WR flags for rados classes
+* osd: remove old pg log on upgrade (Samuel Just)
+* rgw: complete in-progress requests before shutting down
+* rgw: fix S3 auth with response-* query string params (Sylvain Munaut, Yehuda Sadeh)
+* sysvinit: add condrestart command (Dan van der Ster)
+
+
+
+v0.67.4 "Dumpling"
+------------------
+
+This point release fixes an important performance issue with radosgw,
+keystone authentication token caching, and CORS. All users
+(especially those of rgw) are encouraged to upgrade.
+
+Notable changes
+~~~~~~~~~~~~~~~
+
+* crush: fix invalidation of cached names
+* crushtool: do not crash on non-unique bucket ids
+* mds: be more careful when decoding LogEvents
+* mds: fix heap check debugging commands
+* mon: avoid rebuilding old full osdmaps
+* mon: fix 'ceph crush move ...'
+* mon: fix 'ceph osd crush reweight ...'
+* mon: fix writeout of full osdmaps during trim
+* mon: limit size of transactions
+* mon: prevent both unmanaged and pool snaps
+* osd: disable xattr size limit (prevents upload of large rgw objects)
+* osd: fix recovery op throttling
+* osd: fix throttling of log messages for very slow requests
+* rgw: drain pending requests before completing write
+* rgw: fix CORS
+* rgw: fix inefficient list::size() usage
+* rgw: fix keystone token expiration
+* rgw: fix minor memory leaks
+* rgw: fix null termination of buffer
+
+For more detailed information, see :download:`the complete changelog <changelog/v0.67.4.txt>`.
+
+
+v0.67.3 "Dumpling"
+------------------
+
+This point release fixes a few important performance regressions with
+the OSD (both with CPU and disk utilization), as well as several other
+important but less common problems. We recommend that all production users
+upgrade.
+
+Notable Changes
+~~~~~~~~~~~~~~~
+
+* ceph-disk: partprobe after creating journal partition
+* ceph-disk: specify fs type when mounting
+* ceph-post-file: new utility to help share logs and other files with ceph developers
+* libcephfs: fix truncate vs readahead race (crash)
+* mds: fix flock/fcntl lock deadlock
+* mds: fix rejoin loop when encountering pre-dumpling backpointers
+* mon: allow name and addr discovery during election stage
+* mon: always refresh after Paxos store_state (fixes recovery corner case)
+* mon: fix off-by-4x bug with osd byte counts
+* osd: add and disable 'pg log keys debug' by default
+* osd: add option to disable throttling
+* osd: avoid leveldb iterators for pg log append and trim
+* osd: fix readdir_r invocations
+* osd: use fdatasync instead of sync
+* radosgw: fix sysvinit script return status
+* rbd: relicense as LGPL2
+* rgw: flush pending data on multipart upload
+* rgw: recheck object name during S3 POST
+* rgw: reorder init/startup
+* rpm: fix debuginfo package build
+
+For more detailed information, see :download:`the complete changelog <changelog/v0.67.3.txt>`.
+
+
v0.67.2 "Dumpling"
------------------
@@ -165,11 +405,11 @@ In addition to the above notes about upgrading from v0.66:
* The 'ceph osd tell ...' and 'ceph mon tell ...' commands are no
longer supported. Any callers should use::
- ceph tell osd.<id or *> ...
- ceph tell mon.<id or name or *> ...
+ ceph tell osd.<id or *> ...
+ ceph tell mon.<id or name or *> ...
The 'ceph mds tell ...' command is still there, but will soon also
- transition to 'ceph tell mds.<id or name or *> ...'
+ transition to 'ceph tell mds.<id or name or \*> ...'
* The 'ceph osd crush add ...' command used to take one of two forms::
@@ -194,7 +434,7 @@ In addition to the above notes about upgrading from v0.66:
renamed to 'mon osd min down {reporters|reports}', and the
documentation has been updated to reflect that these options apply
to the monitors (who process failure reports) and not OSDs. If you
- have adjusted these settings, please update your ``ceph.conf''
+ have adjusted these settings, please update your ``ceph.conf``
accordingly.
@@ -398,11 +638,11 @@ Upgrading
* The 'ceph osd tell ...' and 'ceph mon tell ...' commands are no
longer supported. Any callers should use::
- ceph tell osd.<id or *> ...
- ceph tell mon.<id or name or *> ...
+ ceph tell osd.<id or *> ...
+ ceph tell mon.<id or name or *> ...
The 'ceph mds tell ...' command is still there, but will soon also
- transition to 'ceph tell mds.<id or name or *> ...'
+ transition to 'ceph tell mds.<id or name or \*> ...'
* The 'ceph osd crush add ...' command used to take one of two forms::
@@ -496,7 +736,7 @@ Upgrading
renamed to 'mon osd min down {reporters|reports}', and the
documentation has been updated to reflect that these options apply
to the monitors (who process failure reports) and not OSDs. If you
- have adjusted these settings, please update your ``ceph.conf''
+ have adjusted these settings, please update your ``ceph.conf``
accordingly.
Notable Changes
@@ -1219,6 +1459,53 @@ Notable Changes
* auth: ability to require new cephx signatures on messages (still off by default)
+
+v0.56.7 "bobtail"
+-----------------
+
+This bobtail update fixes a range of radosgw bugs (including an easily
+triggered crash from multi-delete), a possible data corruption issue
+with power failure on XFS, and several OSD problems, including a
+memory "leak" that will affect aged clusters.
+
+Notable changes
+~~~~~~~~~~~~~~~
+
+* ceph-fuse: create finisher flags after fork()
+* debian: fix prerm/postinst hooks; do not restart daemons on upgrade
+* librados: fix async aio completion wakeup (manifests as rbd hang)
+* librados: fix hang when osd becomes full and then not full
+* librados: fix locking for aio completion refcounting
+* librbd python bindings: fix stripe_unit, stripe_count
+* librbd: make image creation default configurable
+* mon: fix validation of mds ids in mon commands
+* osd: avoid excessive disk updates during peering
+* osd: avoid excessive memory usage on scrub
+* osd: avoid heartbeat failure/suicide when scrubbing
+* osd: misc minor bug fixes
+* osd: use fdatasync instead of sync_file_range (may avoid xfs power-loss corruption)
+* rgw: escape prefix correctly when listing objects
+* rgw: fix copy attrs
+* rgw: fix crash on multi delete
+* rgw: fix locking/crash when using ops log socket
+* rgw: fix usage logging
+* rgw: handle deep uri resources
+
+For more detailed information, see :download:`the complete changelog <changelog/v0.56.7.txt>`.
+
+
+v0.56.6 "bobtail"
+-----------------
+
+Notable changes
+~~~~~~~~~~~~~~~
+
+* rgw: fix garbage collection
+* rpm: fix package dependencies
+
+For more detailed information, see :download:`the complete changelog <changelog/v0.56.6.txt>`.
+
+
v0.56.5 "bobtail"
-----------------
diff --git a/doc/install/hardware-recommendations.rst b/doc/start/hardware-recommendations.rst
index 90d29e5e7e2..90d29e5e7e2 100644
--- a/doc/install/hardware-recommendations.rst
+++ b/doc/start/hardware-recommendations.rst
diff --git a/doc/start/index.rst b/doc/start/index.rst
index 2fc03c0a284..6e9277746d9 100644
--- a/doc/start/index.rst
+++ b/doc/start/index.rst
@@ -1,34 +1,6 @@
-=================
- Getting Started
-=================
-
-Whether you want to provide :term:`Ceph Object Storage` and/or :term:`Ceph Block
-Device` services to :term:`Cloud Platforms`, deploy a :term:`Ceph Filesystem` or
-use Ceph for another purpose, all :term:`Ceph Storage Cluster` deployments begin
-with setting up each :term:`Ceph Node`, your network and the Ceph Storage
-Cluster. A Ceph Storage Cluster has three essential daemons:
-
-.. ditaa:: +---------------+ +---------------+ +---------------+
- | OSDs | | Monitor | | MDS |
- +---------------+ +---------------+ +---------------+
-
-- **OSDs**: A :term:`Ceph OSD Daemon` (OSD) stores data, handles data
- replication, recovery, backfilling, rebalancing, and provides some monitoring
- information to Ceph Monitors by checking other Ceph OSD Daemons for a
- heartbeat. A Ceph Storage Cluster requires at least two Ceph OSD Daemons to
- achieve an ``active + clean`` state.
-
-- **Monitors**: A :term:`Ceph Monitor` maintains maps of the cluster state,
- including the monitor map, the OSD map, the Placement Group (PG) map, and the
- CRUSH map. Ceph maintains a history (called an "epoch") of each state change
- in the Ceph Monitors, Ceph OSD Daemons, and PGs.
-
-- **MDSs**: A :term:`Ceph Metadata Server` (MDS) stores metadata on behalf of
- the :term:`Ceph Filesystem` (i.e., Ceph Block Devices and Ceph Object Storage
- do not use MDS). Ceph Metadata Servers make it feasible for POSIX file system
- users to execute basic commands like ``ls``, ``find``, etc. without placing
- an enormous burden on the Ceph Storage Cluster.
-
+======================
+ Installation (Quick)
+======================
.. raw:: html
@@ -37,18 +9,17 @@ Cluster. A Ceph Storage Cluster has three essential daemons:
A :term:`Ceph Client` and a :term:`Ceph Node` may require some basic
configuration work prior to deploying a Ceph Storage Cluster. You can also
-avail yourself of help from the Ceph community by getting involved.
+avail yourself of help by getting involved in the Ceph community.
.. toctree::
- Get Involved <get-involved>
Preflight <quick-start-preflight>
.. raw:: html
</td><td><h3>Step 2: Storage Cluster</h3>
-Once you've completed your preflight checklist, you should be able to begin
+Once you've completed your preflight checklist, you should be able to begin
deploying a Ceph Storage Cluster.
.. toctree::
diff --git a/doc/start/intro.rst b/doc/start/intro.rst
new file mode 100644
index 00000000000..704ff1e8cd5
--- /dev/null
+++ b/doc/start/intro.rst
@@ -0,0 +1,70 @@
+===============
+ Intro to Ceph
+===============
+
+Whether you want to provide :term:`Ceph Object Storage` and/or :term:`Ceph Block
+Device` services to :term:`Cloud Platforms`, deploy a :term:`Ceph Filesystem` or
+use Ceph for another purpose, all :term:`Ceph Storage Cluster` deployments begin
+with setting up each :term:`Ceph Node`, your network and the Ceph Storage
+Cluster. A Ceph Storage Cluster requires at least one Ceph Monitor and at least
+two Ceph OSD Daemons. The Ceph Metadata Server is essential when running Ceph
+Filesystem clients.
+
+.. ditaa:: +---------------+ +---------------+ +---------------+
+ | OSDs | | Monitor | | MDS |
+ +---------------+ +---------------+ +---------------+
+
+- **OSDs**: A :term:`Ceph OSD Daemon` (OSD) stores data, handles data
+ replication, recovery, backfilling, rebalancing, and provides some monitoring
+ information to Ceph Monitors by checking other Ceph OSD Daemons for a
+ heartbeat. A Ceph Storage Cluster requires at least two Ceph OSD Daemons to
+ achieve an ``active + clean`` state when the cluster makes two copies of your
+ data (Ceph makes 2 copies by default, but you can adjust it).
+
+- **Monitors**: A :term:`Ceph Monitor` maintains maps of the cluster state,
+ including the monitor map, the OSD map, the Placement Group (PG) map, and the
+ CRUSH map. Ceph maintains a history (called an "epoch") of each state change
+ in the Ceph Monitors, Ceph OSD Daemons, and PGs.
+
+- **MDSs**: A :term:`Ceph Metadata Server` (MDS) stores metadata on behalf of
+ the :term:`Ceph Filesystem` (i.e., Ceph Block Devices and Ceph Object Storage
+ do not use MDS). Ceph Metadata Servers make it feasible for POSIX file system
+ users to execute basic commands like ``ls``, ``find``, etc. without placing
+ an enormous burden on the Ceph Storage Cluster.
+
+Ceph stores a client's data as objects within storage pools. Using the CRUSH
+algorithm, Ceph calculates which placement group should contain the object,
+and further calculates which Ceph OSD Daemon should store the placement group.
+The CRUSH algorithm enables the Ceph Storage Cluster to scale, rebalance, and
+recover dynamically.
+
+
+.. raw:: html
+
+ <style type="text/css">div.body h3{margin:5px 0px 0px 0px;}</style>
+ <table cellpadding="10"><colgroup><col width="50%"><col width="50%"></colgroup><tbody valign="top"><tr><td><h3>Recommendations</h3>
+
+To begin using Ceph in production, you should review our hardware
+recommendations and operating system recommendations.
+
+.. toctree::
+ :maxdepth: 2
+
+ Hardware Recommendations <hardware-recommendations>
+ OS Recommendations <os-recommendations>
+
+
+.. raw:: html
+
+ </td><td><h3>Get Involved</h3>
+
+ You can avail yourself of help or contribute documentation, source
+ code or bugs by getting involved in the Ceph community.
+
+.. toctree::
+
+ get-involved
+
+.. raw:: html
+
+ </td></tr></tbody></table>
diff --git a/doc/install/os-recommendations.rst b/doc/start/os-recommendations.rst
index 71a4d3a278b..d8b418fe1b0 100644
--- a/doc/install/os-recommendations.rst
+++ b/doc/start/os-recommendations.rst
@@ -36,6 +36,36 @@ platforms. Generally speaking, there is very little dependence on
specific distributions aside from the kernel and system initialization
package (i.e., sysvinit, upstart, systemd).
+
+Dumpling (0.67)
+---------------
+
++----------+----------+--------------------+--------------+---------+------------+
+| Distro | Release | Code Name | Kernel | Notes | Testing |
++==========+==========+====================+==============+=========+============+
+| Ubuntu | 12.04 | Precise Pangolin | linux-3.2.0 | 1, 2 | B, I, C |
++----------+----------+--------------------+--------------+---------+------------+
+| Ubuntu | 12.10 | Quantal Quetzal | linux-3.5.4 | 2 | B |
++----------+----------+--------------------+--------------+---------+------------+
+| Ubuntu | 13.04 | Raring Ringtail | linux-3.8.5 | | B |
++----------+----------+--------------------+--------------+---------+------------+
+| Debian | 6.0 | Squeeze | linux-2.6.32 | 1, 2, 3 | B |
++----------+----------+--------------------+--------------+---------+------------+
+| Debian | 7.0 | Wheezy | linux-3.2.0 | 1, 2 | B |
++----------+----------+--------------------+--------------+---------+------------+
+| CentOS | 6.3 | N/A | linux-2.6.32 | 1, 2 | B, I |
++----------+----------+--------------------+--------------+---------+------------+
+| RHEL | 6.3 | | linux-2.6.32 | 1, 2 | B, I |
++----------+----------+--------------------+--------------+---------+------------+
+| Fedora | 18.0 | Spherical Cow | linux-3.6.0 | | B |
++----------+----------+--------------------+--------------+---------+------------+
+| Fedora | 19.0 | Schrödinger's Cat | linux-3.10.0 | | B |
++----------+----------+--------------------+--------------+---------+------------+
+| OpenSuse | 12.2 | N/A | linux-3.4.0 | 2 | B |
++----------+----------+--------------------+--------------+---------+------------+
+
+
+
Cuttlefish (0.61)
-----------------
@@ -63,6 +93,7 @@ Cuttlefish (0.61)
| OpenSuse | 12.2 | N/A | linux-3.4.0 | 2 | B |
+----------+----------+--------------------+--------------+---------+------------+
+
Bobtail (0.56)
--------------
@@ -90,6 +121,7 @@ Bobtail (0.56)
| OpenSuse | 12.2 | N/A | linux-3.4.0 | 2 | B |
+----------+----------+--------------------+--------------+---------+------------+
+
Argonaut (0.48)
---------------
@@ -126,6 +158,7 @@ Notes
``ceph-osd`` daemons using ``XFS`` or ``ext4`` on the same host will
not perform as well as they could.
+
Testing
-------
diff --git a/doc/start/quick-ceph-deploy.rst b/doc/start/quick-ceph-deploy.rst
index 3c0ca1b0653..1fabd1b182f 100644
--- a/doc/start/quick-ceph-deploy.rst
+++ b/doc/start/quick-ceph-deploy.rst
@@ -3,26 +3,31 @@
=============================
If you haven't completed your `Preflight Checklist`_, do that first. This
-**Quick Start** sets up a two-node demo cluster so you can explore some of the
-:term:`Ceph Storage Cluster` functionality. This **Quick Start** will help you
-install a minimal Ceph Storage Cluster on a server node from your admin node
-using ``ceph-deploy``.
+**Quick Start** sets up a :term:`Ceph Storage Cluster` using ``ceph-deploy``
+on your admin node. Create a three Ceph Node cluster so you can
+explore Ceph functionality.
.. ditaa::
- /----------------\ /----------------\
- | Admin Node |<------->| Server Node |
- | cCCC | | cCCC |
- +----------------+ +----------------+
- | Ceph Commands | | ceph - mon |
- \----------------/ +----------------+
- | ceph - osd |
- +----------------+
- | ceph - mds |
- \----------------/
-
-
-For best results, create a directory on your admin node for maintaining the
-configuration of your cluster. ::
+ /------------------\ /----------------\
+ | Admin Node | | ceph–node1 |
+ | +-------->+ cCCC |
+ | ceph–deploy | | mon.ceph–node1 |
+ \---------+--------/ \----------------/
+ |
+ | /----------------\
+ | | ceph–node2 |
+ +----------------->+ cCCC |
+ | | osd.0 |
+ | \----------------/
+ |
+ | /----------------\
+ | | ceph–node3 |
+ +----------------->| cCCC |
+ | osd.1 |
+ \----------------/
+
+For best results, create a directory on your admin node for maintaining the
+configuration that ``ceph-deploy`` generates for your cluster. ::
mkdir my-cluster
cd my-cluster
@@ -31,228 +36,283 @@ configuration of your cluster. ::
current directory. Ensure you are in this directory when executing
``ceph-deploy``.
+As a first exercise, create a Ceph Storage Cluster with one Ceph Monitor and two
+Ceph OSD Daemons. Once the cluster reaches an ``active + clean`` state, expand it
+by adding a third Ceph OSD Daemon, a Metadata Server and two more Ceph Monitors.
+
+.. important:: Do not call ``ceph-deploy`` with ``sudo`` or run it as ``root``
+ if you are logged in as a different user, because it will not issue ``sudo``
+ commands needed on the remote host.
Create a Cluster
================
-To create your Ceph Storage Cluster, declare its initial monitors, generate a
-filesystem ID (``fsid``) and generate monitor keys by entering the following
-command on a commandline prompt::
+If at any point you run into trouble and you want to start over, execute
+the following::
- ceph-deploy new {mon-server-name}
- ceph-deploy new mon-ceph-node
+ ceph-deploy purgedata {ceph-node} [{ceph-node}]
+ ceph-deploy forgetkeys
-Check the output of ``ceph-deploy`` with ``ls`` and ``cat`` in the current
-directory. You should see a Ceph configuration file, a keyring, and a log file
-for the new cluster. See `ceph-deploy new -h`_ for additional details.
-.. topic:: Single Node Quick Start
+On your admin node, perform the following steps using ``ceph-deploy``.
- Assuming only one node for your Ceph Storage Cluster, you will need to
- modify the default ``osd crush chooseleaf type`` setting (it defaults to
- ``1`` for ``node``) to ``0`` for ``device`` so that it will peer with OSDs
- on the local node. Add the following line to your Ceph configuration file::
-
- osd crush chooseleaf type = 0
+#. Create the cluster. ::
-.. tip:: If you deploy without executing foregoing step on a single node
- cluster, your Ceph Storage Cluster will not achieve an ``active + clean``
- state. To remedy this situation, you must modify your `CRUSH Map`_.
+ ceph-deploy new {ceph-node}
+ ceph-deploy new ceph-node1
-Install Ceph
-============
+ Check the output of ``ceph-deploy`` with ``ls`` and ``cat`` in the current
+ directory. You should see a Ceph configuration file, a keyring, and a log
+ file for the new cluster. See `ceph-deploy new -h`_ for additional details.
-To install Ceph on your server node, open a command line on your admin
-node and type the following::
+#. Install Ceph. ::
- ceph-deploy install {server-node-name}[,{server-node-name}]
- ceph-deploy install mon-ceph-node
+ ceph-deploy install {ceph-node} [{ceph-node} ...]
+ ceph-deploy install ceph-node1 ceph-node2 ceph-node3
-Without additional arguments, ``ceph-deploy`` will install the most recent
-stable Ceph package to the server node. See `ceph-deploy install -h`_ for
-additional details.
-.. tip:: When ``ceph-deploy`` completes installation successfully,
- it should echo ``OK``.
+#. Add a Ceph Monitor. ::
+ ceph-deploy mon create {ceph-node}
+ ceph-deploy mon create ceph-node1
+
+#. Gather keys. ::
-Add a Monitor
-=============
+ ceph-deploy gatherkeys {ceph-node}
+ ceph-deploy gatherkeys ceph-node1
-To run a Ceph cluster, you need at least one Ceph Monitor. When using
-``ceph-deploy``, the tool enforces a single Ceph Monitor per node. Execute the
-following to create a Ceph Monitor::
+ Once you have gathered keys, your local directory should have the following
+ keyrings:
- ceph-deploy mon create {mon-server-name}
- ceph-deploy mon create mon-ceph-node
+ - ``{cluster-name}.client.admin.keyring``
+ - ``{cluster-name}.bootstrap-osd.keyring``
+ - ``{cluster-name}.bootstrap-mds.keyring``
+
-.. tip:: In production environments, we recommend running Ceph Monitors on
- nodes that do not run OSDs.
+#. Add two OSDs. For fast setup, this quick start uses a directory rather
+ than an entire disk per Ceph OSD Daemon. See `ceph-deploy osd`_ for
+ details on using separate disks/partitions for OSDs and journals.
+ Log in to the Ceph Nodes and create a directory for
+ the Ceph OSD Daemon. ::
+
+ ssh ceph-node2
+ sudo mkdir /tmp/osd0
+ exit
+
+ ssh ceph-node3
+ sudo mkdir /tmp/osd1
+ exit
-When you have added a monitor successfully, directories under ``/var/lib/ceph``
-on your server node should have subdirectories ``bootstrap-mds`` and
-``bootstrap-osd`` that contain keyrings. If these directories do not contain
-keyrings, execute ``ceph-deploy mon create`` again on the admin node.
+ Then, from your admin node, use ``ceph-deploy`` to prepare the OSDs. ::
+ ceph-deploy osd prepare {ceph-node}:/path/to/directory
+ ceph-deploy osd prepare ceph-node2:/tmp/osd0 ceph-node3:/tmp/osd1
-Gather Keys
-===========
+ Finally, activate the OSDs. ::
-To deploy additional daemons and provision them with monitor authentication keys
-from your admin node, you must first gather keys from a monitor node. Execute
-the following to gather keys::
+ ceph-deploy osd activate {ceph-node}:/path/to/directory
+ ceph-deploy osd activate ceph-node2:/tmp/osd0 ceph-node3:/tmp/osd1
- ceph-deploy gatherkeys {mon-server-name}
- ceph-deploy gatherkeys mon-ceph-node
+#. Use ``ceph-deploy`` to copy the configuration file and admin key to
+ your admin node and your Ceph Nodes so that you can use the ``ceph``
+ CLI without having to specify the monitor address and
+ ``ceph.client.admin.keyring`` each time you execute a command. ::
+
+ ceph-deploy admin {ceph-node}
+ ceph-deploy admin admin-node ceph-node1 ceph-node2 ceph-node3
-Once you have gathered keys, your local directory should have the following keyrings:
+ **Note:** Since you are using ``ceph-deploy`` to talk to the
+ local host, your host must be reachable by its hostname
+ (e.g., you can modify ``/etc/hosts`` if necessary). Ensure that
+ you have the correct permissions for the ``ceph.client.admin.keyring``.
-- ``{cluster-name}.client.admin.keyring``
-- ``{cluster-name}.bootstrap-osd.keyring``
-- ``{cluster-name}.bootstrap-mds.keyring``
+#. Check your cluster's health. ::
-If you don't have these keyrings, you may not have created a monitor successfully,
-or you may have a problem with your network connection. Ensure that you complete
-this step such that you have the foregoing keyrings before proceeding further.
+ ceph health
-.. tip:: You may repeat this procedure. If it fails, check to see if the
- ``/var/lib/ceph/boostrap-{osd}|{mds}`` directories on the server node
- have keyrings. If they do not have keyrings, try adding the monitor again;
- then, return to this step.
+ Your cluster should return an ``active + clean`` state when it
+ has finished peering.
-Add Ceph OSD Daemons
-====================
+Operating Your Cluster
+======================
-For a cluster's object placement groups to reach an ``active + clean`` state,
-you must have at least two instances of a :term:`Ceph OSD Daemon` running and
-at least two copies of an object (``osd pool default size`` is ``2``
-by default).
+Deploying a Ceph cluster with ``ceph-deploy`` automatically starts the cluster.
+To operate the cluster daemons with Debian/Ubuntu distributions, see
+`Running Ceph with Upstart`_. To operate the cluster daemons with CentOS,
+Red Hat, Fedora, and SLES distributions, see `Running Ceph with sysvinit`_.
-Adding Ceph OSD Daemons is slightly more involved than other ``ceph-deploy``
-commands, because a Ceph OSD Daemon involves both a data store and a journal.
-The ``ceph-deploy`` tool has the ability to invoke ``ceph-disk-prepare`` to
-prepare the disk and activate the Ceph OSD Daemon for you.
+To learn more about peering and cluster health, see `Monitoring a Cluster`_.
+To learn more about Ceph OSD Daemon and placement group health, see
+`Monitoring OSDs and PGs`_.
+
+Once you deploy a Ceph cluster, you can try out some of the administration
+functionality, the ``rados`` object store command line, and then proceed to
+Quick Start guides for Ceph Block Device, Ceph Filesystem, and the Ceph Object
+Gateway.
-Multiple OSDs on the OS Disk (Demo Only)
-----------------------------------------
-For demonstration purposes, you may wish to add multiple OSDs to the OS disk
-(not recommended for production systems). To use Ceph OSDs daemons on the OS
-disk, you must use ``prepare`` and ``activate`` as separate steps. First,
-define a directory for the Ceph OSD daemon(s). ::
-
- mkdir /tmp/osd0
- mkdir /tmp/osd1
-
-Then, use ``prepare`` to prepare the directory(ies) for use with a
-Ceph OSD Daemon. ::
-
- ceph-deploy osd prepare {osd-node-name}:/tmp/osd0
- ceph-deploy osd prepare {osd-node-name}:/tmp/osd1
+Expanding Your Cluster
+======================
-Finally, use ``activate`` to activate the Ceph OSD Daemons. ::
+Once you have a basic cluster up and running, the next step is to expand the
+cluster. Add a Ceph OSD Daemon and a Ceph Metadata Server to ``ceph-node1``.
+Then add a Ceph Monitor to ``ceph-node2`` and ``ceph-node3`` to establish a
+quorum of Ceph Monitors.
- ceph-deploy osd activate {osd-node-name}:/tmp/osd0
- ceph-deploy osd activate {osd-node-name}:/tmp/osd1
+.. ditaa::
+ /------------------\ /----------------\
+ | ceph–deploy | | ceph–node1 |
+ | Admin Node | | cCCC |
+ | +-------->+ mon.ceph–node1 |
+ | | | osd.2 |
+ | | | mds.ceph–node1 |
+ \---------+--------/ \----------------/
+ |
+ | /----------------\
+ | | ceph–node2 |
+ | | cCCC |
+ +----------------->+ |
+ | | osd.0 |
+ | | mon.ceph–node2 |
+ | \----------------/
+ |
+ | /----------------\
+ | | ceph–node3 |
+ | | cCCC |
+ +----------------->+ |
+ | osd.1 |
+ | mon.ceph–node3 |
+ \----------------/
-.. tip:: You need two OSDs to reach an ``active + clean`` state. You can
- add one OSD at a time, but OSDs need to communicate with each other
- for Ceph to run properly. Always use more than one OSD per cluster.
+Adding an OSD
+-------------
+Since you are running a 3-node cluster for demonstration purposes, add the OSD
+to the monitor node. ::
-List Disks
-----------
+ ssh ceph-node1
+ sudo mkdir /tmp/osd2
+ exit
-To list the available disk drives on a prospective :term:`Ceph Node`, execute
-the following::
+Then, from your ``ceph-deploy`` node, prepare the OSD. ::
- ceph-deploy disk list {osd-node-name}
- ceph-deploy disk list ceph-node
+ ceph-deploy osd prepare {ceph-node}:/path/to/directory
+ ceph-deploy osd prepare ceph-node1:/tmp/osd2
+Finally, activate the OSD. ::
-Zap a Disk
-----------
+ ceph-deploy osd activate {ceph-node}:/path/to/directory
+ ceph-deploy osd activate ceph-node1:/tmp/osd2
-To zap a disk (delete its partition table) in preparation for use with Ceph,
-execute the following::
- ceph-deploy disk zap {osd-node-name}:{disk}
- ceph-deploy disk zap ceph-node:sdb ceph-node:sdb2
+Once you have added your new OSD, Ceph will begin rebalancing the cluster by
+migrating placement groups to your new OSD. You can observe this process with
+the ``ceph`` CLI. ::
-.. important:: This will delete all data on the disk.
+ ceph -w
+You should see the placement group states change from ``active+clean`` to active
+with some degraded objects, and finally ``active+clean`` when migration
+completes. (Control-c to exit.)
-Add OSDs on Standalone Disks
-----------------------------
-You can add OSDs using ``prepare`` and ``activate`` in two discrete
-steps. To prepare a disk for use with a Ceph OSD Daemon, execute the
-following::
+Add a Metadata Server
+---------------------
- ceph-deploy osd prepare {osd-node-name}:{osd-disk-name}[:/path/to/journal]
- ceph-deploy osd prepare ceph-node:sdb
+To use CephFS, you need at least one metadata server. Execute the following to
+create a metadata server::
-To activate the Ceph OSD Daemon, execute the following::
+ ceph-deploy mds create {ceph-node}
+ ceph-deploy mds create ceph-node1
- ceph-deploy osd activate {osd-node-name}:{osd-partition-name}
- ceph-deploy osd activate ceph-node:sdb1
-To prepare an OSD disk and activate it in one step, execute the following::
+.. note:: Currently Ceph runs in production with one metadata server only. You
+ may use more, but there is currently no commercial support for a cluster
+ with multiple metadata servers.
- ceph-deploy osd create {osd-node-name}:{osd-disk-name}[:/path/to/journal] [{osd-node-name}:{osd-disk-name}[:/path/to/journal]]
- ceph-deploy osd create ceph-node:sdb:/dev/ssd1 ceph-node:sdc:/dev/ssd2
+Adding Monitors
+---------------
-.. note:: The journal example assumes you will use a partition on a separate
- solid state drive (SSD). If you omit a journal drive or partition,
- ``ceph-deploy`` will use create a separate partition for the journal
- on the same drive. If you have already formatted your disks and created
- partitions, you may also use partition syntax for your OSD disk.
+A Ceph Storage Cluster requires at least one Ceph Monitor to run. For high
+availability, Ceph Storage Clusters typically run multiple Ceph
+Monitors so that the failure of a single Ceph Monitor will not bring down the
+Ceph Storage Cluster. Ceph uses the Paxos algorithm, which requires a majority
+of monitors (i.e., 1 out of 1, 2 out of 3, 3 out of 4, 3 out of 5, 4 out of 6, etc.) to form a quorum.
-You must add a minimum of two Ceph OSD Daemons for the placement groups in
-a cluster to achieve an ``active + clean`` state.
+Add two Ceph Monitors to your cluster. ::
+ ceph-deploy mon create {ceph-node}
+ ceph-deploy mon create ceph-node2 ceph-node3
-Add a MDS
-=========
+Once you have added your new Ceph Monitors, Ceph will begin synchronizing
+the monitors and form a quorum. You can check the quorum status by executing
+the following::
-To use CephFS, you need at least one metadata node. Execute the following to
-create a metadata node::
+ ceph quorum_status
- ceph-deploy mds create {node-name}
- ceph-deploy mds create ceph-node
-.. note:: Currently Ceph runs in production with one metadata node only. You
- may use more, but there is currently no commercial support for a cluster
- with multiple metadata nodes.
+Storing/Retrieving Object Data
+==============================
+To store object data in the Ceph Storage Cluster, a Ceph client must:
-Summary
-=======
+#. Set an object name
+#. Specify a `pool`_
-Deploying a Ceph cluster with ``ceph-deploy`` automatically starts the cluster.
-To operate the cluster daemons, see `Running Ceph with Upstart`_.
+The Ceph Client retrieves the latest cluster map and the CRUSH algorithm
+calculates how to map the object to a `placement group`_, and then calculates
+how to assign the placement group to a Ceph OSD Daemon dynamically. To find the
+object location, all you need is the object name and the pool name. For
+example::
-Once you deploy a Ceph cluster, you can try out some of the administration
-functionality, the object store command line, and then proceed to Quick Start
-guides for RBD, CephFS, and the Ceph Gateway.
+ ceph osd map {poolname} {object-name}
-.. topic:: Other ceph-deploy Commands
+.. topic:: Exercise: Locate an Object
- To view other ``ceph-deploy`` commands, execute:
-
- ``ceph-deploy -h``
-
+ As an exercise, let's create an object. Specify an object name, a path to
+ a test file containing some object data and a pool name using the
+ ``rados put`` command on the command line. For example::
+
+ rados put {object-name} {file-path} --pool=data
+ rados put test-object-1 testfile.txt --pool=data
+
+ To verify that the Ceph Storage Cluster stored the object, execute
+ the following::
+
+ rados -p data ls
+
+ Now, identify the object location::
-See `Ceph Deploy`_ for additional details.
+ ceph osd map {pool-name} {object-name}
+ ceph osd map data test-object-1
+
+ Ceph should output the object's location. For example::
+
+ osdmap e537 pool 'data' (0) object 'test-object-1' -> pg 0.d1743484 (0.4) -> up [1,0] acting [1,0]
+
+ To remove the test object, simply delete it using the ``rados rm``
+ command. For example::
+
+ rados rm test-object-1 --pool=data
+
+As the cluster evolves, the object location may change dynamically. One benefit
+of Ceph's dynamic rebalancing is that Ceph relieves you from having to perform
+the migration manually.
.. _Preflight Checklist: ../quick-start-preflight
.. _Ceph Deploy: ../../rados/deployment
.. _ceph-deploy install -h: ../../rados/deployment/ceph-deploy-install
.. _ceph-deploy new -h: ../../rados/deployment/ceph-deploy-new
+.. _ceph-deploy osd: ../../rados/deployment/ceph-deploy-osd
.. _Running Ceph with Upstart: ../../rados/operations/operating#running-ceph-with-upstart
-.. _CRUSH Map: ../../rados/operations/crush-map \ No newline at end of file
+.. _Running Ceph with sysvinit: ../../rados/operations/operating#running-ceph-with-sysvinit
+.. _CRUSH Map: ../../rados/operations/crush-map
+.. _pool: ../../rados/operations/pools
+.. _placement group: ../../rados/operations/placement-groups
+.. _Monitoring a Cluster: ../../rados/operations/monitoring
+.. _Monitoring OSDs and PGs: ../../rados/operations/monitoring-osd-pg \ No newline at end of file
diff --git a/doc/start/quick-cephfs.rst b/doc/start/quick-cephfs.rst
index 18dadb005ec..5449e5a6fe3 100644
--- a/doc/start/quick-cephfs.rst
+++ b/doc/start/quick-cephfs.rst
@@ -3,7 +3,7 @@
=====================
To use the :term:`Ceph FS` Quick Start guide, you must have executed the
-procedures in the `Ceph Deploy Quick Start`_ guide first. Execute this quick
+procedures in the `Storage Cluster Quick Start`_ guide first. Execute this quick
start on the Admin Host.
Prerequisites
@@ -91,7 +91,7 @@ See `Ceph FS`_ for additional information. Ceph FS is not quite as stable
as the Ceph Block Device and Ceph Object Storage. See `Troubleshooting`_
if you encounter trouble.
-.. _Ceph Deploy Quick Start: ../quick-ceph-deploy
+.. _Storage Cluster Quick Start: ../quick-ceph-deploy
.. _Ceph FS: ../../cephfs/
.. _FAQ: http://wiki.ceph.com/03FAQs/01General_FAQ#How_Can_I_Give_Ceph_a_Try.3F
.. _Troubleshooting: ../../cephfs/troubleshooting \ No newline at end of file
diff --git a/doc/start/quick-rbd.rst b/doc/start/quick-rbd.rst
index a466771502d..9424457f8c2 100644
--- a/doc/start/quick-rbd.rst
+++ b/doc/start/quick-rbd.rst
@@ -2,47 +2,73 @@
Block Device Quick Start
==========================
-To use this guide, you must have executed the procedures in the `Object Store
-Quick Start`_ guide first. Ensure your :term:`Ceph Storage Cluster` is in an
-``active + clean`` state before working with the :term:`Ceph Block Device`.
-Execute this quick start on the admin node.
+To use this guide, you must have executed the procedures in the `Storage
+Cluster Quick Start`_ guide first. Ensure your :term:`Ceph Storage Cluster` is
+in an ``active + clean`` state before working with the :term:`Ceph Block
+Device`.
.. note:: The Ceph Block Device is also known as :term:`RBD` or :term:`RADOS`
Block Device.
-#. Install ``ceph-common``. ::
- sudo apt-get install ceph-common
+.. ditaa::
+ /------------------\ /----------------\
+ | Admin Node | | ceph–client |
+ | +-------->+ cCCC |
+ | ceph–deploy | | ceph |
+ \------------------/ \----------------/
-#. Create a block device image. ::
- rbd create foo --size 4096 [-m {mon-IP}] [-k /path/to/ceph.client.admin.keyring]
+You may use a virtual machine for your ``ceph-client`` node, but do not
+execute the following procedures on the same physical node as your Ceph
+Storage Cluster nodes (unless you use a VM). See `FAQ`_ for details.
-#. Load the ``rbd`` client module. ::
+
+Install Ceph
+============
+
+#. On the admin node, use ``ceph-deploy`` to install Ceph on your
+ ``ceph-client`` node. ::
+
+ ceph-deploy install ceph-client
+
+#. On the admin node, use ``ceph-deploy`` to copy the Ceph configuration file
+ and the ``ceph.client.admin.keyring`` to the ``ceph-client``. ::
+
+ ceph-deploy admin ceph-client
+
+
+Configure a Block Device
+========================
+
+#. On the ``ceph-client`` node, create a block device image. ::
+
+ rbd create foo --size 4096 [-m {mon-IP}] [-k /path/to/ceph.client.admin.keyring]
+
+#. On the ``ceph-client`` node, load the ``rbd`` client module. ::
sudo modprobe rbd
-#. Map the image to a block device. ::
+#. On the ``ceph-client`` node, map the image to a block device. ::
sudo rbd map foo --pool rbd --name client.admin [-m {mon-IP}] [-k /path/to/ceph.client.admin.keyring]
-#. Use the block device. In the following example, create a file system. ::
+#. Use the block device by creating a file system on the ``ceph-client``
+ node. ::
sudo mkfs.ext4 -m0 /dev/rbd/rbd/foo
This may take a few moments.
-#. Mount the file system. ::
+#. Mount the file system on the ``ceph-client`` node. ::
sudo mkdir /mnt/ceph-block-device
sudo mount /dev/rbd/rbd/foo /mnt/ceph-block-device
cd /mnt/ceph-block-device
-.. note:: Mount the block device on the client machine,
- not the server machine. See `FAQ`_ for details.
See `block devices`_ for additional details.
-.. _Object Store Quick Start: ../quick-ceph-deploy
+.. _Storage Cluster Quick Start: ../quick-ceph-deploy
.. _block devices: ../../rbd/rbd
.. _FAQ: http://wiki.ceph.com/03FAQs/01General_FAQ#How_Can_I_Give_Ceph_a_Try.3F
diff --git a/doc/start/quick-rgw.rst b/doc/start/quick-rgw.rst
index 76d4e2630a3..40cf7d4f4dc 100644
--- a/doc/start/quick-rgw.rst
+++ b/doc/start/quick-rgw.rst
@@ -2,7 +2,7 @@
Object Storage Quick Start
============================
-To use this guide, you must have executed the procedures in the `Ceph Deploy
+To use this guide, you must have executed the procedures in the `Storage Cluster
Quick Start`_ guide first. Ensure your :term:`Ceph Storage Cluster` is in an
``active + clean`` state before working with the :term:`Ceph Object Storage`.
@@ -306,7 +306,7 @@ Next, create a subuser for the Swift-compatible interface. ::
::
- sudo radosgw-admin key create --subuser=johndoe:swift --key-type=swift
+ sudo radosgw-admin key create --subuser=johndoe:swift --key-type=swift --gen-secret
.. code-block:: javascript
@@ -344,8 +344,8 @@ tutorials. See the `S3-compatible`_ and `Swift-compatible`_ APIs for details.
.. _Create rgw.conf: ../../radosgw/config/index.html#create-rgw-conf
-.. _Ceph Deploy Quick Start: ../quick-ceph-deploy
+.. _Storage Cluster Quick Start: ../quick-ceph-deploy
.. _Ceph Object Storage Manual Install: ../../radosgw/manual-install
.. _RGW Configuration: ../../radosgw/config
.. _S3-compatible: ../../radosgw/s3
-.. _Swift-compatible: ../../radosgw/swift \ No newline at end of file
+.. _Swift-compatible: ../../radosgw/swift
diff --git a/doc/start/quick-start-preflight.rst b/doc/start/quick-start-preflight.rst
index 58068f1df22..77a54795f19 100644
--- a/doc/start/quick-start-preflight.rst
+++ b/doc/start/quick-start-preflight.rst
@@ -4,74 +4,57 @@
.. versionadded:: 0.60
-Thank you for trying Ceph! Petabyte-scale data clusters are quite an
-undertaking. Before delving deeper into Ceph, we recommend setting up a two-node
-demo cluster to explore some of the functionality. This **Preflight Checklist**
-will help you prepare an admin node and a server node for use with
-``ceph-deploy``.
-
-.. ditaa::
- /----------------\ /----------------\
- | Admin Node |<------->| Server Node |
- | cCCC | | cCCC |
- \----------------/ \----------------/
-
-
-Before you can deploy Ceph using ``ceph-deploy``, you need to ensure that you
-have a few things set up first on your admin node and on nodes running Ceph
-daemons.
-
-
-Install an Operating System
-===========================
-
-Install a recent release of Debian or Ubuntu (e.g., 12.04, 12.10, 13.04) on your
-nodes. For additional details on operating systems or to use other operating
-systems other than Debian or Ubuntu, see `OS Recommendations`_.
-
-
-Install an SSH Server
-=====================
-
-The ``ceph-deploy`` utility requires ``ssh``, so your server node(s) require an
-SSH server. ::
-
- sudo apt-get install openssh-server
+Thank you for trying Ceph! We recommend setting up a ``ceph-deploy`` admin node
+and a 3-node :term:`Ceph Storage Cluster` to explore the basics of Ceph. This
+**Preflight Checklist** will help you prepare a ``ceph-deploy`` admin node and
+three Ceph Nodes (or virtual machines) that will host your Ceph Storage Cluster.
-Create a User
-=============
-
-Create a user on nodes running Ceph daemons.
-
-.. tip:: We recommend a username that brute force attackers won't
- guess easily (e.g., something other than ``root``, ``ceph``, etc).
-
-::
+.. ditaa::
+ /------------------\ /----------------\
+ | Admin Node | | ceph–node1 |
+ | +-------->+ |
+ | ceph–deploy | | cCCC |
+ \---------+--------/ \----------------/
+ |
+ | /----------------\
+ | | ceph–node2 |
+ +----------------->+ |
+ | | cCCC |
+ | \----------------/
+ |
+ | /----------------\
+ | | ceph–node3 |
+ +----------------->| |
+ | cCCC |
+ \----------------/
+
+
+Ceph Node Setup
+===============
+
+Perform the following steps:
+
+#. Create a user on each Ceph Node. ::
ssh user@ceph-server
sudo useradd -d /home/ceph -m ceph
sudo passwd ceph
-
-``ceph-deploy`` installs packages onto your nodes. This means that
-the user you create requires passwordless ``sudo`` privileges.
-
-.. note:: We **DO NOT** recommend enabling the ``root`` password
- for security reasons.
-
-To provide full privileges to the user, add the following to
-``/etc/sudoers.d/ceph``. ::
+#. Add ``root`` privileges for the user on each Ceph Node. ::
echo "ceph ALL = (root) NOPASSWD:ALL" | sudo tee /etc/sudoers.d/ceph
sudo chmod 0440 /etc/sudoers.d/ceph
-Configure SSH
-=============
+#. Install an SSH server (if necessary)::
-Configure your admin machine with password-less SSH access to each node
-running Ceph daemons (leave the passphrase empty). ::
+ sudo apt-get install openssh-server
+ sudo yum install openssh-server
+
+
+#. Configure your ``ceph-deploy`` admin node with password-less SSH access to
+ each Ceph Node. Leave the passphrase empty::
ssh-keygen
Generating public/private key pair.
@@ -81,74 +64,95 @@ running Ceph daemons (leave the passphrase empty). ::
Your identification has been saved in /ceph-client/.ssh/id_rsa.
Your public key has been saved in /ceph-client/.ssh/id_rsa.pub.
-Copy the key to each node running Ceph daemons::
+#. Copy the key to each Ceph Node. ::
ssh-copy-id ceph@ceph-server
-Modify your ~/.ssh/config file of your admin node so that it defaults
-to logging in as the user you created when no username is specified. ::
+
+#. Modify the ``~/.ssh/config`` file of your ``ceph-deploy`` admin node so that
+ it logs in to Ceph Nodes as the user you created (e.g., ``ceph``). ::
Host ceph-server
- Hostname ceph-server.fqdn-or-ip-address.com
- User ceph
+ Hostname ceph-server.fqdn-or-ip-address.com
+ User ceph
+
+#. Ensure connectivity using ``ping`` with hostnames (i.e., not IP addresses).
+ Address hostname resolution issues and firewall issues as necessary.
-Install ceph-deploy
-===================
-To install ``ceph-deploy``, execute the following::
+Ceph Deploy Setup
+=================
+
+Add Ceph repositories to the ``ceph-deploy`` admin node. Then, install
+``ceph-deploy``.
+
+.. important:: Do not call ``ceph-deploy`` with ``sudo`` or run it as ``root``
+ if you are logged in as a different user, because it will not issue ``sudo``
+ commands needed on the remote host.
+
+
+Advanced Package Tool (APT)
+---------------------------
+
+For Debian and Ubuntu distributions, perform the following steps:
+
+#. Add the release key::
wget -q -O- 'https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc' | sudo apt-key add -
echo deb http://ceph.com/debian-dumpling/ $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph.list
sudo apt-get update
sudo apt-get install ceph-deploy
+#. Add the Ceph packages to your repository. Replace ``{ceph-stable-release}``
+ with a stable Ceph release (e.g., ``cuttlefish``, ``dumpling``, etc.).
+ For example::
+
+ echo deb http://ceph.com/debian-{ceph-stable-release}/ $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph.list
-Ensure Connectivity
-===================
+#. Update your repository and install ``ceph-deploy``::
-Ensure that your admin node has connectivity to the network and to your Server
-node (e.g., ensure ``iptables``, ``ufw`` or other tools that may prevent
-connections, traffic forwarding, etc. to allow what you need).
+ sudo apt-get update && sudo apt-get install ceph-deploy
-.. tip:: The ``ceph-deploy`` tool is new and you may encounter some issues
- without effective error messages.
-Once you have completed this pre-flight checklist, you are ready to begin using
-``ceph-deploy``.
+Red Hat Package Manager (RPM)
+-----------------------------
+For Red Hat (rhel6), CentOS (el6), Fedora 17-19 (f17-f19), OpenSUSE 12
+(opensuse12), and SLES (sles11), perform the following steps:
-Hostname Resolution
-===================
+#. Add the package to your repository. Open a text editor and create a
+ Yellowdog Updater, Modified (YUM) entry. Use the file path
+ ``/etc/yum.repos.d/ceph.repo``. For example::
-Ensure that your admin node can resolve the server node's hostname. ::
+ sudo vim /etc/yum.repos.d/ceph.repo
- ping {server-node}
+ Paste the following example code. Replace ``{ceph-stable-release}`` with
+ the recent stable release of Ceph (e.g., ``dumpling``). Replace ``{distro}``
+ with your Linux distribution (e.g., ``el6`` for CentOS 6, ``rhel6`` for
+ Red Hat 6, ``fc18`` or ``fc19`` for Fedora 18 or Fedora 19, and ``sles11``
+ for SLES 11). Finally, save the contents to the
+ ``/etc/yum.repos.d/ceph.repo`` file. ::
-If you execute ``ceph-deploy`` against the localhost, ``ceph-deploy``
-must be able to resolve its IP address. Consider adding the IP address
-to your ``/etc/hosts`` file such that it resolves to the hostname. ::
+ [ceph-noarch]
+ name=Ceph noarch packages
+ baseurl=http://ceph.com/rpm-{ceph-stable-release}/{distro}/noarch
+ enabled=1
+ gpgcheck=1
+ type=rpm-md
+ gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc
- hostname
- host -4 {hostname}
- sudo vim /etc/hosts
- {ip-address} {hostname}
+#. Update your repository and install ``ceph-deploy``::
- ceph-deploy {command} {hostname}
+ sudo yum update && sudo yum install ceph-deploy
-.. tip:: The ``ceph-deploy`` tool will not resolve to ``localhost``. Use
- the hostname.
Summary
=======
-Once you have passwordless ``ssh`` connectivity, passwordless ``sudo``,
-installed ``ceph-deploy``, and you have ensured appropriate connectivity,
-proceed to the `Storage Cluster Quick Start`_.
-
-.. tip:: The ``ceph-deploy`` utility can install Ceph packages on remote
- machines from the admin node!
+This completes the Quick Start Preflight. Proceed to the `Storage Cluster
+Quick Start`_.
.. _Storage Cluster Quick Start: ../quick-ceph-deploy
.. _OS Recommendations: ../../install/os-recommendations
diff --git a/fusetrace/fusetrace_ll.cc b/fusetrace/fusetrace_ll.cc
index eb7100a867f..7f2b8438f1f 100644
--- a/fusetrace/fusetrace_ll.cc
+++ b/fusetrace/fusetrace_ll.cc
@@ -11,7 +11,7 @@
gcc -Wall `pkg-config fuse --cflags --libs` -lulockmgr fusexmp_fh.c -o fusexmp_fh
*/
-#define FUSE_USE_VERSION 26
+#define FUSE_USE_VERSION 30
#ifdef HAVE_CONFIG_H
#include <config.h>
diff --git a/m4/ax_check_compile_flag.m4 b/m4/ax_check_compile_flag.m4
new file mode 100644
index 00000000000..c3a8d695a1b
--- /dev/null
+++ b/m4/ax_check_compile_flag.m4
@@ -0,0 +1,72 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS])
+#
+# DESCRIPTION
+#
+# Check whether the given FLAG works with the current language's compiler
+# or gives an error. (Warnings, however, are ignored)
+#
+# ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on
+# success/failure.
+#
+# If EXTRA-FLAGS is defined, it is added to the current language's default
+# flags (e.g. CFLAGS) when the check is done. The check is thus made with
+# the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to
+# force the compiler to issue an error when a bad flag is given.
+#
+# NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this
+# macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG.
+#
+# LICENSE
+#
+# Copyright (c) 2008 Guido U. Draheim <guidod@gmx.de>
+# Copyright (c) 2011 Maarten Bosmans <mkbosmans@gmail.com>
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, either version 3 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# As a special exception, the respective Autoconf Macro's copyright owner
+# gives unlimited permission to copy, distribute and modify the configure
+# scripts that are the output of Autoconf when processing the Macro. You
+# need not follow the terms of the GNU General Public License when using
+# or distributing such scripts, even though portions of the text of the
+# Macro appear in them. The GNU General Public License (GPL) does govern
+# all other use of the material that constitutes the Autoconf Macro.
+#
+# This special exception to the GPL applies to versions of the Autoconf
+# Macro released by the Autoconf Archive. When you make and distribute a
+# modified version of the Autoconf Macro, you may extend this special
+# exception to the GPL to apply to your modified version as well.
+
+#serial 2
+
+AC_DEFUN([AX_CHECK_COMPILE_FLAG],
+[AC_PREREQ(2.59)dnl for _AC_LANG_PREFIX
+AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl
+AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [
+ ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS
+ _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1"
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],
+ [AS_VAR_SET(CACHEVAR,[yes])],
+ [AS_VAR_SET(CACHEVAR,[no])])
+ _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags])
+AS_IF([test x"AS_VAR_GET(CACHEVAR)" = xyes],
+ [m4_default([$2], :)],
+ [m4_default([$3], :)])
+AS_VAR_POPDEF([CACHEVAR])dnl
+])dnl AX_CHECK_COMPILE_FLAGS
diff --git a/man/rbd.8 b/man/rbd.8
index 27a74aaa19a..88048674614 100644
--- a/man/rbd.8
+++ b/man/rbd.8
@@ -148,6 +148,11 @@ Specifies output formatting (default: plain, json, xml)
.B \-\-pretty\-format
Make json or xml formatted output more human\-readable.
.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-read\-only
+Set device readonly when mapping image.
+.UNINDENT
.SH COMMANDS
.INDENT 0.0
.TP
diff --git a/qa/run_xfstests.sh b/qa/run_xfstests.sh
index f3dffca293f..f9c3e55a79d 100644
--- a/qa/run_xfstests.sh
+++ b/qa/run_xfstests.sh
@@ -276,6 +276,9 @@ function install_xfstests() {
cd xfstests
+ # FIXME: use an older version before the tests were rearranged!
+ git reset --hard e5f1a13792f20cfac097fef98007610b422f2cac
+
ncpu=$(getconf _NPROCESSORS_ONLN 2>&1)
[ -n "${ncpu}" -a "${ncpu}" -gt 1 ] && multiple="-j ${ncpu}"
diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh
index 7915e48a6ed..b098b13ad71 100755
--- a/qa/workunits/cephtool/test.sh
+++ b/qa/workunits/cephtool/test.sh
@@ -47,6 +47,30 @@ function check_response()
}
+# tiering
+ceph osd pool create cache 2
+ceph osd pool create cache2 2
+ceph osd tier add data cache
+ceph osd tier add data cache2
+expect_false ceph osd tier add metadata cache
+ceph osd tier cache-mode cache writeback
+ceph osd tier cache-mode cache readonly
+ceph osd tier cache-mode cache none
+ceph osd tier set-overlay data cache
+expect_false ceph osd tier set-overlay data cache2
+expect_false ceph osd tier remove data cache
+ceph osd tier remove-overlay data
+ceph osd tier set-overlay data cache2
+ceph osd tier remove-overlay data
+ceph osd tier remove data cache
+ceph osd tier add metadata cache
+expect_false ceph osd tier set-overlay data cache
+ceph osd tier set-overlay metadata cache
+ceph osd tier remove-overlay metadata
+ceph osd tier remove metadata cache
+ceph osd tier remove data cache2
+ceph osd pool delete cache cache --yes-i-really-really-mean-it
+ceph osd pool delete cache2 cache2 --yes-i-really-really-mean-it
#
# Assumes there are at least 3 MDSes and two OSDs
@@ -145,7 +169,16 @@ bl=192.168.0.1:0/1000
ceph osd blacklist add $bl
ceph osd blacklist ls | grep $bl
ceph osd blacklist rm $bl
-expect_false "(ceph osd blacklist ls | grep $bl)"
+expect_false "ceph osd blacklist ls | grep $bl"
+
+bl=192.168.0.1
+# test without nonce, invalid nonce
+ceph osd blacklist add $bl
+ceph osd blacklist ls | grep $bl
+ceph osd blacklist rm $bl
+expect_false "ceph osd blacklist ls | grep $bl"
+expect_false "ceph osd blacklist $bl/-1"
+expect_false "ceph osd blacklist $bl/foo"
ceph osd crush tunables legacy
ceph osd crush tunables bobtail
@@ -292,6 +325,9 @@ ceph osd pool set data size 3
ceph osd pool get data size | grep 'size: 3'
ceph osd pool set data size 2
+ceph osd pool set data hashpspool true
+ceph osd pool set data hashpspool false
+
ceph osd pool get rbd crush_ruleset | grep 'crush_ruleset: 2'
ceph osd thrash 10
@@ -310,4 +346,11 @@ ceph pg set_full_ratio 95 2>$TMPFILE; check_response $? 22 'not in range'
# expect "not in range" for invalid overload percentage
ceph osd reweight-by-utilization 80 2>$TMPFILE; check_response $? 22 'not in range'
+# expect 'heap' commands to be correctly parsed
+ceph heap stats
+ceph heap start_profiler
+ceph heap dump
+ceph heap stop_profiler
+ceph heap release
+
echo OK
diff --git a/qa/workunits/mon/crush_ops.sh b/qa/workunits/mon/crush_ops.sh
index 4f66e552153..f1770e171eb 100755
--- a/qa/workunits/mon/crush_ops.sh
+++ b/qa/workunits/mon/crush_ops.sh
@@ -64,4 +64,17 @@ ceph osd crush rm host2
ceph osd crush rm osd.$o1
ceph osd crush rm osd.$o2
+ceph osd crush add-bucket foo host
+ceph osd crush move foo root=default rack=localrack
+ceph osd crush rm foo
+
+# test reweight
+o3=`ceph osd create`
+ceph osd crush add $o3 123 root=default
+ceph osd tree | grep osd.$o3 | grep 123
+ceph osd crush reweight osd.$o3 113
+ceph osd tree | grep osd.$o3 | grep 113
+ceph osd crush rm osd.$o3
+ceph osd rm osd.$o3
+
echo OK
diff --git a/qa/workunits/mon/pool_ops.sh b/qa/workunits/mon/pool_ops.sh
index e98e1e4121e..2436cc4837e 100755
--- a/qa/workunits/mon/pool_ops.sh
+++ b/qa/workunits/mon/pool_ops.sh
@@ -2,7 +2,8 @@
set -e
-ceph osd pool create foo 123 123
+ceph osd pool create foo 123 123 key1=+++ && exit 1 || true
+ceph osd pool create foo 123 123 key1=value1 key2 key3=value3
ceph osd pool create fooo 123
ceph osd pool create foo 123 # idempotent
diff --git a/qa/workunits/mon/rbd_snaps_ops.sh b/qa/workunits/mon/rbd_snaps_ops.sh
new file mode 100755
index 00000000000..29e94df7cad
--- /dev/null
+++ b/qa/workunits/mon/rbd_snaps_ops.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+# attempt to trigger #6047
+
+
+cmd_no=0
+# Run a command string and check its exit status.
+#   $1 - command line, passed to eval
+#   $2 - exit status the command is expected to return
+# Prints the numbered command before running it. On a mismatch, prints the
+# actual and expected status and exits the script with status 1.
+# Uses the global counter cmd_no and sets the globals cmd, expected and ret.
+expect()
+{
+  cmd_no=$((cmd_no + 1))
+  cmd="$1"
+  expected="$2"
+  echo "[$cmd_no] $cmd"
+  # Quote the string so eval receives it intact rather than after word
+  # splitting and pathname expansion.
+  eval "$cmd"
+  ret=$?
+  if [[ $ret -ne $expected ]]; then
+    echo "[$cmd_no] unexpected return '$ret', expected '$expected'"
+    exit 1
+  fi
+}
+
+expect 'ceph osd pool create test 256 256' 0
+expect 'ceph osd pool mksnap test snapshot' 0
+expect 'ceph osd pool rmsnap test snapshot' 0
+
+expect 'rbd --pool=test create --size=102400 image' 0
+expect 'rbd --pool=test snap create image@snapshot' 22
+
+expect 'ceph osd pool delete test test --yes-i-really-really-mean-it' 0
+expect 'ceph osd pool create test 256 256' 0
+expect 'rbd --pool=test create --size=102400 image' 0
+expect 'rbd --pool=test snap create image@snapshot' 0
+expect 'rbd --pool=test snap ls image' 0
+expect 'rbd --pool=test snap rm image@snapshot' 0
+
+expect 'ceph osd pool mksnap test snapshot' 22
+
+expect 'ceph osd pool delete test test --yes-i-really-really-mean-it' 0
+
+echo OK
diff --git a/qa/workunits/rados/caching_redirects.sh b/qa/workunits/rados/caching_redirects.sh
new file mode 100755
index 00000000000..19b940b5b4c
--- /dev/null
+++ b/qa/workunits/rados/caching_redirects.sh
@@ -0,0 +1,59 @@
+#!/bin/bash -x
+
+set -e
+
+# Invert a command's result: return 0 when the command given as "$@"
+# fails, and 1 when it succeeds. Tracing is switched on with set -x.
+expect_false()
+{
+  set -x
+  if ! "$@"; then
+    return 0
+  fi
+  return 1
+}
+
+
+#create pools, set up tier relationship
+ceph osd pool create base_pool 2
+ceph osd pool create partial_cache 2
+ceph osd pool create data_cache 2
+ceph osd tier add base_pool partial_cache
+ceph osd tier add base_pool data_cache
+
+# populate base_pool and data_cache with some data
+echo "foo" > foo.txt
+echo "bar" > bar.txt
+echo "baz" > baz.txt
+rados -p base_pool put fooobj foo.txt
+rados -p base_pool put barobj bar.txt
+# data_cache is backwards so we can tell we read from it
+rados -p data_cache put fooobj bar.txt
+rados -p data_cache put barobj foo.txt
+# partial_cache gets barobj backwards
+rados -p partial_cache put barobj foo.txt
+
+# get the objects back before setting a caching pool
+rados -p base_pool get fooobj tmp.txt
+diff -q tmp.txt foo.txt
+rados -p base_pool get barobj tmp.txt
+diff -q tmp.txt bar.txt
+
+# set up redirect and make sure we get redirect-based results
+ceph osd tier set-overlay base_pool partial_cache
+ceph osd tier cache-mode partial_cache writeback
+rados -p base_pool get fooobj tmp.txt
+diff -q tmp.txt foo.txt
+rados -p base_pool get barobj tmp.txt
+diff -q tmp.txt foo.txt
+
+# switch cache pools and make sure contents differ
+ceph osd tier remove-overlay base_pool
+ceph osd tier set-overlay base_pool data_cache
+ceph osd tier cache-mode data_cache writeback
+rados -p base_pool get fooobj tmp.txt
+diff -q tmp.txt bar.txt
+rados -p base_pool get barobj tmp.txt
+diff -q tmp.txt foo.txt
+
+# drop the cache entirely and make sure contents are still the same
+ceph osd tier remove-overlay base_pool
+rados -p base_pool get fooobj tmp.txt
+diff -q tmp.txt foo.txt
+rados -p base_pool get barobj tmp.txt
+diff -q tmp.txt bar.txt
diff --git a/qa/workunits/rados/test_tmap_to_omap.sh b/qa/workunits/rados/test_tmap_to_omap.sh
new file mode 100755
index 00000000000..76656ad726b
--- /dev/null
+++ b/qa/workunits/rados/test_tmap_to_omap.sh
@@ -0,0 +1,28 @@
+#!/bin/sh -ex
+
+# Run the command given as "$@" with tracing enabled and invert its
+# result: status 0 if the command failed, status 1 if it succeeded.
+expect_false()
+{
+  set -x
+  if ! "$@"; then
+    return 0
+  fi
+  return 1
+}
+
+pool="pool-$$"
+rados mkpool $pool
+
+rados -p $pool tmap set foo key1 value1
+rados -p $pool tmap set foo key2 value2
+rados -p $pool tmap set foo key2 value2
+rados -p $pool tmap dump foo | grep key1
+rados -p $pool tmap dump foo | grep key2
+rados -p $pool tmap-to-omap foo
+expect_false rados -p $pool tmap dump foo
+expect_false rados -p $pool tmap dump foo
+
+rados -p $pool listomapkeys foo | grep key1
+rados -p $pool listomapkeys foo | grep key2
+rados -p $pool getomapval foo key1 | grep value1
+rados -p $pool getomapval foo key2 | grep value2
+
+rados rmpool $pool $pool --yes-i-really-really-mean-it
+
+echo OK
diff --git a/qa/workunits/rbd/copy.sh b/qa/workunits/rbd/copy.sh
index 8430fca7665..7abb3956c88 100755
--- a/qa/workunits/rbd/copy.sh
+++ b/qa/workunits/rbd/copy.sh
@@ -109,8 +109,8 @@ test_ls() {
rbd ls | grep test2
rbd ls | wc -l | grep 2
# look for fields in output of ls -l without worrying about space
- rbd ls -l | grep 'test1.*1024K.*1'
- rbd ls -l | grep 'test2.*1024K.*1'
+ rbd ls -l | grep 'test1.*1024k.*1'
+ rbd ls -l | grep 'test2.*1024k.*1'
rbd rm test1
rbd rm test2
@@ -120,8 +120,8 @@ test_ls() {
rbd ls | grep test1
rbd ls | grep test2
rbd ls | wc -l | grep 2
- rbd ls -l | grep 'test1.*1024K.*2'
- rbd ls -l | grep 'test2.*1024K.*2'
+ rbd ls -l | grep 'test1.*1024k.*2'
+ rbd ls -l | grep 'test2.*1024k.*2'
rbd rm test1
rbd rm test2
@@ -131,8 +131,8 @@ test_ls() {
rbd ls | grep test1
rbd ls | grep test2
rbd ls | wc -l | grep 2
- rbd ls -l | grep 'test1.*1024K.*2'
- rbd ls -l | grep 'test2.*1024K.*1'
+ rbd ls -l | grep 'test1.*1024k.*2'
+ rbd ls -l | grep 'test2.*1024k.*1'
remove_images
# test that many images can be shown by ls
diff --git a/qa/workunits/rbd/import_export.sh b/qa/workunits/rbd/import_export.sh
index 353a47fffbe..1813f7a9a88 100755
--- a/qa/workunits/rbd/import_export.sh
+++ b/qa/workunits/rbd/import_export.sh
@@ -66,7 +66,7 @@ dd if=/dev/urandom bs=1M count=1 of=/tmp/sparse2; truncate /tmp/sparse2 -s 2M
# 1M sparse, 1M data
rbd import $RBD_CREATE_ARGS --order 20 /tmp/sparse1
-rbd ls -l | grep sparse1 | grep '2048K'
+rbd ls -l | grep sparse1 | grep '2048k'
[ "$(objects sparse1)" = '1' ]
# export, compare contents and on-disk size
@@ -77,7 +77,7 @@ rbd rm sparse1
# 1M data, 1M sparse
rbd import $RBD_CREATE_ARGS --order 20 /tmp/sparse2
-rbd ls -l | grep sparse2 | grep '2048K'
+rbd ls -l | grep sparse2 | grep '2048k'
[ "$(objects sparse2)" = '0' ]
rbd export sparse2 /tmp/sparse2.out
compare_files_and_ondisk_sizes /tmp/sparse2 /tmp/sparse2.out
@@ -88,7 +88,7 @@ rbd rm sparse2
truncate /tmp/sparse1 -s 10M
# import from stdin just for fun, verify still sparse
rbd import $RBD_CREATE_ARGS --order 20 - sparse1 < /tmp/sparse1
-rbd ls -l | grep sparse1 | grep '10240K'
+rbd ls -l | grep sparse1 | grep '10240k'
[ "$(objects sparse1)" = '1' ]
rbd export sparse1 /tmp/sparse1.out
compare_files_and_ondisk_sizes /tmp/sparse1 /tmp/sparse1.out
@@ -99,7 +99,7 @@ rbd rm sparse1
dd if=/dev/urandom bs=2M count=1 of=/tmp/sparse2 oflag=append conv=notrunc
# again from stding
rbd import $RBD_CREATE_ARGS --order 20 - sparse2 < /tmp/sparse2
-rbd ls -l | grep sparse2 | grep '4096K'
+rbd ls -l | grep sparse2 | grep '4096k'
[ "$(objects sparse2)" = '0 2 3' ]
rbd export sparse2 /tmp/sparse2.out
compare_files_and_ondisk_sizes /tmp/sparse2 /tmp/sparse2.out
diff --git a/qa/workunits/snaps/snap-rm-diff.sh b/qa/workunits/snaps/snap-rm-diff.sh
index 8dff54f58b8..3d30dc7937a 100755
--- a/qa/workunits/snaps/snap-rm-diff.sh
+++ b/qa/workunits/snaps/snap-rm-diff.sh
@@ -1,5 +1,6 @@
#!/bin/sh -ex
+ceph mds set allow_new_snaps --yes-i-really-mean-it
wget -q http://ceph.com/qa/linux-2.6.33.tar.bz2
mkdir foo
cp linux* foo
diff --git a/qa/workunits/snaps/snaptest-0.sh b/qa/workunits/snaps/snaptest-0.sh
index 93e747af7dd..366249e7d25 100755
--- a/qa/workunits/snaps/snaptest-0.sh
+++ b/qa/workunits/snaps/snaptest-0.sh
@@ -1,7 +1,16 @@
#!/bin/sh -x
+# Succeed only when the given command fails.
+#   $@ - command and arguments to run
+# Returns 1 if the command exits with status 0, otherwise returns 0.
+expect_failure() {
+  # Branch on the command's exit status itself. Feeding its stdout to
+  # test(1) never looks at whether the command succeeded.
+  if "$@"; then
+    return 1
+  fi
+  return 0
+}
set -e
+expect_failure mkdir .snap/foo
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
echo asdf > foo
mkdir .snap/foo
grep asdf .snap/foo/foo
@@ -14,4 +23,7 @@ grep asdf .snap/bar/bar
rmdir .snap/bar
rm foo
+ceph mds unset allow_new_snaps --yes-i-really-mean-it
+expect_failure mkdir .snap/baz
+
echo OK \ No newline at end of file
diff --git a/qa/workunits/snaps/snaptest-1.sh b/qa/workunits/snaps/snaptest-1.sh
index 59d41ef688f..7c528dd432a 100755
--- a/qa/workunits/snaps/snaptest-1.sh
+++ b/qa/workunits/snaps/snaptest-1.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
echo 1 > file1
echo 2 > file2
echo 3 > file3
diff --git a/qa/workunits/snaps/snaptest-2.sh b/qa/workunits/snaps/snaptest-2.sh
index 4b67999921c..b73bf9cb97f 100755
--- a/qa/workunits/snaps/snaptest-2.sh
+++ b/qa/workunits/snaps/snaptest-2.sh
@@ -1,5 +1,7 @@
#!/bin/bash
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
echo "Create dir 100 to 199 ..."
for i in $(seq 100 199); do
echo " create dir $i"
diff --git a/qa/workunits/snaps/snaptest-authwb.sh b/qa/workunits/snaps/snaptest-authwb.sh
index 128efb70d19..acbb599bda9 100755
--- a/qa/workunits/snaps/snaptest-authwb.sh
+++ b/qa/workunits/snaps/snaptest-authwb.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
touch foo
chmod +x foo
mkdir .snap/s
diff --git a/qa/workunits/snaps/snaptest-capwb.sh b/qa/workunits/snaps/snaptest-capwb.sh
index 8c5a1333b69..9d0568cb6db 100755
--- a/qa/workunits/snaps/snaptest-capwb.sh
+++ b/qa/workunits/snaps/snaptest-capwb.sh
@@ -4,6 +4,8 @@ set -e
mkdir foo
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
# make sure mds handles it when the client does not send flushsnap
echo x > foo/x
sync
diff --git a/qa/workunits/snaps/snaptest-dir-rename.sh b/qa/workunits/snaps/snaptest-dir-rename.sh
index e81edf9c47f..6995f537a47 100755
--- a/qa/workunits/snaps/snaptest-dir-rename.sh
+++ b/qa/workunits/snaps/snaptest-dir-rename.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
#
# make sure we keep an existing dn's seq
#
diff --git a/qa/workunits/snaps/snaptest-double-null.sh b/qa/workunits/snaps/snaptest-double-null.sh
index cdf32e4f0ef..5a673ff9c0d 100755
--- a/qa/workunits/snaps/snaptest-double-null.sh
+++ b/qa/workunits/snaps/snaptest-double-null.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
# multiple intervening snapshots with no modifications, and thus no
# snapflush client_caps messages. make sure the mds can handle this.
diff --git a/qa/workunits/snaps/snaptest-estale.sh b/qa/workunits/snaps/snaptest-estale.sh
index a4fb94368d4..31ba5a87659 100755
--- a/qa/workunits/snaps/snaptest-estale.sh
+++ b/qa/workunits/snaps/snaptest-estale.sh
@@ -1,5 +1,7 @@
#!/bin/sh -x
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
mkdir .snap/foo
echo "We want ENOENT, not ESTALE, here."
diff --git a/qa/workunits/snaps/snaptest-git-ceph.sh b/qa/workunits/snaps/snaptest-git-ceph.sh
index 11532d8b14b..71a71e1d469 100755
--- a/qa/workunits/snaps/snaptest-git-ceph.sh
+++ b/qa/workunits/snaps/snaptest-git-ceph.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
git clone git://ceph.com/git/ceph.git
cd ceph
diff --git a/qa/workunits/snaps/snaptest-intodir.sh b/qa/workunits/snaps/snaptest-intodir.sh
index 3cbbe01718e..d022cfd479e 100755
--- a/qa/workunits/snaps/snaptest-intodir.sh
+++ b/qa/workunits/snaps/snaptest-intodir.sh
@@ -1,5 +1,7 @@
#!/bin/sh -ex
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
# this tests fix for #1399
mkdir foo
mkdir foo/.snap/one
diff --git a/qa/workunits/snaps/snaptest-multiple-capsnaps.sh b/qa/workunits/snaps/snaptest-multiple-capsnaps.sh
index 5ebc852cf6c..d88722bde09 100755
--- a/qa/workunits/snaps/snaptest-multiple-capsnaps.sh
+++ b/qa/workunits/snaps/snaptest-multiple-capsnaps.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
echo asdf > a
mkdir .snap/1
chmod 777 a
diff --git a/qa/workunits/snaps/snaptest-parents.sh b/qa/workunits/snaps/snaptest-parents.sh
index 7e5241a27c0..8963f628dc8 100644
--- a/qa/workunits/snaps/snaptest-parents.sh
+++ b/qa/workunits/snaps/snaptest-parents.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
echo "making directory tree and files"
mkdir -p 1/a/b/c/
echo "i'm file1" > 1/a/file1
diff --git a/qa/workunits/snaps/snaptest-snap-rm-cmp.sh b/qa/workunits/snaps/snaptest-snap-rm-cmp.sh
index aa094e70789..68ecf37b73e 100755
--- a/qa/workunits/snaps/snaptest-snap-rm-cmp.sh
+++ b/qa/workunits/snaps/snaptest-snap-rm-cmp.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
file=linux-2.6.33.tar.bz2
wget -q http://ceph.com/qa/$file
diff --git a/qa/workunits/snaps/snaptest-upchildrealms.sh b/qa/workunits/snaps/snaptest-upchildrealms.sh
index 63b7167b42d..b5b8830e9f0 100755
--- a/qa/workunits/snaps/snaptest-upchildrealms.sh
+++ b/qa/workunits/snaps/snaptest-upchildrealms.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
#
# verify that a snap update on a parent realm will induce
# snap cap writeback for inodes child realms
diff --git a/qa/workunits/snaps/snaptest-xattrwb.sh b/qa/workunits/snaps/snaptest-xattrwb.sh
index b2dd7bc748a..c36e2575845 100755
--- a/qa/workunits/snaps/snaptest-xattrwb.sh
+++ b/qa/workunits/snaps/snaptest-xattrwb.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
echo "testing simple xattr wb"
touch x
setfattr -n user.foo x
diff --git a/qa/workunits/snaps/untar_snap_rm.sh b/qa/workunits/snaps/untar_snap_rm.sh
index 5c71212df75..89e2db0cd10 100755
--- a/qa/workunits/snaps/untar_snap_rm.sh
+++ b/qa/workunits/snaps/untar_snap_rm.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
do_tarball() {
wget http://ceph.com/qa/$1
tar xvf$2 $1
diff --git a/qa/workunits/suites/fsstress.sh b/qa/workunits/suites/fsstress.sh
index 7f945172687..394e5fad991 100755
--- a/qa/workunits/suites/fsstress.sh
+++ b/qa/workunits/suites/fsstress.sh
@@ -2,6 +2,7 @@
if [ ! -f /usr/lib/ltp/testcases/bin/fsstress ]
then
+ path=`pwd`
mkdir -p /tmp/fsstress
cd /tmp/fsstress
wget -q -O /tmp/fsstress/ltp-full.tgz http://ceph.com/qa/ltp-full-20091231.tgz
@@ -13,6 +14,7 @@ then
sudo cp -avf /tmp/fsstress/ltp-full-20091231/testcases/kernel/fs/fsstress/fsstress /usr/lib/ltp/testcases/bin/fsstress
sudo chmod 755 /usr/lib/ltp/testcases/bin/fsstress
rm -Rf /tmp/fsstress
+ cd $path
fi
command="/usr/lib/ltp/testcases/bin/fsstress -d fsstress-`hostname`$$ -l 1 -n 1000 -p 10 -v"
diff --git a/src/.gitignore b/src/.gitignore
index 4c98529bd87..6efe8dc6bc4 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -68,6 +68,7 @@ Makefile
/test_*
/cls_test_*
/unittest_*
+/get_command_descriptions
# old dir, may in use by older branches
/leveldb
diff --git a/src/Makefile-env.am b/src/Makefile-env.am
new file mode 100644
index 00000000000..6a4e09512a2
--- /dev/null
+++ b/src/Makefile-env.am
@@ -0,0 +1,179 @@
+AUTOMAKE_OPTIONS = gnu subdir-objects
+
+SUBDIRS =
+DIST_SUBDIRS =
+BUILT_SOURCES =
+EXTRA_DIST =
+CLEANFILES =
+
+noinst_HEADERS =
+bin_PROGRAMS =
+noinst_PROGRAMS =
+bin_SCRIPTS =
+sbin_PROGRAMS =
+sbin_SCRIPTS =
+dist_bin_SCRIPTS =
+lib_LTLIBRARIES =
+noinst_LTLIBRARIES =
+noinst_LIBRARIES =
+radoslib_LTLIBRARIES =
+
+# like bin_PROGRAMS, but these targets are only built for debug builds
+bin_DEBUGPROGRAMS =
+
+# like sbin_SCRIPTS but can be used to install to e.g. /usr/sbin
+ceph_sbindir = $(exec_prefix)$(sbindir)
+
+# C/C++ tests to build will be appended to this
+check_PROGRAMS =
+
+# tests scripts will be appended to this
+check_SCRIPTS =
+
+# python unit tests need to know where the scripts are located
+export PYTHONPATH=$(top_srcdir)/src/pybind
+
+# when doing a debug build, make sure to make the targets
+if WITH_DEBUG
+bin_PROGRAMS += $(bin_DEBUGPROGRAMS)
+endif
+
+
+##################################
+## automake environment
+
+AM_COMMON_CPPFLAGS = \
+ -D__CEPH__ \
+ -D_FILE_OFFSET_BITS=64 \
+ -D_REENTRANT \
+ -D_THREAD_SAFE \
+ -D__STDC_FORMAT_MACROS \
+ -D_GNU_SOURCE \
+ -DCEPH_LIBDIR=\"${libdir}\"
+
+AM_COMMON_CFLAGS = \
+ -rdynamic \
+ -Wall \
+ ${WARN_TYPE_LIMITS} \
+ ${WARN_IGNORED_QUALIFIERS} \
+ -Winit-self \
+ -Wpointer-arith \
+ -Werror=format-security \
+ -fno-strict-aliasing \
+ -fsigned-char
+
+AM_CFLAGS = $(AM_COMMON_CFLAGS)
+AM_CPPFLAGS = $(AM_COMMON_CPPFLAGS)
+AM_CXXFLAGS = \
+ @AM_CXXFLAGS@ \
+ $(AM_COMMON_CFLAGS) \
+ -Wnon-virtual-dtor \
+ -Wno-invalid-offsetof \
+ -Wstrict-null-sentinel
+
+# note: this is position dependent, it affects the -l options that
+# come after it on the command line. when you use ${AM_LDFLAGS} in
+# later rules, take care where you place it. for more information, see
+# http://blog.flameeyes.eu/2008/11/19/relationship-between-as-needed-and-no-undefined-part-1-what-do-they-do
+# http://blog.flameeyes.eu/2008/11/20/misguided-link-and-as-needed
+# http://www.gentoo.org/proj/en/qa/asneeded.xml
+# http://gcc.gnu.org/ml/gcc-help/2010-12/msg00338.html
+# http://sigquit.wordpress.com/2011/02/16/why-asneeded-doesnt-work-as-expected-for-your-libraries-on-your-autotools-project/
+AM_LDFLAGS = -Wl,--as-needed
+
+if USE_BOOST_SPIRIT_OLD_HDR
+AM_CXXFLAGS += -DUSE_BOOST_SPIRIT_OLD_HDR
+endif
+
+if WITH_LIBATOMIC
+AM_LDFLAGS += -latomic_ops
+endif
+
+if ENABLE_COVERAGE
+AM_CFLAGS += -fprofile-arcs -ftest-coverage
+AM_CXXFLAGS += -fprofile-arcs -ftest-coverage -O0
+endif
+
+CCAS = ${srcdir}/yasm-wrapper
+AM_CCASFLAGS = -f elf64
+
+
+#####################
+## library definitions and dependencies
+
+EXTRALIBS = -luuid -lm -lkeyutils
+if FREEBSD
+EXTRALIBS += -lexecinfo
+endif # FREEBSD
+
+if LINUX
+EXTRALIBS += -lrt
+endif # LINUX
+
+if WITH_PROFILER
+EXTRALIBS += -lprofiler
+endif # PROFILER
+
+LIBGLOBAL = libglobal.la
+LIBCOMMON = libcommon.la
+LIBARCH = libarch.la
+LIBPERFGLUE = libperfglue.la
+LIBAUTH = libauth.la
+LIBMSG = libmsg.la
+LIBCRUSH = libcrush.la
+LIBJSON_SPIRIT = libjson_spirit.la
+LIBLOG = liblog.la
+LIBOS = libos.la
+LIBOSD = libosd.la
+LIBOSDC = libosdc.la
+LIBMON = libmon.la
+LIBMDS = libmds.la
+LIBCLIENT = libclient.la
+LIBCLIENT_FUSE = libclient_fuse.la
+LIBRADOS = librados.la
+LIBRGW = librgw.la
+LIBRBD = librbd.la
+LIBCEPHFS = libcephfs.la
+
+if WITH_LIBAIO
+LIBOS += -laio
+endif # WITH_LIBAIO
+
+if WITH_LIBZFS
+LIBOS += libos_zfs.a -lzfs
+endif # WITH_LIBZFS
+
+if WITH_TCMALLOC
+LIBPERFGLUE += -ltcmalloc
+endif # WITH_TCMALLOC
+
+if ENABLE_COVERAGE
+EXTRALIBS += -lgcov
+endif # ENABLE_COVERAGE
+
+# Libosd always needs osdc and os
+LIBOSD += $(LIBOSDC) $(LIBOS)
+
+# These have references to syms like ceph_using_tcmalloc(), glue libperfglue to them
+LIBMON += $(LIBPERFGLUE)
+LIBOSD += $(LIBPERFGLUE)
+LIBMDS += $(LIBPERFGLUE)
+
+# Always use system leveldb
+LIBOS += -lleveldb -lsnappy
+
+# Use this for binaries requiring libglobal
+CEPH_GLOBAL = $(LIBGLOBAL) $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS)
+
+# This is set by [lib]/Makefile.am and used for build tests
+LIBCOMMON_DEPS =
+LIBRADOS_DEPS =
+LIBRGW_DEPS =
+
+# This is used by the dencoder test
+DENCODER_SOURCES =
+DENCODER_DEPS =
+
+
+radoslibdir = $(libdir)/rados-classes
+
diff --git a/src/Makefile.am b/src/Makefile.am
index e895b74a0cc..280b268479e 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,1286 +1,125 @@
-AUTOMAKE_OPTIONS = gnu
-SUBDIRS = ocf java
-DIST_SUBDIRS = gtest ocf libs3 java
-
-EXTRA_DIST = \
- libs3/COPYING \
- libs3/ChangeLog \
- libs3/GNUmakefile \
- libs3/GNUmakefile.mingw \
- libs3/GNUmakefile.osx \
- libs3/INSTALL \
- libs3/LICENSE \
- libs3/README \
- libs3/TODO \
- libs3/archlinux \
- libs3/debian \
- libs3/doxyfile \
- libs3/inc \
- libs3/libs3.spec \
- libs3/mswin \
- libs3/src \
- libs3/test \
- unittest_bufferlist.sh \
- yasm-wrapper
-
-CLEANFILES =
-bin_PROGRAMS =
-# like bin_PROGRAMS, but these targets are only built for debug builds
-bin_DEBUGPROGRAMS =
-sbin_PROGRAMS =
-# like sbin_SCRIPTS but can be used to install to e.g. /usr/sbin
-ceph_sbindir = $(exec_prefix)$(sbindir)
-ceph_sbin_SCRIPTS = \
- ceph-disk \
- ceph-disk-prepare \
- ceph-disk-activate \
- ceph-disk-udev \
- ceph-create-keys
-
-sbin_SCRIPTS = \
- mount.fuse.ceph
-
-bin_SCRIPTS = ceph ceph-run ceph-rest-api ceph-clsinfo ceph-debugpack ceph-rbdnamer ceph-post-file
-dist_bin_SCRIPTS =
-# C/C++ tests to build will be appended to this
-check_PROGRAMS =
-# tests to actually run on "make check"; if you need extra, non-test,
-# executables built, you need to replace this with manual assignments
-# target by target
-TESTS = $(check_PROGRAMS) unittest_bufferlist.sh
-
-check-local:
- $(srcdir)/test/encoding/check-generated.sh
- $(srcdir)/test/encoding/readable.sh ../ceph-object-corpus
-
-EXTRALIBS = -luuid
-if FREEBSD
-EXTRALIBS += -lexecinfo
-endif
-if LINUX
-EXTRALIBS += -lrt
-endif
-if WITH_PROFILER
-EXTRALIBS += -lprofiler
-endif
-
-LIBGLOBAL_LDA = libglobal.la $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS)
-
-LIBOS_LDA = libos.a
-
-if WITH_LIBAIO
-LIBOS_LDA += -laio
-endif
-
-if WITH_LIBZFS
-LIBOS_LDA += libos_zfs.a -lzfs
-endif
-
-# use system leveldb
-LIBOS_LDA += -lleveldb -lsnappy
-
-# monitor
-ceph_mon_SOURCES = ceph_mon.cc common/TextTable.cc
-ceph_mon_LDFLAGS = $(AM_LDFLAGS)
-ceph_mon_LDADD = libmon.a $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-ceph_mon_CXXFLAGS = ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS}
+include Makefile-env.am
+
+SUBDIRS += ocf java
+DIST_SUBDIRS += gtest ocf libs3 java
+
+
+# subdirs
+
+include arch/Makefile.am
+include auth/Makefile.am
+include crush/Makefile.am
+include mon/Makefile.am
+include mds/Makefile.am
+include os/Makefile.am
+include osd/Makefile.am
+include osdc/Makefile.am
+include client/Makefile.am
+include global/Makefile.am
+include json_spirit/Makefile.am
+include log/Makefile.am
+include perfglue/Makefile.am
+include common/Makefile.am
+include msg/Makefile.am
+include messages/Makefile.am
+include include/Makefile.am
+include librados/Makefile.am
+include librbd/Makefile.am
+include rgw/Makefile.am
+include cls/Makefile.am
+include key_value_store/Makefile.am
+include test/Makefile.am
+include tools/Makefile.am
+
+
+# core daemons
+
+ceph_mon_SOURCES = ceph_mon.cc
+ceph_mon_LDADD = $(LIBMON) $(LIBOS) $(CEPH_GLOBAL)
bin_PROGRAMS += ceph-mon
-ceph_mon_store_converter_SOURCES = mon_store_converter.cc \
- mon/MonitorStore.cc
-ceph_mon_store_converter_LDFLAGS = ${AM_LDFLAGS}
-ceph_mon_store_converter_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-ceph_mon_store_converter_CXXFLAGS = ${AM_CXXFLAGS}
-bin_PROGRAMS += ceph_mon_store_converter
-
-
-# osd
ceph_osd_SOURCES = ceph_osd.cc
-ceph_osd_LDADD = libosd.a $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-ceph_osd_CXXFLAGS = ${AM_CXXFLAGS}
-bin_PROGRAMS += ceph-osd
-
+ceph_osd_LDADD = $(LIBOSD) $(CEPH_GLOBAL)
if LINUX
ceph_osd_LDADD += -ldl
-endif
+endif # LINUX
+bin_PROGRAMS += ceph-osd
-# mds
ceph_mds_SOURCES = ceph_mds.cc
-ceph_mds_LDADD = libmds.a libosdc.la $(LIBGLOBAL_LDA)
+ceph_mds_LDADD = $(LIBMDS) $(LIBOSDC) $(CEPH_GLOBAL)
bin_PROGRAMS += ceph-mds
-ceph_mds_CXXFLAGS = ${AM_CXXFLAGS}
+
# admin tools
-ceph_conf_SOURCES = ceph_conf.cc
-ceph_conf_LDADD = $(LIBGLOBAL_LDA)
-ceph_conf_CXXFLAGS = ${AM_CXXFLAGS}
-ceph_authtool_SOURCES = ceph_authtool.cc
-ceph_authtool_LDADD = $(LIBGLOBAL_LDA)
-ceph_filestore_dump_SOURCES = tools/ceph-filestore-dump.cc
-ceph_filestore_dump_SOURCES += perfglue/disabled_heap_profiler.cc
-ceph_filestore_dump_LDADD = libosd.a $(LIBOS_LDA) $(LIBGLOBAL_LDA) -lboost_program_options
-if LINUX
-ceph_filestore_dump_LDADD += -ldl
-endif
-bin_PROGRAMS += ceph-conf ceph-authtool ceph_filestore_dump
-
-ceph_osdomap_tool_SOURCES = tools/ceph-osdomap-tool.cc \
- os/LevelDBStore.cc
-ceph_osdomap_tool_LDFLAGS = ${AM_LDFLAGS}
-ceph_osdomap_tool_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA) -lboost_program_options
-ceph_osdomap_tool_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph-osdomap-tool
-
-ceph_monstore_tool_SOURCES = tools/ceph-monstore-tool.cc \
- os/LevelDBStore.cc
-ceph_monstore_tool_LDFLAGS = ${AM_LDFLAGS}
-ceph_monstore_tool_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA) -lboost_program_options
-ceph_monstore_tool_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph-monstore-tool
-
-monmaptool_SOURCES = monmaptool.cc
-monmaptool_LDADD = $(LIBGLOBAL_LDA)
-crushtool_SOURCES = crushtool.cc
-crushtool_LDADD = $(LIBGLOBAL_LDA)
-osdmaptool_SOURCES = osdmaptool.cc
-osdmaptool_LDADD = $(LIBGLOBAL_LDA)
-bin_PROGRAMS += monmaptool crushtool osdmaptool
-
-rgw_dencoder_src = rgw/rgw_dencoder.cc \
- rgw/rgw_acl.cc \
- rgw/rgw_common.cc \
- rgw/rgw_env.cc \
- rgw/rgw_json_enc.cc
-
-ceph_dencoder_SOURCES = test/encoding/ceph_dencoder.cc ${rgw_dencoder_src} perfglue/disabled_heap_profiler.cc
-ceph_dencoder_CXXFLAGS = ${AM_CXXFLAGS}
-ceph_dencoder_LDADD = $(LIBGLOBAL_LDA) libcls_lock_client.a \
- libcls_rgw_client.a \
- libcls_replica_log_client.a \
- libcls_refcount_client.a \
- libosd.a libmds.a libosdc.la $(LIBOS_LDA) libmon.a
-bin_PROGRAMS += ceph-dencoder
-
-mount_ceph_SOURCES = mount/mount.ceph.c common/armor.c common/safe_io.c common/secret.c include/addr_parsing.c
-mount_ceph_LDADD = -lkeyutils
+
+# user tools
+
+mount_ceph_SOURCES = mount/mount.ceph.c
+mount_ceph_LDADD = $(LIBCOMMON)
if LINUX
sbin_PROGRAMS += mount.ceph
-endif
+endif # LINUX
+sbin_SCRIPTS += mount.fuse.ceph
-# user tools
cephfs_SOURCES = cephfs.cc
-cephfs_LDADD = libcommon.la
+cephfs_LDADD = $(LIBCOMMON)
bin_PROGRAMS += cephfs
librados_config_SOURCES = librados-config.cc
-librados_config_LDADD = librados.la $(LIBGLOBAL_LDA)
+librados_config_LDADD = $(LIBRADOS) $(CEPH_GLOBAL)
bin_PROGRAMS += librados-config
-# synthetic client
-ceph_syn_SOURCES = ceph_syn.cc client/SyntheticClient.cc
-ceph_syn_LDADD = libclient.la $(LIBGLOBAL_LDA)
+ceph_syn_SOURCES = ceph_syn.cc
+ceph_syn_SOURCES += client/SyntheticClient.cc # uses g_conf.. needs cleanup
+ceph_syn_LDADD = $(LIBCLIENT) $(CEPH_GLOBAL)
bin_PROGRAMS += ceph-syn
-base: ceph-mon ceph-osd ceph-mds \
- cephfs \
- ceph-syn \
- rados radosgw librados-config \
- ceph-conf monmaptool osdmaptool crushtool ceph-authtool \
- init-ceph mkcephfs mon_store_converter ceph-post-file
+rbd_SOURCES = rbd.cc
+rbd_LDADD = $(LIBRBD) $(LIBRADOS) $(CEPH_GLOBAL)
+if LINUX
+bin_PROGRAMS += rbd
+endif #LINUX
+
+
+# Fuse targets
-# fuse targets?
if WITH_FUSE
-ceph_fuse_SOURCES = ceph_fuse.cc client/fuse_ll.cc
-ceph_fuse_LDADD = -lfuse libclient.la $(LIBGLOBAL_LDA)
-ceph_fuse_CXXFLAGS = ${AM_CXXFLAGS}
+ceph_fuse_SOURCES = ceph_fuse.cc
+ceph_fuse_LDADD = $(LIBCLIENT_FUSE) $(CEPH_GLOBAL)
bin_PROGRAMS += ceph-fuse
rbd_fuse_SOURCES = rbd_fuse/rbd-fuse.c
-rbd_fuse_LDADD = -lfuse librados.la librbd.la $(LIBGLOBAL_LDA)
-rbd_fuse_CXXFLAGS = ${AM_CXXFLAGS}
+rbd_fuse_LDADD = -lfuse $(LIBRBD) $(LIBRADOS) $(CEPH_GLOBAL)
bin_PROGRAMS += rbd-fuse
+endif # WITH_FUSE
-endif
-# tcmalloc?
-if WITH_TCMALLOC
-tcmalloc_safety_flags = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
-ceph_osd_LDADD += -ltcmalloc
-ceph_osd_CXXFLAGS += ${tcmalloc_safety_flags}
-ceph_osd_SOURCES += perfglue/heap_profiler.cc
-ceph_mds_LDADD += -ltcmalloc
-ceph_mds_CXXFLAGS += ${tcmalloc_safety_flags}
-ceph_mds_SOURCES += perfglue/heap_profiler.cc
-ceph_mon_LDADD += -ltcmalloc
-ceph_mon_CXXFLAGS += ${tcmalloc_safety_flags}
-ceph_mon_SOURCES += perfglue/heap_profiler.cc
-if WITH_FUSE
-ceph_fuse_LDADD += -ltcmalloc
-ceph_fuse_CXXFLAGS += ${tcmalloc_safety_flags}
-endif #WITH_FUSE
-else
-ceph_osd_SOURCES += perfglue/disabled_heap_profiler.cc
-ceph_mds_SOURCES += perfglue/disabled_heap_profiler.cc
-ceph_mon_SOURCES += perfglue/disabled_heap_profiler.cc
-endif # WITH_TCMALLOC
-
-# debug targets
-ceph_psim_SOURCES = psim.cc
-ceph_psim_LDADD = $(LIBGLOBAL_LDA)
-bin_DEBUGPROGRAMS += ceph_psim
-
-ceph_test_mutate_SOURCES = test/test_mutate.cc
-ceph_test_mutate_LDADD = librados.la $(LIBGLOBAL_LDA)
-bin_DEBUGPROGRAMS += ceph_test_mutate
-
-ceph_test_rewrite_latency_SOURCES = test/test_rewrite_latency.cc
-ceph_test_rewrite_latency_LDADD = libcommon.la $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS)
-bin_DEBUGPROGRAMS += ceph_test_rewrite_latency
-
-ceph_test_msgr_SOURCES = testmsgr.cc
-ceph_test_msgr_LDADD = $(LIBGLOBAL_LDA)
-bin_DEBUGPROGRAMS += ceph_test_msgr
-
-ceph_test_ioctls_SOURCES = client/test_ioctls.c
-bin_DEBUGPROGRAMS += ceph_test_ioctls
-
-ceph_dupstore_SOURCES = dupstore.cc
-ceph_dupstore_CXXFLAGS= ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS}
-ceph_dupstore_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-ceph_streamtest_SOURCES = streamtest.cc
-ceph_streamtest_CXXFLAGS= ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS}
-ceph_streamtest_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-bin_DEBUGPROGRAMS += ceph_dupstore ceph_streamtest
-
-ceph_test_trans_SOURCES = test_trans.cc
-ceph_test_trans_CXXFLAGS= ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS}
-ceph_test_trans_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-bin_DEBUGPROGRAMS += ceph_test_trans
-
-ceph_test_rados_SOURCES = test/osd/TestRados.cc test/osd/TestOpStat.cc test/osd/Object.cc test/osd/RadosModel.cc
-ceph_test_rados_LDADD = librados.la $(LIBGLOBAL_LDA)
-bin_DEBUGPROGRAMS += ceph_test_rados
-
-ceph_smalliobench_SOURCES = test/bench/small_io_bench.cc test/bench/rados_backend.cc test/bench/detailed_stat_collector.cc test/bench/bencher.cc
-ceph_smalliobench_LDADD = librados.la -lboost_program_options $(LIBGLOBAL_LDA)
-bin_DEBUGPROGRAMS += ceph_smalliobench
-
-ceph_smalliobenchfs_SOURCES = test/bench/small_io_bench_fs.cc test/bench/testfilestore_backend.cc test/bench/detailed_stat_collector.cc test/bench/bencher.cc
-ceph_smalliobenchfs_LDADD = librados.la -lboost_program_options $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-ceph_smalliobenchfs_CXXFLAGS = ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_smalliobenchfs
-
-ceph_smalliobenchdumb_SOURCES = test/bench/small_io_bench_dumb.cc test/bench/dumb_backend.cc test/bench/detailed_stat_collector.cc test/bench/bencher.cc
-ceph_smalliobenchdumb_LDADD = librados.la -lboost_program_options $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-ceph_smalliobenchdumb_CXXFLAGS = ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_smalliobenchdumb
-
-ceph_smalliobenchrbd_SOURCES = test/bench/small_io_bench_rbd.cc test/bench/rbd_backend.cc test/bench/detailed_stat_collector.cc test/bench/bencher.cc
-ceph_smalliobenchrbd_LDADD = librados.la librbd.la -lboost_program_options $(LIBGLOBAL_LDA)
-bin_DEBUGPROGRAMS += ceph_smalliobenchrbd
-
-ceph_tpbench_SOURCES = test/bench/tp_bench.cc test/bench/detailed_stat_collector.cc
-ceph_tpbench_LDADD = librados.la -lboost_program_options $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-bin_DEBUGPROGRAMS += ceph_tpbench
-
-ceph_omapbench_SOURCES = test/omap_bench.cc
-ceph_omapbench_LDADD = librados.la $(LIBGLOBAL_LDA)
-bin_DEBUGPROGRAMS += ceph_omapbench
-
-ceph_kvstorebench_SOURCES = test/kv_store_bench.cc key_value_store/kv_flat_btree_async.cc
-ceph_kvstorebench_LDADD = librados.la $(LIBGLOBAL_LDA)
-bin_DEBUGPROGRAMS += ceph_kvstorebench
-
-ceph_multi_stress_watch_SOURCES = test/multi_stress_watch.cc test/librados/test.cc
-ceph_multi_stress_watch_LDADD = librados.la $(LIBGLOBAL_LDA)
-bin_DEBUGPROGRAMS += ceph_multi_stress_watch
-
-if WITH_BUILD_TESTS
-ceph_test_libcommon_build_SOURCES = test/test_libcommon_build.cc $(libcommon_files)
-ceph_test_libcommon_build_LDADD = $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS)
-bin_DEBUGPROGRAMS += ceph_test_libcommon_build
-
-ceph_test_librados_build_SOURCES = test/test_libcommon_build.cc $(libcommon_files) $(librados_SOURCES)
-ceph_test_librados_build_LDADD = $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS)
-ceph_test_librados_build_CXXFLAGS = $(AM_CXXFLAGS)
-bin_DEBUGPROGRAMS += ceph_test_librados_build
-
-ceph_test_librgw_build_SOURCES = test/test_libcommon_build.cc $(libcommon_files) \
- $(librados_SOURCES) $(librgw_la_SOURCES)
-ceph_test_librgw_build_LDADD = -lexpat $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS)
-ceph_test_librgw_build_CXXFLAGS = $(AM_CXXFLAGS)
-bin_DEBUGPROGRAMS += ceph_test_librgw_build
-
-ceph_test_libcephfs_build_SOURCES = test/test_libcommon_build.cc $(libcommon_files) \
- $(libosdc_la_SOURCES)
-ceph_test_libcephfs_build_LDADD = libcephfs.la -lexpat $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS)
-ceph_test_libcephfs_build_CXXFLAGS = $(AM_CXXFLAGS)
-bin_DEBUGPROGRAMS += ceph_test_libcephfs_build
-endif
-
-if WITH_HADOOPCLIENT
-ceph_test_libhadoopcephfs_build_SOURCES = test/test_libcommon_build.cc \
- $(libhadoopcephfs_la_SOURCES) \
- $(libosdc_la_SOURCES) $(libcommon_files)
-ceph_test_libhadoopcephfs_build_LDADD = libcephfs.la -lexpat $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS)
-ceph_test_libhadoopcephfs_build_CXXFLAGS = $(AM_CXXFLAGS)
-bin_DEBUGPROGRAMS += ceph_test_libhadoopcephfs_build
-endif
+# libcephfs (this should go somewhere else in the future)
-##########
-BUILT_SOURCES =
-lib_LTLIBRARIES =
-noinst_LTLIBRARIES =
-noinst_LIBRARIES =
-
-# libcephfs
-libcephfs_la_SOURCES = \
- libcephfs.cc
-libcephfs_la_CFLAGS= ${CRYPTO_CFLAGS} ${AM_CFLAGS}
-libcephfs_la_CXXFLAGS= ${AM_CXXFLAGS}
-libcephfs_la_LIBADD = libclient.la
-libcephfs_la_LDFLAGS = $(PTHREAD_LIBS) $(CRYPTO_LIBS) $(EXTRALIBS) \
- ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '^ceph_.*'
+libcephfs_la_SOURCES = libcephfs.cc
+libcephfs_la_LIBADD = $(LIBCLIENT) $(LIBCOMMON) $(PTHREAD_LIBS) $(CRYPTO_LIBS) $(EXTRALIBS)
+libcephfs_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '^ceph_.*'
lib_LTLIBRARIES += libcephfs.la
-ceph_test_timers_SOURCES = test/TestTimers.cc
-ceph_test_timers_LDADD = $(LIBGLOBAL_LDA)
-bin_DEBUGPROGRAMS += ceph_test_timers
-
-ceph_test_signal_handlers_SOURCES = test/TestSignalHandlers.cc
-ceph_test_signal_handlers_LDADD = $(LIBGLOBAL_LDA)
-bin_DEBUGPROGRAMS += ceph_test_signal_handlers
-
-# librados
-librados_SOURCES = \
- librados/librados.cc \
- librados/RadosClient.cc \
- librados/IoCtxImpl.cc \
- osdc/Objecter.cc \
- osdc/Striper.cc \
- cls/lock/cls_lock_client.cc \
- cls/lock/cls_lock_types.cc \
- cls/lock/cls_lock_ops.cc
-librados_la_SOURCES = ${librados_SOURCES}
-librados_la_CFLAGS = ${CRYPTO_CFLAGS} ${AM_CFLAGS}
-librados_la_CXXFLAGS = ${AM_CXXFLAGS}
-librados_la_LIBADD = libcommon.la $(PTHREAD_LIBS) $(CRYPTO_LIBS) $(EXTRALIBS)
-librados_la_LDFLAGS = ${AM_LDFLAGS} -version-info 2:0:0 -export-symbols-regex '^rados_.*'
-lib_LTLIBRARIES += librados.la
-
-if WITH_RADOSGW
-
-# rgw
-librgw_a_SOURCES = \
- rgw/librgw.cc \
- rgw/rgw_acl.cc \
- rgw/rgw_acl_s3.cc \
- rgw/rgw_acl_swift.cc \
- rgw/rgw_client_io.cc \
- rgw/rgw_fcgi.cc \
- rgw/rgw_xml.cc \
- rgw/rgw_usage.cc \
- rgw/rgw_json_enc.cc \
- rgw/rgw_user.cc \
- rgw/rgw_bucket.cc\
- rgw/rgw_tools.cc \
- rgw/rgw_rados.cc \
- rgw/rgw_http_client.cc \
- rgw/rgw_rest_client.cc \
- rgw/rgw_rest_conn.cc \
- rgw/rgw_op.cc \
- rgw/rgw_common.cc \
- rgw/rgw_cache.cc \
- rgw/rgw_formats.cc \
- rgw/rgw_log.cc \
- rgw/rgw_multi.cc \
- rgw/rgw_policy_s3.cc \
- rgw/rgw_gc.cc \
- rgw/rgw_multi_del.cc \
- rgw/rgw_env.cc \
- rgw/rgw_cors.cc \
- rgw/rgw_cors_s3.cc \
- rgw/rgw_auth_s3.cc \
- rgw/rgw_metadata.cc \
- rgw/rgw_replica_log.cc
-librgw_a_CFLAGS = ${CRYPTO_CFLAGS} ${AM_CFLAGS}
-librgw_a_CXXFLAGS = -Woverloaded-virtual ${AM_CXXFLAGS}
-noinst_LIBRARIES += librgw.a
-
-my_radosgw_ldadd = \
- librgw.a librados.la libcls_rgw_client.a libcls_log_client.a \
- libcls_statelog_client.a libcls_replica_log_client.a libcls_lock_client.a \
- libcls_refcount_client.a libcls_version_client.a -lcurl -lexpat \
- $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS) $(LIBGLOBAL_LDA)
-
-radosgw_SOURCES = \
- rgw/rgw_resolve.cc \
- rgw/rgw_rest.cc \
- rgw/rgw_rest_swift.cc \
- rgw/rgw_rest_s3.cc \
- rgw/rgw_rest_usage.cc \
- rgw/rgw_rest_user.cc \
- rgw/rgw_rest_bucket.cc \
- rgw/rgw_rest_metadata.cc \
- rgw/rgw_replica_log.cc \
- rgw/rgw_rest_log.cc \
- rgw/rgw_rest_opstate.cc \
- rgw/rgw_rest_replica_log.cc \
- rgw/rgw_rest_config.cc \
- rgw/rgw_http_client.cc \
- rgw/rgw_swift.cc \
- rgw/rgw_swift_auth.cc \
- rgw/rgw_main.cc
-radosgw_LDADD = $(my_radosgw_ldadd) -lfcgi -lresolv
-radosgw_CXXFLAGS = ${AM_CXXFLAGS}
-bin_PROGRAMS += radosgw
-
-radosgw_admin_SOURCES = rgw/rgw_admin.cc
-radosgw_admin_CXXFLAGS = ${AM_CXXFLAGS}
-radosgw_admin_LDADD = $(my_radosgw_ldadd)
-bin_PROGRAMS += radosgw-admin
-
-ceph_rgw_multiparser_SOURCES = rgw/rgw_multiparser.cc
-ceph_rgw_multiparser_CXXFLAGS = ${AM_CXXFLAGS}
-ceph_rgw_multiparser_LDADD = $(my_radosgw_ldadd)
-bin_DEBUGPROGRAMS += ceph_rgw_multiparser
-
-ceph_rgw_jsonparser_SOURCES = rgw/rgw_jsonparser.cc rgw/rgw_common.cc rgw/rgw_env.cc rgw/rgw_json_enc.cc
-ceph_rgw_jsonparser_CXXFLAGS = ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS}
-ceph_rgw_jsonparser_LDADD = $(my_radosgw_ldadd)
-bin_DEBUGPROGRAMS += ceph_rgw_jsonparser
-
-endif
-
-# librbd
-librbd_la_SOURCES = \
- librbd/librbd.cc \
- librbd/AioCompletion.cc \
- librbd/AioRequest.cc \
- cls/rbd/cls_rbd_client.cc \
- librbd/ImageCtx.cc \
- librbd/internal.cc \
- librbd/LibrbdWriteback.cc \
- librbd/WatchCtx.cc \
- osdc/ObjectCacher.cc \
- osdc/Striper.cc \
- librados/snap_set_diff.cc \
- cls/lock/cls_lock_client.cc \
- cls/lock/cls_lock_types.cc \
- cls/lock/cls_lock_ops.cc \
- common/util.cc
-librbd_la_CFLAGS = ${AM_CFLAGS} ${CRYPTO_CFLAGS}
-librbd_la_CXXFLAGS = ${AM_CXXFLAGS}
-librbd_la_LIBADD = librados.la
-librbd_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 \
- -export-symbols-regex '^rbd_.*' $(PTHREAD_LIBS) $(EXTRALIBS)
-lib_LTLIBRARIES += librbd.la
-
-rados_SOURCES = rados.cc rados_import.cc rados_export.cc rados_sync.cc common/obj_bencher.cc
-rados_LDADD = libcls_lock_client.a librados.la $(LIBGLOBAL_LDA)
-bin_PROGRAMS += rados
-
-if WITH_REST_BENCH
-
-rest_bench_SOURCES = tools/rest_bench.cc common/obj_bencher.cc
-rest_bench_LDADD = $(LIBGLOBAL_LDA)
-rest_bench_CXXFLAGS = ${AM_CXXFLAGS}
-bin_PROGRAMS += rest-bench
-
-if WITH_SYSTEM_LIBS3
-rest_bench_LDADD += -ls3
-else
-rest_bench_LDADD += libs3/build/lib/libs3.a -lcurl -lxml2
-rest_bench_CXXFLAGS += -I$(top_srcdir)/src/libs3/inc
-SUBDIRS += libs3
-endif
-
-endif
+# jni library (java source is in src/java)
-ceph_scratchtool_SOURCES = scratchtool.c
-ceph_scratchtool_LDADD = librados.la $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS)
-ceph_scratchtoolpp_SOURCES = scratchtoolpp.cc
-ceph_scratchtoolpp_LDADD = librados.la $(PTHREAD_LIBS) -lm
-ceph_radosacl_SOURCES = radosacl.cc
-ceph_radosacl_LDADD = librados.la $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS)
-bin_DEBUGPROGRAMS += ceph_scratchtool ceph_scratchtoolpp ceph_radosacl
-
-rbd_SOURCES = rbd.cc common/secret.c common/TextTable.cc common/util.cc
-rbd_CXXFLAGS = ${AM_CXXFLAGS}
-rbd_LDADD = librbd.la librados.la $(LIBGLOBAL_LDA) -lkeyutils
-if LINUX
-bin_PROGRAMS += rbd
-endif
-
-
-ceph_test_crypto_SOURCES = testcrypto.cc
-ceph_test_crypto_LDADD = $(LIBGLOBAL_LDA)
-ceph_test_crypto_CXXFLAGS = ${AM_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_crypto
-
-ceph_test_keys_SOURCES = testkeys.cc
-ceph_test_keys_LDADD = libmon.a $(LIBGLOBAL_LDA)
-ceph_test_keys_CXXFLAGS = ${AM_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_keys
-
-if WITH_TCMALLOC
-ceph_test_keys_LDADD += -ltcmalloc
-ceph_test_keys_CXXFLAGS += ${tcmalloc_safety_flags}
-ceph_test_keys_SOURCES += perfglue/heap_profiler.cc
-endif
-
-
-## rados object classes
-
-radoslibdir = $(libdir)/rados-classes
-radoslib_LTLIBRARIES =
-
-# hello world class
-libcls_hello_la_SOURCES = cls/hello/cls_hello.cc
-libcls_hello_la_CFLAGS = ${AM_CFLAGS}
-libcls_hello_la_CXXFLAGS= ${AM_CXXFLAGS}
-libcls_hello_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
-libcls_hello_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*'
-
-radoslib_LTLIBRARIES += libcls_hello.la
-
-# rbd: rados block device class
-libcls_rbd_la_SOURCES = cls/rbd/cls_rbd.cc
-libcls_rbd_la_CFLAGS = ${AM_CFLAGS}
-libcls_rbd_la_CXXFLAGS= ${AM_CXXFLAGS}
-libcls_rbd_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
-libcls_rbd_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*'
-
-radoslib_LTLIBRARIES += libcls_rbd.la
-
-# lock class
-libcls_lock_la_SOURCES = cls/lock/cls_lock.cc
-libcls_lock_la_CFLAGS = ${AM_CFLAGS}
-libcls_lock_la_CXXFLAGS= ${AM_CXXFLAGS}
-libcls_lock_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
-libcls_lock_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*'
-radoslib_LTLIBRARIES += libcls_lock.la
-
-# refcount class
-libcls_refcount_la_SOURCES = \
- cls/refcount/cls_refcount.cc \
- cls/refcount/cls_refcount_ops.cc \
- common/ceph_json.cc \
- json_spirit/json_spirit_reader.cpp \
- json_spirit/json_spirit_writer.cpp \
- json_spirit/json_spirit_value.cpp
-libcls_refcount_la_CFLAGS = ${AM_CFLAGS}
-libcls_refcount_la_CXXFLAGS= ${AM_CXXFLAGS}
-libcls_refcount_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
-libcls_refcount_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*'
-
-radoslib_LTLIBRARIES += libcls_refcount.la
-
-# version class
-libcls_version_la_SOURCES = cls/version/cls_version.cc
-libcls_version_la_CFLAGS = ${AM_CFLAGS}
-libcls_version_la_CXXFLAGS= ${AM_CXXFLAGS}
-libcls_version_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
-libcls_version_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*'
-
-radoslib_LTLIBRARIES += libcls_version.la
-
-# log class
-libcls_log_la_SOURCES = cls/log/cls_log.cc
-libcls_log_la_CFLAGS = ${AM_CFLAGS}
-libcls_log_la_CXXFLAGS= ${AM_CXXFLAGS}
-libcls_log_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
-libcls_log_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*'
-
-radoslib_LTLIBRARIES += libcls_log.la
-
-libcls_statelog_la_SOURCES = cls/statelog/cls_statelog.cc
-libcls_statelog_la_CFLAGS = ${AM_CFLAGS}
-libcls_statelog_la_CXXFLAGS= ${AM_CXXFLAGS}
-libcls_statelog_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
-libcls_statelog_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*'
-
-radoslib_LTLIBRARIES += libcls_statelog.la
-
-# replica log class
-libcls_replica_log_la_SOURCES = cls/replica_log/cls_replica_log.cc
-libcls_replica_log_la_CFLAGS = ${AM_CFLAGS}
-libcls_replica_log_la_CXXFLAGS= ${AM_CXXFLAGS}
-libcls_replica_log_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
-libcls_replica_log_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*'
-
-radoslib_LTLIBRARIES += libcls_replica_log.la
-
-# rgw: rados gateway
-libcls_rgw_la_SOURCES = \
- cls/rgw/cls_rgw.cc \
- cls/rgw/cls_rgw_ops.cc \
- cls/rgw/cls_rgw_types.cc \
- common/ceph_json.cc \
- json_spirit/json_spirit_reader.cpp \
- json_spirit/json_spirit_writer.cpp \
- json_spirit/json_spirit_value.cpp
-libcls_rgw_la_CFLAGS = ${AM_CFLAGS}
-libcls_rgw_la_CXXFLAGS= ${AM_CXXFLAGS}
-libcls_rgw_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
-libcls_rgw_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*'
-
-radoslib_LTLIBRARIES += libcls_rgw.la
-
-libcls_lock_client_a_SOURCES = \
- cls/lock/cls_lock_client.cc \
- cls/lock/cls_lock_types.cc \
- cls/lock/cls_lock_ops.cc
-noinst_LIBRARIES += libcls_lock_client.a
-
-libcls_refcount_client_a_SOURCES = \
- cls/refcount/cls_refcount_client.cc \
- cls/refcount/cls_refcount_ops.cc
-noinst_LIBRARIES += libcls_refcount_client.a
-
-libcls_version_client_a_SOURCES = \
- cls/version/cls_version_client.cc \
- cls/version/cls_version_types.cc
-noinst_LIBRARIES += libcls_version_client.a
-
-libcls_log_client_a_SOURCES = \
- cls/log/cls_log_client.cc
-noinst_LIBRARIES += libcls_log_client.a
-
-libcls_statelog_client_a_SOURCES = \
- cls/statelog/cls_statelog_client.cc
-noinst_LIBRARIES += libcls_statelog_client.a
-
-libcls_replica_log_client_a_SOURCES = \
- cls/replica_log/cls_replica_log_types.cc \
- cls/replica_log/cls_replica_log_ops.cc \
- cls/replica_log/cls_replica_log_client.cc
-noinst_LIBRARIES += libcls_replica_log_client.a
-
-libcls_rgw_client_a_SOURCES = \
- cls/rgw/cls_rgw_client.cc \
- cls/rgw/cls_rgw_types.cc \
- cls/rgw/cls_rgw_ops.cc
-noinst_LIBRARIES += libcls_rgw_client.a
-
-## hadoop client
-if WITH_HADOOPCLIENT
-JAVA_BASE = /usr/lib/jvm/java-6-sun
-libhadoopcephfs_la_SOURCES = client/hadoop/CephFSInterface.cc
-libhadoopcephfs_la_LIBADD = libcephfs.la
-libhadoopcephfs_la_CFLAGS = ${AM_CFLAGS}
-libhadoopcephfs_la_CXXFLAGS = ${AM_CXXFLAGS}
-libhadoopcephfs_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex 'hadoopcephfs_.*'
-lib_LTLIBRARIES += libhadoopcephfs.la
-
-endif
-
-## CephFS Java Wrappers
-## - The JNI library is here
-## - The Java source Makefile.am is in src/java
if ENABLE_CEPHFS_JAVA
libcephfs_jni_la_SOURCES = \
java/native/libcephfs_jni.cc \
java/native/ScopedLocalRef.h \
java/native/JniConstants.cpp \
java/native/JniConstants.h
-libcephfs_jni_la_LIBADD = libcephfs.la $(EXTRALIBS)
-libcephfs_jni_la_CPPFLAGS = $(JDK_CPPFLAGS)
-libcephfs_jni_la_CXXFLAGS = ${AM_CXXFLAGS}
+libcephfs_jni_la_LIBADD = $(LIBCEPHFS) $(EXTRALIBS)
+libcephfs_jni_la_CPPFLAGS = $(JDK_CPPFLAGS) $(AM_CPPFLAGS)
libcephfs_jni_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0
lib_LTLIBRARIES += libcephfs_jni.la
endif
-## key_value_store classes
-
-# key_value_store: key value store class
-libcls_kvs_la_SOURCES = key_value_store/cls_kvs.cc
-libcls_kvs_la_CFLAGS = ${AM_CFLAGS}
-libcls_kvs_la_CXXFLAGS= ${AM_CXXFLAGS}
-libcls_kvs_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
-libcls_kvs_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*'
-
-radoslib_LTLIBRARIES += libcls_kvs.la
-
-## System tests
-libsystest_la_SOURCES = \
- test/system/cross_process_sem.cc \
- test/system/systest_runnable.cc \
- test/system/systest_settings.cc
-libsystest_la_LIBADD = $(LIBGLOBAL_LDA)
-noinst_LTLIBRARIES += libsystest.la
-
-ceph_test_rados_list_parallel_SOURCES = \
- test/system/rados_list_parallel.cc \
- test/system/st_rados_create_pool.cc \
- test/system/st_rados_list_objects.cc
-ceph_test_rados_list_parallel_LDADD = libsystest.la librados.la $(PTHREAD_LIBS)
-bin_DEBUGPROGRAMS += ceph_test_rados_list_parallel
-
-ceph_test_rados_open_pools_parallel_SOURCES = \
- test/system/rados_open_pools_parallel.cc \
- test/system/st_rados_create_pool.cc
-ceph_test_rados_open_pools_parallel_LDADD = libsystest.la librados.la $(PTHREAD_LIBS)
-bin_DEBUGPROGRAMS += ceph_test_rados_open_pools_parallel
-
-ceph_test_rados_delete_pools_parallel_SOURCES = \
- test/system/rados_delete_pools_parallel.cc \
- test/system/st_rados_create_pool.cc \
- test/system/st_rados_delete_pool.cc \
- test/system/st_rados_list_objects.cc
-ceph_test_rados_delete_pools_parallel_LDADD = libsystest.la librados.la $(PTHREAD_LIBS)
-bin_DEBUGPROGRAMS += ceph_test_rados_delete_pools_parallel
-
-ceph_test_rados_watch_notify_SOURCES = \
- test/system/rados_watch_notify.cc \
- test/system/st_rados_create_pool.cc \
- test/system/st_rados_delete_pool.cc \
- test/system/st_rados_delete_objs.cc \
- test/system/st_rados_watch.cc \
- test/system/st_rados_notify.cc
-ceph_test_rados_watch_notify_LDADD = libsystest.la librados.la $(PTHREAD_LIBS)
-bin_DEBUGPROGRAMS += ceph_test_rados_watch_notify
-
-ceph_bench_log_SOURCES = \
- test/bench_log.cc
-ceph_bench_log_LDADD = libcommon.la $(LIBGLOBAL_LDA)
-bin_DEBUGPROGRAMS += ceph_bench_log
-
-## unit tests
-
-# target to build but not run the unit tests
-unittests:: $(check_PROGRAMS)
-
-UNITTEST_CXXFLAGS = \
- -I$(top_srcdir)/src/gtest/include \
- -I$(top_builddir)/src/gtest/include
-UNITTEST_STATIC_LDADD = \
- $(top_builddir)/src/gtest/lib/libgtest.a \
- $(top_builddir)/src/gtest/lib/libgtest_main.a \
- $(PTHREAD_LIBS)
-UNITTEST_LDADD = ${UNITTEST_STATIC_LDADD}
-
-unittest_encoding_SOURCES = test/encoding.cc
-unittest_encoding_LDADD = libcephfs.la librados.la $(PTHREAD_LIBS) -lm \
- ${UNITTEST_LDADD}
-unittest_encoding_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} \
- -fno-strict-aliasing
-check_PROGRAMS += unittest_encoding
-
-unittest_addrs_SOURCES = test/test_addrs.cc
-unittest_addrs_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-unittest_addrs_LDADD = $(LIBGLOBAL_LDA) ${UNITTEST_LDADD}
-check_PROGRAMS += unittest_addrs
-
-unittest_sharedptr_registry_SOURCES = test/common/test_sharedptr_registry.cc
-unittest_sharedptr_registry_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-unittest_sharedptr_registry_LDADD = libcommon.la ${LIBGLOBAL_LDA} ${UNITTEST_LDADD}
-check_PROGRAMS += unittest_sharedptr_registry
-
-unittest_util_SOURCES = test/common/test_util.cc common/util.cc
-unittest_util_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-unittest_util_LDADD = libcommon.la $(PTHREAD_LIBS) -lm ${UNITTEST_LDADD} $(CRYPTO_LIBS) $(EXTRALIBS)
-check_PROGRAMS += unittest_util
-
-unittest_workqueue_SOURCES = test/test_workqueue.cc
-unittest_workqueue_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-unittest_workqueue_LDADD = $(LIBGLOBAL_LDA) ${UNITTEST_LDADD}
-check_PROGRAMS += unittest_workqueue
-
-unittest_striper_SOURCES = test/test_striper.cc
-unittest_striper_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-unittest_striper_LDADD = libosdc.la $(LIBGLOBAL_LDA) ${UNITTEST_LDADD}
-check_PROGRAMS += unittest_striper
-
-unittest_prebufferedstreambuf_SOURCES = test/test_prebufferedstreambuf.cc common/PrebufferedStreambuf.cc
-unittest_prebufferedstreambuf_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-unittest_prebufferedstreambuf_LDADD = ${UNITTEST_LDADD} $(EXTRALIBS)
-check_PROGRAMS += unittest_prebufferedstreambuf
-
-unittest_str_list_SOURCES = test/test_str_list.cc
-unittest_str_list_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-unittest_str_list_LDADD = $(LIBGLOBAL_LDA) ${UNITTEST_LDADD}
-check_PROGRAMS += unittest_str_list
-
-unittest_log_SOURCES = log/test.cc common/PrebufferedStreambuf.cc
-unittest_log_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
-unittest_log_LDADD = libcommon.la ${UNITTEST_LDADD}
-unittest_log_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -O2
-check_PROGRAMS += unittest_log
-
-unittest_throttle_SOURCES = test/common/Throttle.cc
-unittest_throttle_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
-unittest_throttle_LDADD = libcommon.la ${LIBGLOBAL_LDA} ${UNITTEST_LDADD}
-unittest_throttle_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -O2
-check_PROGRAMS += unittest_throttle
-
-unittest_base64_SOURCES = test/base64.cc
-unittest_base64_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
-unittest_base64_LDADD = libcephfs.la -lm ${UNITTEST_LDADD}
-unittest_base64_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_base64
-
-unittest_ceph_argparse_SOURCES = test/ceph_argparse.cc
-unittest_ceph_argparse_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
-unittest_ceph_argparse_LDADD = $(LIBGLOBAL_LDA) ${UNITTEST_LDADD}
-unittest_ceph_argparse_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_ceph_argparse
-
-unittest_osd_types_SOURCES = test/test_osd_types.cc
-unittest_osd_types_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-unittest_osd_types_LDADD = $(LIBGLOBAL_LDA) libcommon.la ${UNITTEST_LDADD}
-check_PROGRAMS += unittest_osd_types
-
-unittest_pglog_SOURCES = test/osd/TestPGLog.cc perfglue/disabled_heap_profiler.cc
-unittest_pglog_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-unittest_pglog_LDADD = libosd.a $(LIBOS_LDA) $(LIBGLOBAL_LDA) ${UNITTEST_LDADD}
-check_PROGRAMS += unittest_pglog
-
-if LINUX
-unittest_pglog_LDADD += -ldl
-endif
-
-unittest_gather_SOURCES = test/gather.cc
-unittest_gather_LDADD = ${LIBGLOBAL_LDA} ${UNITTEST_LDADD}
-unittest_gather_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_gather
-
-unittest_run_cmd_SOURCES = test/run_cmd.cc
-unittest_run_cmd_LDADD = libcephfs.la ${UNITTEST_LDADD}
-unittest_run_cmd_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_run_cmd
-
-unittest_signals_SOURCES = test/signals.cc
-unittest_signals_LDADD = ${LIBGLOBAL_LDA} ${UNITTEST_LDADD}
-unittest_signals_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_signals
-
-unittest_simple_spin_SOURCES = test/simple_spin.cc
-unittest_simple_spin_LDADD = libcephfs.la ${UNITTEST_LDADD}
-unittest_simple_spin_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_simple_spin
-
-unittest_librados_SOURCES = test/librados/librados.cc
-unittest_librados_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
-unittest_librados_LDADD = librados.la ${UNITTEST_LDADD}
-unittest_librados_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_librados
-
-unittest_bufferlist_SOURCES = test/bufferlist.cc
-unittest_bufferlist_LDADD = ${UNITTEST_LDADD} $(LIBGLOBAL_LDA)
-unittest_bufferlist_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_bufferlist
-
-unittest_crypto_SOURCES = test/crypto.cc
-unittest_crypto_LDFLAGS = ${CRYPTO_LDFLAGS} ${AM_LDFLAGS}
-unittest_crypto_LDADD = ${LIBGLOBAL_LDA} ${UNITTEST_LDADD}
-unittest_crypto_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_crypto
-
-unittest_perf_counters_SOURCES = test/perf_counters.cc
-unittest_perf_counters_LDFLAGS = ${AM_LDFLAGS}
-unittest_perf_counters_LDADD = ${LIBGLOBAL_LDA} ${UNITTEST_LDADD}
-unittest_perf_counters_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_perf_counters
-
-unittest_admin_socket_SOURCES = test/admin_socket.cc
-unittest_admin_socket_LDFLAGS = ${AM_LDFLAGS}
-unittest_admin_socket_LDADD = ${LIBGLOBAL_LDA} ${UNITTEST_LDADD} libcommon.la
-unittest_admin_socket_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_admin_socket
-
-unittest_ceph_crypto_SOURCES = test/ceph_crypto.cc
-unittest_ceph_crypto_LDFLAGS = ${CRYPTO_LDFLAGS} ${AM_LDFLAGS}
-unittest_ceph_crypto_LDADD = ${UNITTEST_LDADD} $(LIBGLOBAL_LDA)
-unittest_ceph_crypto_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_ceph_crypto
-
-unittest_utf8_SOURCES = test/utf8.cc
-unittest_utf8_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
-unittest_utf8_LDADD = ${UNITTEST_LDADD} $(LIBGLOBAL_LDA)
-unittest_utf8_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_utf8
-
-unittest_mime_SOURCES = test/mime.cc
-unittest_mime_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
-unittest_mime_LDADD = ${UNITTEST_LDADD} $(LIBGLOBAL_LDA)
-unittest_mime_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_mime
-
-unittest_escape_SOURCES = test/escape.cc
-unittest_escape_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
-unittest_escape_LDADD = ${UNITTEST_LDADD} $(LIBGLOBAL_LDA)
-unittest_escape_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_escape
-
-unittest_chain_xattr_SOURCES = test/filestore/chain_xattr.cc
-unittest_chain_xattr_LDFLAGS = ${AM_LDFLAGS}
-unittest_chain_xattr_LDADD = ${UNITTEST_STATIC_LDADD} $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-unittest_chain_xattr_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} ${CRYPTO_CXXFLAGS}
-check_PROGRAMS += unittest_chain_xattr
-
-unittest_flatindex_SOURCES = test/os/TestFlatIndex.cc
-unittest_flatindex_LDFLAGS = ${AM_LDFLAGS}
-unittest_flatindex_LDADD = ${UNITTEST_STATIC_LDADD} $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-unittest_flatindex_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} ${CRYPTO_CXXFLAGS}
-check_PROGRAMS += unittest_flatindex
-
-unittest_strtol_SOURCES = test/strtol.cc
-unittest_strtol_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
-unittest_strtol_LDADD = ${UNITTEST_LDADD} $(LIBGLOBAL_LDA)
-unittest_strtol_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_strtol
-
-unittest_confutils_SOURCES = test/confutils.cc
-unittest_confutils_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
-unittest_confutils_LDADD = ${UNITTEST_LDADD} $(LIBGLOBAL_LDA)
-unittest_confutils_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_confutils
-
-unittest_heartbeatmap_SOURCES = test/heartbeat_map.cc common/HeartbeatMap.cc
-unittest_heartbeatmap_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
-unittest_heartbeatmap_LDADD = ${UNITTEST_LDADD} $(LIBGLOBAL_LDA)
-unittest_heartbeatmap_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_heartbeatmap
-
-unittest_formatter_SOURCES = test/formatter.cc rgw/rgw_formats.cc
-unittest_formatter_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
-unittest_formatter_LDADD = ${UNITTEST_LDADD} $(LIBGLOBAL_LDA)
-unittest_formatter_CXXFLAGS = ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_formatter
-
-unittest_libcephfs_config_SOURCES = test/libcephfs_config.cc
-unittest_libcephfs_config_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
-unittest_libcephfs_config_LDADD = libcephfs.la ${UNITTEST_LDADD}
-unittest_libcephfs_config_CXXFLAGS = ${CRYPTO_CFLAGS} ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_libcephfs_config
-
-unittest_lfnindex_SOURCES = test/os/TestLFNIndex.cc
-unittest_lfnindex_LDFLAGS = ${AM_LDFLAGS}
-unittest_lfnindex_LDADD = ${UNITTEST_STATIC_LDADD} $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-unittest_lfnindex_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} ${CRYPTO_CXXFLAGS}
-check_PROGRAMS += unittest_lfnindex
-
-unittest_librados_config_SOURCES = test/librados/librados_config.cc
-unittest_librados_config_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
-unittest_librados_config_LDADD = librados.la ${UNITTEST_LDADD}
-unittest_librados_config_CXXFLAGS = ${CRYPTO_CFLAGS} ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_librados_config
-
-#unittest_librgw_link_SOURCES = test/librgw_link.cc
-#unittest_librgw_link_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
-#unittest_librgw_link_LDADD = librgw.la ${UNITTEST_LDADD}
-#unittest_librgw_link_CXXFLAGS = ${CRYPTO_CFLAGS} ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-#check_PROGRAMS += unittest_librgw_link
-
-unittest_daemon_config_SOURCES = test/daemon_config.cc
-unittest_daemon_config_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
-unittest_daemon_config_LDADD = ${UNITTEST_LDADD} ${LIBGLOBAL_LDA}
-unittest_daemon_config_CXXFLAGS = ${CRYPTO_CFLAGS} ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_daemon_config
-
-unittest_osd_osdcap_SOURCES = test/osd/osdcap.cc osd/OSDCap.cc
-unittest_osd_osdcap_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
-unittest_osd_osdcap_LDADD = ${UNITTEST_LDADD} ${LIBGLOBAL_LDA}
-unittest_osd_osdcap_CXXFLAGS = ${CRYPTO_CFLAGS} ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_osd_osdcap
-
-unittest_mon_moncap_SOURCES = test/mon/moncap.cc mon/MonCap.cc
-unittest_mon_moncap_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
-unittest_mon_moncap_LDADD = ${UNITTEST_LDADD} ${LIBGLOBAL_LDA}
-unittest_mon_moncap_CXXFLAGS = ${CRYPTO_CFLAGS} ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_mon_moncap
-
-#if WITH_RADOSGW
-#unittest_librgw_SOURCES = test/librgw.cc
-#unittest_librgw_LDFLAGS = -lrt $(PTHREAD_CFLAGS) -lcurl ${AM_LDFLAGS}
-#unittest_librgw_LDADD = librgw.la librados.la ${UNITTEST_LDADD} -lexpat $(LIBGLOBAL_LDA)
-#unittest_librgw_CXXFLAGS = ${CRYPTO_CFLAGS} ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-#check_PROGRAMS += unittest_librgw
-#endif
-
-unittest_ipaddr_SOURCES = test/test_ipaddr.cc
-unittest_ipaddr_LDADD = ${UNITTEST_LDADD} $(LIBGLOBAL_LDA)
-unittest_ipaddr_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_ipaddr
-
-unittest_texttable_SOURCES = test/test_texttable.cc common/TextTable.cc
-unittest_texttable_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
-unittest_texttable_LDADD = librados.la ${UNITTEST_LDADD}
-unittest_texttable_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-check_PROGRAMS += unittest_texttable
-
-if WITH_RADOSGW
-ceph_test_cors_SOURCES = test/test_cors.cc
-ceph_test_cors_LDADD = librados.la librgw.a $(LIBGLOBAL_LDA) ${UNITTEST_LDADD} ${UNITTEST_STATIC_LDADD} -lcurl -luuid -lexpat
-ceph_test_cors_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_cors
-
-ceph_test_cls_rgw_meta_SOURCES = test/test_rgw_admin_meta.cc
-ceph_test_cls_rgw_meta_LDADD = librgw.a ${UNITTEST_LDADD} ${UNITTEST_STATIC_LDADD} $(LIBGLOBAL_LDA) $(CRYPTO_LIBS) -lcurl -luuid -lexpat librados.la libcls_version_client.a \
- libcls_log_client.a libcls_statelog_client.a libcls_refcount_client.a libcls_rgw_client.a libcls_lock_client.a
-ceph_test_cls_rgw_meta_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_cls_rgw_meta
-
-ceph_test_cls_rgw_log_SOURCES = test/test_rgw_admin_log.cc
-ceph_test_cls_rgw_log_LDADD = librgw.a ${UNITTEST_LDADD} ${UNITTEST_STATIC_LDADD} ${LIBGLOBAL_LDA} $(CRYPTO_LIBS) -lcurl -luuid -lexpat librados.la libcls_version_client.a \
- libcls_log_client.a libcls_statelog_client.a libcls_refcount_client.a libcls_rgw_client.a libcls_lock_client.a
-ceph_test_cls_rgw_log_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_cls_rgw_log
-
-ceph_test_cls_rgw_opstate_SOURCES = test/test_rgw_admin_opstate.cc
-ceph_test_cls_rgw_opstate_LDADD = librgw.a ${UNITTEST_LDADD} ${UNITTEST_STATIC_LDADD} ${LIBGLOBAL_LDA} $(CRYPTO_LIBS) -lcurl -luuid -lexpat librados.la libcls_version_client.a \
- libcls_log_client.a libcls_statelog_client.a libcls_refcount_client.a libcls_rgw_client.a libcls_lock_client.a
-ceph_test_cls_rgw_opstate_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_cls_rgw_opstate
-endif
-
-ceph_test_librbd_SOURCES = test/librbd/test_librbd.cc test/librados/test.cc
-ceph_test_librbd_LDADD = librbd.la librados.la ${UNITTEST_STATIC_LDADD}
-ceph_test_librbd_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_librbd
-
-ceph_test_librbd_fsx_SOURCES = test/librbd/fsx.c
-ceph_test_librbd_fsx_LDADD = librbd.la librados.la -lm
-ceph_test_librbd_fsx_CFLAGS = ${AM_CFLAGS} -Wno-format
-bin_DEBUGPROGRAMS += ceph_test_librbd_fsx
-
-ceph_test_cls_rbd_SOURCES = test/cls_rbd/test_cls_rbd.cc \
- test/librados/test.cc \
- cls/rbd/cls_rbd_client.cc \
- cls/lock/cls_lock_client.cc \
- cls/lock/cls_lock_types.cc \
- cls/lock/cls_lock_ops.cc
-ceph_test_cls_rbd_LDADD = librados.la ${UNITTEST_STATIC_LDADD}
-ceph_test_cls_rbd_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_cls_rbd
-
-ceph_test_cls_refcount_SOURCES = test/cls_refcount/test_cls_refcount.cc \
- test/librados/test.cc
-ceph_test_cls_refcount_LDADD = librados.la libcls_refcount_client.a ${UNITTEST_STATIC_LDADD}
-ceph_test_cls_refcount_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_cls_refcount
-
-ceph_test_cls_version_SOURCES = test/cls_version/test_cls_version.cc \
- test/librados/test.cc
-ceph_test_cls_version_LDADD = librados.la libcls_version_client.a ${UNITTEST_STATIC_LDADD}
-ceph_test_cls_version_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_cls_version
-
-ceph_test_cls_log_SOURCES = test/cls_log/test_cls_log.cc \
- test/librados/test.cc
-ceph_test_cls_log_LDADD = librados.la libcls_log_client.a ${UNITTEST_STATIC_LDADD} ${LIBGLOBAL_LDA}
-ceph_test_cls_log_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_cls_log
-
-ceph_test_cls_statelog_SOURCES = test/cls_statelog/test_cls_statelog.cc \
- test/librados/test.cc
-ceph_test_cls_statelog_LDADD = librados.la libcls_statelog_client.a ${UNITTEST_STATIC_LDADD} ${LIBGLOBAL_LDA}
-ceph_test_cls_statelog_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_cls_statelog
-ceph_test_cls_replica_log_SOURCES = \
- test/cls_replica_log/test_cls_replica_log.cc \
- test/librados/test.cc
-ceph_test_cls_replica_log_LDADD = librados.la \
- libcls_replica_log_client.a ${UNITTEST_STATIC_LDADD} ${LIBGLOBAL_LDA}
-ceph_test_cls_replica_log_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_cls_replica_log
-
-ceph_test_cls_lock_SOURCES = test/cls_lock/test_cls_lock.cc test/librados/test.cc
-ceph_test_cls_lock_LDFLAGS = ${AM_LDFLAGS}
-ceph_test_cls_lock_LDADD = libcls_lock_client.a librados.la ${UNITTEST_STATIC_LDADD}
-ceph_test_cls_lock_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_cls_lock
-
-ceph_test_cls_hello_SOURCES = test/cls_hello/test_cls_hello.cc test/librados/test.cc
-ceph_test_cls_hello_LDADD = ${UNITTEST_LDADD} ${UNITTEST_STATIC_LDADD} $(LIBGLOBAL_LDA) $(CRYPTO_LIBS) librados.la
-ceph_test_cls_hello_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_cls_hello
-
-if WITH_RADOSGW
-
-ceph_test_cls_rgw_SOURCES = test/cls_rgw/test_cls_rgw.cc \
- test/librados/test.cc
-ceph_test_cls_rgw_LDADD = librados.la libcls_rgw_client.a ${UNITTEST_STATIC_LDADD}
-ceph_test_cls_rgw_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_cls_rgw
-
-endif
-
-ceph_test_mon_workloadgen_SOURCES = \
- test/mon/test_mon_workloadgen.cc \
- osdc/Objecter.cc \
- osdc/Striper.cc
-ceph_test_mon_workloadgen_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-ceph_test_mon_workloadgen_CXXFLAGS = ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_mon_workloadgen
-
-ceph_test_rados_api_cmd_SOURCES = test/librados/cmd.cc test/librados/test.cc
-ceph_test_rados_api_cmd_LDFLAGS = ${AM_LDFLAGS}
-ceph_test_rados_api_cmd_LDADD = librados.la ${UNITTEST_STATIC_LDADD}
-ceph_test_rados_api_cmd_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_rados_api_cmd
-
-ceph_test_rados_api_io_SOURCES = test/librados/io.cc test/librados/test.cc
-ceph_test_rados_api_io_LDFLAGS = ${AM_LDFLAGS}
-ceph_test_rados_api_io_LDADD = librados.la ${UNITTEST_STATIC_LDADD}
-ceph_test_rados_api_io_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_rados_api_io
-
-ceph_test_rados_api_aio_SOURCES = test/librados/aio.cc test/librados/test.cc
-ceph_test_rados_api_aio_LDFLAGS = ${AM_LDFLAGS}
-ceph_test_rados_api_aio_LDADD = librados.la ${UNITTEST_STATIC_LDADD}
-ceph_test_rados_api_aio_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_rados_api_aio
-
-ceph_test_rados_api_list_SOURCES = test/librados/list.cc test/librados/test.cc
-ceph_test_rados_api_list_LDFLAGS = ${AM_LDFLAGS}
-ceph_test_rados_api_list_LDADD = librados.la ${UNITTEST_STATIC_LDADD}
-ceph_test_rados_api_list_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_rados_api_list
-
-ceph_test_rados_api_pool_SOURCES = test/librados/pool.cc test/librados/test.cc
-ceph_test_rados_api_pool_LDFLAGS = ${AM_LDFLAGS}
-ceph_test_rados_api_pool_LDADD = librados.la ${UNITTEST_STATIC_LDADD}
-ceph_test_rados_api_pool_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_rados_api_pool
-
-ceph_test_rados_api_stat_SOURCES = test/librados/stat.cc test/librados/test.cc
-ceph_test_rados_api_stat_LDFLAGS = ${AM_LDFLAGS}
-ceph_test_rados_api_stat_LDADD = librados.la ${UNITTEST_STATIC_LDADD}
-ceph_test_rados_api_stat_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_rados_api_stat
-
-ceph_test_rados_api_watch_notify_SOURCES = test/librados/watch_notify.cc test/librados/test.cc
-ceph_test_rados_api_watch_notify_LDFLAGS = ${AM_LDFLAGS}
-ceph_test_rados_api_watch_notify_LDADD = librados.la ${UNITTEST_STATIC_LDADD}
-ceph_test_rados_api_watch_notify_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_rados_api_watch_notify
-
-ceph_test_rados_api_snapshots_SOURCES = test/librados/snapshots.cc test/librados/test.cc
-ceph_test_rados_api_snapshots_LDFLAGS = ${AM_LDFLAGS}
-ceph_test_rados_api_snapshots_LDADD = librados.la ${UNITTEST_STATIC_LDADD}
-ceph_test_rados_api_snapshots_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_rados_api_snapshots
-
-ceph_test_rados_api_cls_SOURCES = test/librados/cls.cc test/librados/test.cc
-ceph_test_rados_api_cls_LDFLAGS = ${AM_LDFLAGS}
-ceph_test_rados_api_cls_LDADD = librados.la ${UNITTEST_STATIC_LDADD}
-ceph_test_rados_api_cls_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_rados_api_cls
-
-ceph_test_rados_api_misc_SOURCES = test/librados/misc.cc test/librados/test.cc
-ceph_test_rados_api_misc_LDFLAGS = ${AM_LDFLAGS}
-ceph_test_rados_api_misc_LDADD = librados.la $(LIBGLOBAL_LDA) ${UNITTEST_STATIC_LDADD}
-ceph_test_rados_api_misc_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_rados_api_misc
-
-ceph_test_rados_api_lock_SOURCES = test/librados/lock.cc test/librados/test.cc
-ceph_test_rados_api_lock_LDFLAGS = ${AM_LDFLAGS}
-ceph_test_rados_api_lock_LDADD = librados.la ${UNITTEST_STATIC_LDADD}
-ceph_test_rados_api_lock_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_rados_api_lock
-
-ceph_test_libcephfs_SOURCES = test/libcephfs/test.cc test/libcephfs/readdir_r_cb.cc test/libcephfs/caps.cc test/libcephfs/multiclient.cc
-ceph_test_libcephfs_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
-ceph_test_libcephfs_LDADD = ${UNITTEST_STATIC_LDADD} libcephfs.la
-ceph_test_libcephfs_CXXFLAGS = $(AM_CXXFLAGS) ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_libcephfs
-
-ceph_test_filestore_SOURCES = test/filestore/store_test.cc
-ceph_test_filestore_LDFLAGS = ${AM_LDFLAGS}
-ceph_test_filestore_LDADD = ${UNITTEST_STATIC_LDADD} $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-ceph_test_filestore_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} ${CRYPTO_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_filestore
-
-ceph_test_filestore_workloadgen_SOURCES = \
- test/filestore/workload_generator.cc \
- test/filestore/TestFileStoreState.cc
-ceph_test_filestore_workloadgen_LDFLAGS = ${AM_LDFLAGS}
-ceph_test_filestore_workloadgen_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-ceph_test_filestore_workloadgen_CXXFLAGS = ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_filestore_workloadgen
-
-ceph_test_filestore_idempotent_SOURCES = test/filestore/test_idempotent.cc test/filestore/FileStoreTracker.cc test/common/ObjectContents.cc
-ceph_test_filestore_idempotent_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-ceph_test_filestore_idempotent_CXXFLAGS = $(AM_CXXFLAGS)
-bin_DEBUGPROGRAMS += ceph_test_filestore_idempotent
-
-ceph_test_filestore_idempotent_sequence_SOURCES = \
- test/filestore/test_idempotent_sequence.cc \
- test/filestore/DeterministicOpSequence.cc \
- test/filestore/TestFileStoreState.cc \
- test/filestore/FileStoreDiff.cc
-ceph_test_filestore_idempotent_sequence_CXXFLAGS = ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS}
-ceph_test_filestore_idempotent_sequence_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-bin_DEBUGPROGRAMS += ceph_test_filestore_idempotent_sequence
-
-ceph_xattr_bench_SOURCES = test/xattr_bench.cc
-ceph_xattr_bench_LDFLAGS = ${AM_LDFLAGS}
-ceph_xattr_bench_LDADD = ${UNITTEST_STATIC_LDADD} $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-ceph_xattr_bench_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} ${CRYPTO_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_xattr_bench
-
-ceph_test_filejournal_SOURCES = test/test_filejournal.cc
-ceph_test_filejournal_LDFLAGS = ${AM_LDFLAGS}
-ceph_test_filejournal_LDADD = ${UNITTEST_STATIC_LDADD} $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-ceph_test_filejournal_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_filejournal
-
-ceph_test_stress_watch_SOURCES = test/test_stress_watch.cc test/librados/test.cc
-ceph_test_stress_watch_LDFLAGS = ${AM_LDFLAGS}
-ceph_test_stress_watch_LDADD = librados.la ${UNITTEST_STATIC_LDADD}
-ceph_test_stress_watch_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_stress_watch
-
-ceph_test_objectcacher_stress_SOURCES = test/osdc/object_cacher_stress.cc test/osdc/FakeWriteback.cc osdc/ObjectCacher.cc
-ceph_test_objectcacher_stress_LDFLAGS = ${AM_LDFLAGS}
-ceph_test_objectcacher_stress_LDADD = $(LIBGLOBAL_LDA)
-ceph_test_objectcacher_stress_CXXFLAGS = ${AM_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_objectcacher_stress
-
-ceph_test_snap_mapper_SOURCES = test/test_snap_mapper.cc osd/SnapMapper.cc
-ceph_test_snap_mapper_LDFLAGS = ${AM_LDFLAGS}
-ceph_test_snap_mapper_LDADD = ${UNITTEST_STATIC_LDADD} $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-ceph_test_snap_mapper_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} $(LEVELDB_INCLUDE) ${CRYPTO_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_snap_mapper
-
-ceph_test_object_map_SOURCES = test/ObjectMap/test_object_map.cc test/ObjectMap/KeyValueDBMemory.cc os/DBObjectMap.cc os/LevelDBStore.cc
-ceph_test_object_map_LDFLAGS = ${AM_LDFLAGS}
-ceph_test_object_map_LDADD = ${UNITTEST_STATIC_LDADD} $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-ceph_test_object_map_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} ${CRYPTO_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_object_map
-
-ceph_test_keyvaluedb_atomicity_SOURCES = test/ObjectMap/test_keyvaluedb_atomicity.cc os/LevelDBStore.cc
-ceph_test_keyvaluedb_atomicity_LDFLAGS = ${AM_LDFLAGS}
-ceph_test_keyvaluedb_atomicity_LDADD = ${UNITTEST_STATIC_LDADD} $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-ceph_test_keyvaluedb_atomicity_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} ${CRYPTO_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_keyvaluedb_atomicity
-
-ceph_test_keyvaluedb_iterators_SOURCES = test/ObjectMap/test_keyvaluedb_iterators.cc \
- test/ObjectMap/KeyValueDBMemory.cc \
- os/LevelDBStore.cc
-ceph_test_keyvaluedb_iterators_LDFLAGS = ${AM_LDFLAGS}
-ceph_test_keyvaluedb_iterators_LDADD = ${UNITTEST_STATIC_LDADD} $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-ceph_test_keyvaluedb_iterators_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} ${CRYPTO_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_keyvaluedb_iterators
-
-ceph_test_store_tool_SOURCES = test/ObjectMap/test_store_tool/test_store_tool.cc \
- os/LevelDBStore.cc
-ceph_test_store_tool_LDFLAGS = ${AM_LDFLAGS}
-ceph_test_store_tool_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA)
-ceph_test_store_tool_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_store_tool
-
-ceph_test_cfuse_cache_invalidate_SOURCES = test/test_cfuse_cache_invalidate.cc
-ceph_test_cfuse_cache_invalidate_LDFLAGS = ${AM_LDFLAGS}
-ceph_test_cfuse_cache_invalidate_LDADD =
-ceph_test_cfuse_cache_invalidate_CXXFLAGS = ${AM_CXXFLAGS}
-bin_DEBUGPROGRAMS += ceph_test_cfuse_cache_invalidate
# shell scripts
+
editpaths = sed \
-e 's|@bindir[@]|$(bindir)|g' \
-e 's|@sbindir[@]|$(sbindir)|g' \
@@ -1289,11 +128,8 @@ editpaths = sed \
-e 's|@datadir[@]|$(pkgdatadir)|g' \
-e 's|@prefix[@]|$(prefix)|g' \
-e 's|@@GCOV_PREFIX_STRIP[@][@]|$(GCOV_PREFIX_STRIP)|g'
-
-shell_scripts = init-ceph mkcephfs ceph-debugpack ceph-coverage ceph-post-file
-
+shell_scripts = ceph-debugpack ceph-post-file
$(shell_scripts): Makefile
-
$(shell_scripts): %: %.in
rm -f $@ $@.tmp
$(editpaths) '$(srcdir)/$@.in' >$@.tmp
@@ -1301,77 +137,26 @@ $(shell_scripts): %: %.in
chmod a-w $@.tmp
mv $@.tmp $@
-BUILT_SOURCES += init-ceph
-sbin_SCRIPTS += mkcephfs
-
-bin_SCRIPTS += ceph-coverage
-
+EXTRA_DIST += $(srcdir)/$(shell_scripts:%=%.in)
CLEANFILES += $(shell_scripts)
-CLEANFILES += ceph_ver.h sample.fetch_config
-
-##
-
-AM_COMMON_FLAGS = \
- -D__CEPH__ \
- -D_FILE_OFFSET_BITS=64 \
- -D_REENTRANT \
- -D_THREAD_SAFE \
- -D__STDC_FORMAT_MACROS \
- -D_GNU_SOURCE \
- -rdynamic \
- -Wall \
- ${WARN_TYPE_LIMITS} \
- ${WARN_IGNORED_QUALIFIERS} \
- -Winit-self \
- -Wpointer-arith \
- -Werror=format-security \
- -fno-strict-aliasing \
- -fsigned-char
-
-AM_CFLAGS = $(AM_COMMON_FLAGS)
-AM_CXXFLAGS = \
- @AM_CXXFLAGS@ \
- $(AM_COMMON_FLAGS) \
- -DCEPH_LIBDIR=\"${libdir}\" \
- -Wnon-virtual-dtor \
- -Wno-invalid-offsetof \
- -Wstrict-null-sentinel
-# note: this is position dependant, it affects the -l options that
-# come after it on the command line. when you use ${AM_LDFLAGS} in
-# later rules, take care where you place it. for more information, see
-# http://blog.flameeyes.eu/2008/11/19/relationship-between-as-needed-and-no-undefined-part-1-what-do-they-do
-# http://blog.flameeyes.eu/2008/11/20/misguided-link-and-as-needed
-# http://www.gentoo.org/proj/en/qa/asneeded.xml
-# http://gcc.gnu.org/ml/gcc-help/2010-12/msg00338.html
-# http://sigquit.wordpress.com/2011/02/16/why-asneeded-doesnt-work-as-expected-for-your-libraries-on-your-autotools-project/
-AM_LDFLAGS = -Wl,--as-needed
-
-if USE_BOOST_SPIRIT_OLD_HDR
-AM_CXXFLAGS += -DUSE_BOOST_SPIRIT_OLD_HDR
-endif
-
-if WITH_LIBATOMIC
-AM_LDFLAGS += -latomic_ops
-endif
-
-if ENABLE_COVERAGE
-AM_CFLAGS += -fprofile-arcs -ftest-coverage
-AM_CXXFLAGS += -fprofile-arcs -ftest-coverage -O0
-EXTRALIBS += -lgcov
-endif
# extra bits
+
EXTRA_DIST += \
- $(srcdir)/verify-mds-journal.sh $(srcdir)/vstart.sh $(srcdir)/stop.sh \
- ceph-run $(srcdir)/ceph_common.sh \
+ $(srcdir)/verify-mds-journal.sh \
+ $(srcdir)/vstart.sh \
+ $(srcdir)/stop.sh \
+ ceph-run \
+ $(srcdir)/ceph_common.sh \
$(srcdir)/init-radosgw \
$(srcdir)/init-radosgw.sysv \
$(srcdir)/init-rbdmap \
- $(srcdir)/ceph-clsinfo $(srcdir)/make_version $(srcdir)/check_version \
+ $(srcdir)/ceph-clsinfo \
+ $(srcdir)/make_version \
+ $(srcdir)/check_version \
$(srcdir)/.git_version \
$(srcdir)/ceph-rbdnamer \
- $(ceph_tool_gui_DATA) \
$(srcdir)/test/encoding/readable.sh \
$(srcdir)/test/encoding/check-generated.sh \
$(srcdir)/upstart/ceph-all.conf \
@@ -1394,21 +179,39 @@ EXTRA_DIST += \
ceph-disk-activate \
ceph-disk-udev \
ceph-create-keys \
- mount.fuse.ceph \
ceph-rest-api \
mount.fuse.ceph \
- rbdmap
+ rbdmap \
+ unittest_bufferlist.sh \
+ yasm-wrapper
-EXTRA_DIST += $(srcdir)/$(shell_scripts:%=%.in)
+EXTRA_DIST += \
+ libs3/COPYING \
+ libs3/ChangeLog \
+ libs3/GNUmakefile \
+ libs3/GNUmakefile.mingw \
+ libs3/GNUmakefile.osx \
+ libs3/INSTALL \
+ libs3/LICENSE \
+ libs3/README \
+ libs3/TODO \
+ libs3/archlinux \
+ libs3/debian \
+ libs3/doxyfile \
+ libs3/inc \
+ libs3/libs3.spec \
+ libs3/mswin \
+ libs3/src \
+ libs3/test \
+ unittest_bufferlist.sh
# work around old versions of automake that don't define $docdir
# NOTE: this won't work on suse, where docdir is /usr/share/doc/packages/$package.
docdir ?= ${datadir}/doc/ceph
-
doc_DATA = $(srcdir)/sample.ceph.conf sample.fetch_config
-sample.fetch_config: fetch_config
- cp -f $(srcdir)/fetch_config ./sample.fetch_config
+
+# various scripts
shell_commondir = $(libdir)/ceph
shell_common_SCRIPTS = ceph_common.sh
@@ -1419,31 +222,54 @@ bash_completion_DATA = $(srcdir)/bash_completion/ceph \
$(srcdir)/bash_completion/rbd \
$(srcdir)/bash_completion/radosgw-admin
-libcephfs_includedir = $(includedir)/cephfs
-libcephfs_include_DATA = $(srcdir)/include/cephfs/libcephfs.h
-
-librbd_includedir = $(includedir)/rbd
-librbd_include_DATA = \
- $(srcdir)/include/rbd/features.h \
- $(srcdir)/include/rbd/librbd.h \
- $(srcdir)/include/rbd/librbd.hpp
-
-rados_includedir = $(includedir)/rados
-rados_include_DATA = \
- $(srcdir)/include/rados/librados.h \
- $(srcdir)/include/rados/rados_types.h \
- $(srcdir)/include/rados/rados_types.hpp \
- $(srcdir)/include/rados/librados.hpp \
- $(srcdir)/include/buffer.h \
- $(srcdir)/include/page.h \
- $(srcdir)/include/crc32c.h
-
-#crush_includedir = $(includedir)/crush
-#crush_include_DATA = \
-# $(srcdir)/crush/hash.h \
-# $(srcdir)/crush/crush.h \
-# $(srcdir)/crush/mapper.h \
-# $(srcdir)/crush/types.h
+ceph_sbin_SCRIPTS = \
+ ceph-disk \
+ ceph-disk-prepare \
+ ceph-disk-activate \
+ ceph-disk-udev \
+ ceph-create-keys
+
+bin_SCRIPTS += \
+ ceph \
+ ceph-run \
+ ceph-rest-api \
+ ceph-clsinfo \
+ ceph-debugpack \
+ ceph-rbdnamer \
+ ceph-post-file
+
+BUILT_SOURCES += init-ceph
+sbin_SCRIPTS += mkcephfs
+
+shell_scripts += init-ceph mkcephfs
+
+
+
+
+
+# tests to actually run on "make check"; if you need extra, non-test,
+# executables built, you need to replace this with manual assignments
+# target by target
+
+TESTS = \
+ $(check_PROGRAMS) \
+ $(check_SCRIPTS)
+
+check-local:
+ $(srcdir)/test/encoding/readable.sh ../ceph-object-corpus
+
+
+# base targets
+
+core-daemons: ceph-mon ceph-osd ceph-mds radosgw
+admin-tools: monmaptool osdmaptool crushtool ceph-authtool
+base: core-daemons admin-tools \
+ cephfs ceph-syn ceph-conf \
+ rados librados-config \
+ init-ceph mkcephfs ceph_mon_store_converter ceph-post-file
+
+
+# version stuff
FORCE:
.git_version: FORCE
@@ -1455,304 +281,37 @@ ceph_ver.c: ./ceph_ver.h
common/version.cc: ./ceph_ver.h
test/encoding/ceph_dencoder.cc: ./ceph_ver.h
+sample.fetch_config: fetch_config
+ cp -f $(srcdir)/fetch_config ./sample.fetch_config
+
+dist-hook:
+ $(srcdir)/check_version $(srcdir)/.git_version
+
+CLEANFILES += ceph_ver.h sample.fetch_config
+
+
# assemble Python script with global version variables
# NB: depends on format of ceph_ver.h
ceph: ceph.in ./ceph_ver.h Makefile
rm -f $@ $@.tmp
- echo "#!/usr/bin/python" >$@.tmp
+ echo "#!/usr/bin/env python" >$@.tmp
grep "#define CEPH_GIT_NICE_VER" ./ceph_ver.h | \
sed -e 's/#define \(.*VER\) /\1=/' >>$@.tmp
grep "#define CEPH_GIT_VER" ./ceph_ver.h | \
sed -e 's/#define \(.*VER\) /\1=/' -e 's/=\(.*\)$$/="\1"/' >>$@.tmp
- cat $@.in >>$@.tmp
+ cat $(srcdir)/$@.in >>$@.tmp
chmod a+x $@.tmp
chmod a-w $@.tmp
mv $@.tmp $@
# cleaning
+
clean-local:
-rm *.so *.gcno *.gcda
-# libs
-CCAS = ${srcdir}/yasm-wrapper
-AM_CCASFLAGS = -f elf64
-
-# crc
-libcrc_la_SOURCES = \
- common/sctp_crc32.c \
- common/crc32c.cc \
- common/crc32c_intel_baseline.c \
- common/crc32c_intel_fast.c
-
-if WITH_GOOD_YASM_ELF64
-libcrc_la_SOURCES += common/crc32c_intel_fast_asm.S
-libcrc_la_LIBTOOLFLAGS = --tag=CC
-endif
-
-noinst_LTLIBRARIES += libcrc.la
-
-# common
-libcommon_la_SOURCES = $(libcommon_files)
-libcommon_la_CFLAGS= ${CRYPTO_CFLAGS} ${AM_CFLAGS}
-libcommon_la_CXXFLAGS= ${AM_CXXFLAGS}
-libcommon_la_LDFLAGS = -lrt
-libcommon_la_LIBADD = libcrc.la
-noinst_LTLIBRARIES += libcommon.la
-
-crush_files = \
- crush/builder.c \
- crush/mapper.c \
- crush/crush.c \
- crush/hash.c \
- crush/CrushWrapper.cc \
- crush/CrushCompiler.cc \
- crush/CrushTester.cc
-
-# this list ommits the ceph_ver.c file
-libcommon_files = \
- ./ceph_ver.c \
- arch/probe.cc \
- arch/intel.c \
- auth/AuthAuthorizeHandler.cc \
- auth/AuthClientHandler.cc \
- auth/AuthSessionHandler.cc \
- auth/AuthMethodList.cc \
- auth/cephx/CephxAuthorizeHandler.cc \
- auth/cephx/CephxClientHandler.cc \
- auth/cephx/CephxProtocol.cc \
- auth/cephx/CephxSessionHandler.cc \
- auth/none/AuthNoneAuthorizeHandler.cc \
- auth/unknown/AuthUnknownAuthorizeHandler.cc \
- auth/Crypto.cc \
- auth/KeyRing.cc \
- auth/RotatingKeyRing.cc \
- common/DecayCounter.cc \
- common/LogClient.cc \
- common/LogEntry.cc \
- common/PrebufferedStreambuf.cc \
- common/BackTrace.cc \
- common/perf_counters.cc \
- common/Mutex.cc \
- common/OutputDataSocket.cc \
- common/admin_socket.cc \
- common/admin_socket_client.cc \
- common/cmdparse.cc \
- common/escape.c \
- common/Clock.cc \
- common/Throttle.cc \
- common/Timer.cc \
- common/Finisher.cc \
- common/environment.cc\
- common/assert.cc \
- common/run_cmd.cc \
- common/WorkQueue.cc \
- common/ConfUtils.cc \
- common/MemoryModel.cc \
- common/armor.c \
- common/fd.cc \
- common/xattr.c \
- common/safe_io.c \
- common/snap_types.cc \
- common/str_list.cc \
- common/errno.cc \
- json_spirit/json_spirit_reader.cpp \
- json_spirit/json_spirit_writer.cpp \
- json_spirit/json_spirit_value.cpp \
- log/Log.cc \
- log/SubsystemMap.cc \
- mon/MonCap.cc \
- mon/MonClient.cc \
- mon/MonMap.cc \
- msg/Accepter.cc \
- msg/DispatchQueue.cc \
- msg/Message.cc \
- common/RefCountedObj.cc \
- msg/Messenger.cc \
- msg/Pipe.cc \
- msg/SimpleMessenger.cc \
- msg/msg_types.cc \
- os/hobject.cc \
- osd/OSDMap.cc \
- osd/osd_types.cc \
- mds/MDSMap.cc \
- mds/inode_backtrace.cc \
- mds/mdstypes.cc \
- common/blkdev.cc \
- common/common_init.cc \
- common/pipe.c \
- common/ceph_argparse.cc \
- common/ceph_context.cc \
- common/buffer.cc \
- common/code_environment.cc \
- common/dout.cc \
- common/signal.cc \
- common/simple_spin.cc \
- common/Thread.cc \
- common/Formatter.cc \
- common/HeartbeatMap.cc \
- include/ceph_fs.cc \
- include/ceph_hash.cc \
- include/ceph_strings.cc \
- include/ceph_frag.cc \
- common/config.cc \
- common/utf8.c \
- common/mime.c \
- common/strtol.cc \
- common/page.cc \
- common/lockdep.cc \
- common/version.cc \
- common/hex.cc \
- common/entity_name.cc \
- common/ceph_crypto.cc \
- common/ceph_crypto_cms.cc \
- common/ceph_json.cc \
- common/ipaddr.cc \
- common/pick_address.cc \
- include/addr_parsing.c \
- $(crush_files)
-
-if WITH_PROFILER
-libcommon_files += perfglue/cpu_profiler.cc
-else
-libcommon_files += perfglue/disabled_stubs.cc
-endif
-
-# global
-libglobal_la_SOURCES = \
- global/global_context.cc \
- global/global_init.cc \
- global/pidfile.cc \
- global/signal_handler.cc
-libglobal_la_CFLAGS= ${CRYPTO_CFLAGS} ${AM_CFLAGS}
-libglobal_la_CXXFLAGS= ${AM_CXXFLAGS}
-libglobal_la_LIBADD= libcommon.la
-noinst_LTLIBRARIES += libglobal.la
-
-
-libmon_a_SOURCES = \
- auth/cephx/CephxKeyServer.cc \
- auth/cephx/CephxServiceHandler.cc \
- auth/cephx/CephxSessionHandler.cc \
- auth/AuthServiceHandler.cc \
- auth/AuthSessionHandler.cc \
- mon/Monitor.cc \
- mon/Paxos.cc \
- mon/PaxosService.cc \
- mon/OSDMonitor.cc \
- mon/MDSMonitor.cc \
- mon/MonmapMonitor.cc \
- mon/PGMonitor.cc \
- mon/PGMap.cc \
- mon/LogMonitor.cc \
- mon/AuthMonitor.cc \
- mon/Elector.cc \
- mon/MonitorStore.cc \
- os/LevelDBStore.cc \
- mon/HealthMonitor.cc \
- mon/DataHealthService.cc \
- mon/ConfigKeyService.cc \
- common/util.cc \
- common/TextTable.cc
-libmon_a_CXXFLAGS= ${AM_CXXFLAGS}
-noinst_LIBRARIES += libmon.a
-
-libmds_a_SOURCES = \
- mds/Anchor.cc \
- mds/Capability.cc \
- mds/Dumper.cc \
- mds/Resetter.cc \
- mds/MDS.cc \
- mds/flock.cc \
- mds/locks.c \
- mds/journal.cc \
- mds/Server.cc \
- mds/Mutation.cc \
- mds/MDCache.cc \
- mds/Locker.cc \
- mds/Migrator.cc \
- mds/MDBalancer.cc \
- mds/CDentry.cc \
- mds/CDir.cc \
- mds/CInode.cc \
- mds/LogEvent.cc \
- mds/MDSTable.cc \
- mds/InoTable.cc \
- mds/MDSTableClient.cc \
- mds/MDSTableServer.cc \
- mds/AnchorServer.cc \
- mds/AnchorClient.cc \
- mds/SnapRealm.cc \
- mds/SnapServer.cc \
- mds/snap.cc \
- mds/SessionMap.cc \
- mds/MDLog.cc \
- osdc/Journaler.cc
-noinst_LIBRARIES += libmds.a
-
-libos_a_SOURCES = \
- os/FileJournal.cc \
- os/FileStore.cc \
- os/chain_xattr.cc \
- os/ObjectStore.cc \
- os/JournalingObjectStore.cc \
- os/LFNIndex.cc \
- os/HashIndex.cc \
- os/IndexManager.cc \
- os/FlatIndex.cc \
- os/DBObjectMap.cc \
- os/LevelDBStore.cc \
- os/WBThrottle.cc \
- os/BtrfsFileStoreBackend.cc \
- os/GenericFileStoreBackend.cc \
- os/ZFSFileStoreBackend.cc
-libos_a_CXXFLAGS= ${AM_CXXFLAGS}
-noinst_LIBRARIES += libos.a
-
-if WITH_LIBZFS
-libos_zfs_a_SOURCES = os/ZFS.cc
-libos_zfs_a_CXXFLAGS= ${AM_CXXFLAGS} ${LIBZFS_CFLAGS}
-noinst_LIBRARIES += libos_zfs.a
-endif
-
-libosd_a_SOURCES = \
- osd/PG.cc \
- osd/PGLog.cc \
- osd/ReplicatedPG.cc \
- osd/Ager.cc \
- osd/OSD.cc \
- osd/OSDCap.cc \
- osd/Watch.cc \
- osd/ClassHandler.cc \
- osd/OpRequest.cc \
- osd/SnapMapper.cc \
- objclass/class_api.cc
-libosd_a_CXXFLAGS= ${AM_CXXFLAGS}
-noinst_LIBRARIES += libosd.a
-
-libosdc_la_SOURCES = \
- osdc/Objecter.cc \
- osdc/ObjectCacher.cc \
- osdc/Filer.cc \
- osdc/Striper.cc
-libosdc_la_CXXFLAGS= ${AM_CXXFLAGS}
-libosdc_la_LIBADD = libcommon.la
-noinst_LTLIBRARIES += libosdc.la
-
-libclient_la_SOURCES = \
- client/Client.cc \
- client/Inode.cc \
- client/Dentry.cc \
- client/MetaRequest.cc \
- client/ClientSnapRealm.cc \
- client/MetaSession.cc \
- client/Trace.cc
-libclient_la_LIBADD = libosdc.la $(LIBEDIT_LIBS)
-noinst_LTLIBRARIES += libclient.la
-
-dist-hook:
- $(srcdir)/check_version $(srcdir)/.git_version
+# pybind
python_PYTHON = pybind/rados.py \
pybind/rbd.py \
@@ -1760,596 +319,29 @@ python_PYTHON = pybind/rados.py \
pybind/ceph_argparse.py \
pybind/ceph_rest_api.py
-# headers... and everything else we want to include in a 'make dist'
-# that autotools doesn't magically identify.
-noinst_HEADERS = \
- rados_sync.h \
- arch/probe.h \
- arch/intel.h \
- auth/cephx/CephxAuthorizeHandler.h\
- auth/cephx/CephxKeyServer.h\
- auth/cephx/CephxProtocol.h\
- auth/cephx/CephxClientHandler.h\
- auth/cephx/CephxServiceHandler.h\
- auth/cephx/CephxSessionHandler.h\
- auth/none/AuthNoneAuthorizeHandler.h\
- auth/none/AuthNoneClientHandler.h\
- auth/none/AuthNoneServiceHandler.h\
- auth/none/AuthNoneSessionHandler.h\
- auth/none/AuthNoneProtocol.h\
- auth/unknown/AuthUnknownAuthorizeHandler.h\
- auth/unknown/AuthUnknownClientHandler.h\
- auth/unknown/AuthUnknownServiceHandler.h\
- auth/unknown/AuthUnknownSessionHandler.h\
- auth/unknown/AuthUnknownProtocol.h\
- auth/Auth.h\
- auth/AuthMethodList.h\
- auth/AuthClientHandler.h\
- auth/AuthServiceHandler.h\
- auth/AuthSessionHandler.h\
- auth/AuthAuthorizeHandler.h\
- auth/KeyRing.h\
- auth/RotatingKeyRing.h\
- auth/Crypto.h\
- bash_completion/ceph\
- bash_completion/rados\
- bash_completion/rbd\
- bash_completion/radosgw-admin\
- client/Client.h\
- client/Dentry.h\
- client/Dir.h\
- client/Fh.h\
- client/Inode.h\
- client/MetaRequest.h\
- client/MetaSession.h\
- client/ClientSnapRealm.h\
- client/SyntheticClient.h\
- client/Trace.h\
- client/fuse_ll.h\
- client/ioctl.h\
- client/hadoop/CephFSInterface.h\
- client/ObjecterWriteback.h\
+
+# everything else we want to include in a 'make dist'
+
+noinst_HEADERS += \
cls_acl.cc\
cls_crypto.cc\
- cls/lock/cls_lock_types.h\
- cls/lock/cls_lock_ops.h\
- cls/lock/cls_lock_client.h\
- cls/rbd/cls_rbd.h\
- cls/rbd/cls_rbd_client.h\
- cls/refcount/cls_refcount_ops.h\
- cls/refcount/cls_refcount_client.h\
- cls/version/cls_version_types.h\
- cls/version/cls_version_ops.h\
- cls/version/cls_version_client.h\
- cls/log/cls_log_types.h\
- cls/log/cls_log_ops.h\
- cls/log/cls_log_client.h\
- cls/statelog/cls_statelog_types.h\
- cls/statelog/cls_statelog_ops.h\
- cls/statelog/cls_statelog_client.h\
- cls/replica_log/cls_replica_log_types.h\
- cls/replica_log/cls_replica_log_ops.h\
- cls/replica_log/cls_replica_log_client.h\
- cls/rgw/cls_rgw_client.h\
- cls/rgw/cls_rgw_ops.h\
- cls/rgw/cls_rgw_types.h\
- common/BackTrace.h\
- common/RefCountedObj.h\
- common/HeartbeatMap.h\
- common/LogClient.h\
- common/LogEntry.h\
- common/Preforker.h\
- common/WorkQueue.h\
- common/PrioritizedQueue.h\
- common/ceph_argparse.h\
- common/ceph_context.h\
- common/xattr.h\
- common/blkdev.h\
- common/compiler_extensions.h\
- common/debug.h\
- common/dout.h\
- common/escape.h\
- common/fd.h\
- common/version.h\
- common/hex.h\
- common/entity_name.h\
- common/errno.h\
- common/environment.h\
- common/likely.h\
- common/lockdep.h\
- common/obj_bencher.h\
- common/snap_types.h\
- common/Clock.h\
- common/Cond.h\
- common/ConfUtils.h\
- common/DecayCounter.h\
- common/Finisher.h\
- common/Formatter.h\
- common/perf_counters.h\
- common/OutputDataSocket.h \
- common/admin_socket.h \
- common/admin_socket_client.h \
- common/shared_cache.hpp \
- common/tracked_int_ptr.hpp \
- common/simple_cache.hpp \
- common/sharedptr_registry.hpp \
- common/map_cacher.hpp \
- common/MemoryModel.h\
- common/Mutex.h\
- common/PrebufferedStreambuf.h\
- common/RWLock.h\
- common/Semaphore.h\
- common/SimpleRNG.h\
- common/TextTable.h\
- common/Thread.h\
- common/Throttle.h\
- common/Timer.h\
- common/TrackedOp.h\
- common/arch.h\
- common/armor.h\
- global/global_init.h \
- global/global_context.h \
- common/common_init.h\
- common/pipe.h\
- common/code_environment.h \
- common/signal.h\
- global/signal_handler.h\
- common/simple_spin.h\
- common/run_cmd.h\
- common/safe_io.h\
- common/config.h\
- common/config_obs.h\
- common/config_opts.h\
- common/ceph_crypto.h\
- common/ceph_crypto_cms.h\
- common/ceph_json.h\
- common/crc32c_intel_baseline.h\
- common/crc32c_intel_fast.h\
- common/lru_map.h\
- common/utf8.h\
- common/mime.h\
- common/pick_address.h\
- common/sctp_crc32.h\
- common/secret.h\
- common/strtol.h\
- common/static_assert.h\
- common/AsyncReserver.h\
- crush/CrushCompiler.h\
- crush/CrushTester.h\
- crush/CrushWrapper.h\
- crush/CrushWrapper.i\
- crush/builder.h\
- crush/crush.h\
- crush/grammar.h\
- crush/hash.h\
- crush/mapper.h\
- crush/sample.txt\
- crush/types.h\
fetch_config\
- include/bloom_filter.hpp\
- include/Context.h\
- include/CompatSet.h\
- include/Distribution.h\
- include/addr_parsing.h\
- include/assert.h\
- include/atomic.h\
- include/bitmapper.h\
- include/blobhash.h\
- include/buffer.h\
- include/byteorder.h\
- include/cephfs/libcephfs.h\
- include/ceph_features.h\
- include/ceph_frag.h\
- include/ceph_fs.h\
- include/ceph_hash.h\
- include/cmp.h\
- include/color.h\
- include/compat.h\
- include/crc32c.h\
- include/encoding.h\
- include/err.h\
- include/error.h\
- include/filepath.h\
- include/frag.h\
- include/hash.h\
- include/intarith.h\
- include/interval_set.h\
- include/inttypes.h\
- include/ipaddr.h\
- include/linux_fiemap.h\
- include/lru.h\
- include/msgr.h\
- include/object.h\
- include/page.h\
- include/rangeset.h\
- include/rados.h\
- include/rbd_types.h\
- include/statlite.h\
- include/str_list.h\
- include/stringify.h\
- include/triple.h\
- include/types.h\
- include/utime.h\
- include/dlist.h\
- include/elist.h\
- include/uuid.h\
- include/xlist.h\
- include/rados/librados.h\
- include/rados/rados_types.h\
- include/rados/rados_types.hpp\
- include/rados/librados.hpp\
- include/rados/librgw.h\
- include/rados/page.h\
- include/rados/crc32c.h\
- include/rados/buffer.h\
- include/rbd/features.h\
- include/rbd/librbd.h\
- include/rbd/librbd.hpp\
- include/util.h\
- librados/snap_set_diff.h\
- librados/AioCompletionImpl.h\
- librados/IoCtxImpl.h\
- librados/PoolAsyncCompletionImpl.h\
- librados/RadosClient.h\
- librbd/AioCompletion.h\
- librbd/AioRequest.h\
- librbd/ImageCtx.h\
- librbd/internal.h\
- librbd/LibrbdWriteback.h\
- librbd/parent_types.h\
- librbd/SnapInfo.h\
- librbd/WatchCtx.h\
logrotate.conf\
- json_spirit/json_spirit.h\
- json_spirit/json_spirit_error_position.h\
- json_spirit/json_spirit_reader.h\
- json_spirit/json_spirit_reader_template.h\
- json_spirit/json_spirit_stream_reader.h\
- json_spirit/json_spirit_utils.h\
- json_spirit/json_spirit_value.h\
- json_spirit/json_spirit_writer.h\
- json_spirit/json_spirit_writer_options.h\
- json_spirit/json_spirit_writer_template.h\
- key_value_store/key_value_structure.h\
- key_value_store/kv_flat_btree_async.h\
- key_value_store/kvs_arg_types.h\
- log/Entry.h\
- log/EntryQueue.h\
- log/Log.h\
- log/SubsystemMap.h\
- mds/inode_backtrace.h\
- mds/flock.h\
- mds/locks.c\
- mds/locks.h\
- mds/Anchor.h\
- mds/AnchorClient.h\
- mds/AnchorServer.h\
- mds/CDentry.h\
- mds/CDir.h\
- mds/CInode.h\
- mds/Capability.h\
- mds/Dumper.h\
- mds/InoTable.h\
- mds/LocalLock.h\
- mds/Locker.h\
- mds/LogEvent.h\
- mds/LogSegment.h\
- mds/MDBalancer.h\
- mds/MDCache.h\
- mds/MDLog.h\
- mds/MDS.h\
- mds/MDSMap.h\
- mds/MDSTable.h\
- mds/MDSTableServer.h\
- mds/MDSTableClient.h\
- mds/Mutation.h\
- mds/Migrator.h\
- mds/Resetter.h\
- mds/ScatterLock.h\
- mds/Server.h\
- mds/SessionMap.h\
- mds/SimpleLock.h\
- mds/SnapClient.h\
- mds/SnapRealm.h\
- mds/SnapServer.h\
- mds/events/ECommitted.h\
- mds/events/EExport.h\
- mds/events/EFragment.h\
- mds/events/EImportFinish.h\
- mds/events/EImportStart.h\
- mds/events/EMetaBlob.h\
- mds/events/EOpen.h\
- mds/events/EResetJournal.h\
- mds/events/ESession.h\
- mds/events/ESessions.h\
- mds/events/ESlaveUpdate.h\
- mds/events/ESubtreeMap.h\
- mds/events/ETableClient.h\
- mds/events/ETableServer.h\
- mds/events/EUpdate.h\
- mds/mds_table_types.h\
- mds/mdstypes.h\
- mds/snap.h\
- messages/MAuth.h\
- messages/MAuthReply.h\
- messages/MCacheExpire.h\
- messages/MClientCaps.h\
- messages/MClientCapRelease.h\
- messages/MClientLease.h\
- messages/MClientReconnect.h\
- messages/MClientReply.h\
- messages/MClientRequest.h\
- messages/MClientRequestForward.h\
- messages/MClientSession.h\
- messages/MClientSnap.h\
- messages/MCommand.h\
- messages/MCommandReply.h\
- messages/MDentryLink.h\
- messages/MDentryUnlink.h\
- messages/MDirUpdate.h\
- messages/MDiscover.h\
- messages/MDiscoverReply.h\
- messages/MExportCaps.h\
- messages/MExportCapsAck.h\
- messages/MExportDir.h\
- messages/MExportDirAck.h\
- messages/MExportDirCancel.h\
- messages/MExportDirDiscover.h\
- messages/MExportDirDiscoverAck.h\
- messages/MExportDirFinish.h\
- messages/MExportDirNotify.h\
- messages/MExportDirNotifyAck.h\
- messages/MExportDirPrep.h\
- messages/MExportDirPrepAck.h\
- messages/MGenericMessage.h\
- messages/MGetPoolStats.h\
- messages/MGetPoolStatsReply.h\
- messages/MHeartbeat.h\
- messages/MInodeFileCaps.h\
- messages/MLock.h\
- messages/MLog.h\
- messages/MLogAck.h\
- messages/MMDSBeacon.h\
- messages/MMDSCacheRejoin.h\
- messages/MMDSLoadTargets.h\
- messages/MMDSFindIno.h\
- messages/MMDSFindInoReply.h\
- messages/MMDSFragmentNotify.h\
- messages/MMDSMap.h\
- messages/MMDSOpenIno.h \
- messages/MMDSOpenInoReply.h \
- messages/MMDSResolve.h\
- messages/MMDSResolveAck.h\
- messages/MMDSSlaveRequest.h\
- messages/MMDSTableRequest.h\
- messages/MMonCommand.h\
- messages/MMonCommandAck.h\
- messages/MMonElection.h\
- messages/MMonGetMap.h\
- messages/MMonGetVersion.h\
- messages/MMonGetVersionReply.h\
- messages/MMonGlobalID.h\
- messages/MMonHealth.h\
- messages/MMonJoin.h\
- messages/MMonMap.h\
- messages/MMonPaxos.h\
- messages/MMonProbe.h\
- messages/MMonScrub.h \
- messages/MMonSubscribe.h\
- messages/MMonSubscribeAck.h\
- messages/MMonSync.h \
- messages/MOSDAlive.h\
- messages/MOSDBoot.h\
- messages/MOSDFailure.h\
- messages/MOSDMarkMeDown.h\
- messages/MOSDMap.h\
- messages/MOSDOp.h\
- messages/MOSDOpReply.h\
- messages/MOSDPGBackfill.h\
- messages/MOSDPGCreate.h\
- messages/MOSDPGPush.h\
- messages/MOSDPGPull.h\
- messages/MOSDPGPushReply.h\
- messages/MOSDPGInfo.h\
- messages/MOSDPGLog.h\
- messages/MOSDPGMissing.h\
- messages/MOSDPGNotify.h\
- messages/MOSDPGQuery.h\
- messages/MOSDPGRemove.h\
- messages/MOSDPGScan.h\
- messages/MBackfillReserve.h\
- messages/MRecoveryReserve.h\
- messages/MMonQuorumService.h\
- messages/MOSDPGTemp.h\
- messages/MOSDPGTrim.h\
- messages/MOSDPing.h\
- messages/MOSDRepScrub.h\
- messages/MOSDScrub.h\
- messages/MOSDSubOp.h\
- messages/MOSDSubOpReply.h\
- messages/MPGStats.h\
- messages/MPGStatsAck.h\
- messages/MPing.h\
- messages/MPoolOp.h\
- messages/MPoolOpReply.h\
- messages/MRemoveSnaps.h\
- messages/MRoute.h\
- messages/MForward.h\
- messages/MStatfs.h\
- messages/MStatfsReply.h\
- messages/MTimeCheck.h\
- messages/MWatchNotify.h\
- messages/PaxosServiceMessage.h\
- mon/AuthMonitor.h\
- mon/DataHealthService.h\
- mon/Elector.h\
- mon/LogMonitor.h\
- mon/ConfigKeyService.h\
- mon/HealthMonitor.h\
- mon/HealthService.h\
- mon/MDSMonitor.h\
- mon/MonmapMonitor.h\
- mon/MonCap.h\
- mon/MonClient.h\
- mon/MonCommands.h\
- mon/MonMap.h\
- mon/Monitor.h\
- mon/MonitorStore.h\
- mon/MonitorDBStore.h\
- mon/OSDMonitor.h\
- mon/PGMap.h\
- mon/PGMonitor.h\
- mon/Paxos.h\
- mon/PaxosService.h\
- mon/QuorumService.h\
- mon/Session.h\
- mon/mon_types.h\
- mount/canonicalize.c\
- mount/mtab.c\
- msg/Accepter.h\
- msg/DispatchQueue.h\
- msg/Dispatcher.h\
- msg/Message.h\
- msg/Messenger.h\
- msg/Pipe.h\
- msg/SimpleMessenger.h\
- msg/msg_types.h\
- objclass/objclass.h\
- os/btrfs_ioctl.h\
- os/ZFS.h\
- os/chain_xattr.h\
- os/hobject.h \
- os/CollectionIndex.h\
- os/FileJournal.h\
- os/FileStore.h\
- os/BtrfsFileStoreBackend.h\
- os/GenericFileStoreBackend.h\
- os/ZFSFileStoreBackend.h\
- os/FlatIndex.h\
- os/HashIndex.h\
- os/FDCache.h\
- os/WBThrottle.h\
- os/IndexManager.h\
- os/Journal.h\
- os/JournalingObjectStore.h\
- os/LFNIndex.h\
- os/ObjectStore.h\
- os/SequencerPosition.h\
- osd/Ager.h\
- osd/ClassHandler.h\
- osd/OSD.h\
- osd/OSDCap.h\
- osd/OSDMap.h\
- osd/ObjectVersioner.h\
- osd/OpRequest.h\
- osd/SnapMapper.h\
- osd/PG.h\
- osd/PGLog.h\
- osd/ReplicatedPG.h\
- osd/Watch.h\
- osd/osd_types.h\
- osdc/Blinker.h\
- osdc/Filer.h\
- osdc/Journaler.h\
- osdc/ObjectCacher.h\
- osdc/Objecter.h\
- osdc/Striper.h\
- osdc/WritebackHandler.h\
- perfglue/cpu_profiler.h\
- perfglue/heap_profiler.h\
- rgw/logrotate.conf\
- rgw/rgw_acl.h\
- rgw/rgw_acl_s3.h\
- rgw/rgw_acl_swift.h\
- rgw/rgw_client_io.h\
- rgw/rgw_fcgi.h\
- rgw/rgw_xml.h\
- rgw/rgw_cache.h\
- rgw/rgw_common.h\
- rgw/rgw_cors.h\
- rgw/rgw_cors_s3.h\
- rgw/rgw_cors_swift.h\
- rgw/rgw_string.h\
- rgw/rgw_formats.h\
- rgw/rgw_http_errors.h\
- rgw/rgw_log.h\
- rgw/rgw_multi.h\
- rgw/rgw_policy_s3.h\
- rgw/rgw_gc.h\
- rgw/rgw_metadata.h\
- rgw/rgw_multi_del.h\
- rgw/rgw_op.h\
- rgw/rgw_http_client.h\
- rgw/rgw_swift.h\
- rgw/rgw_swift_auth.h\
- rgw/rgw_rados.h\
- rgw/rgw_replica_log.h \
- rgw/rgw_resolve.h\
- rgw/rgw_rest.h\
- rgw/rgw_rest_swift.h\
- rgw/rgw_rest_s3.h\
- rgw/rgw_auth_s3.h\
- rgw/rgw_rest_admin.h\
- rgw/rgw_rest_usage.h\
- rgw/rgw_rest_user.h\
- rgw/rgw_rest_bucket.h\
- rgw/rgw_rest_client.h\
- rgw/rgw_rest_conn.h\
- rgw/rgw_tools.h\
- rgw/rgw_rest_metadata.h\
- rgw/rgw_rest_log.h\
- rgw/rgw_rest_opstate.h\
- rgw/rgw_rest_replica_log.h\
- rgw/rgw_rest_config.h\
- rgw/rgw_usage.h\
- rgw/rgw_user.h\
- rgw/rgw_bucket.h\
- sample.ceph.conf\
- tools/common.h\
- test/osd/RadosModel.h\
- test/osd/Object.h\
- test/osd/TestOpStat.h\
- global/pidfile.h\
- common/sync_filesystem.h \
- test/bench/distribution.h \
- test/bench/rados_backend.h \
- test/bench/rbd_backend.h \
- test/bench/bencher.h \
- test/bench/backend.h \
- test/bench/dumb_backend.h \
- test/bench/stat_collector.h \
- test/bench/detailed_stat_collector.h \
- test/bench/testfilestore_backend.h \
- test/common/ObjectContents.h \
- test/encoding/types.h \
- test/filestore/DeterministicOpSequence.h \
- test/filestore/FileStoreTracker.h \
- test/filestore/FileStoreDiff.h \
- test/filestore/TestFileStoreState.h \
- test/filestore/workload_generator.h \
- test/kv_store_bench.h \
- test/librados/test.h \
- test/ObjectMap/KeyValueDBMemory.h \
- test/omap_bench.h \
- test/osd/Object.h \
- test/osd/RadosModel.h \
- test/osd/TestOpStat.h \
- test/osdc/FakeWriteback.h \
- test/system/cross_process_sem.h \
- test/system/st_rados_create_pool.h \
- test/system/st_rados_list_objects.h \
- test/system/st_rados_delete_objs.h \
- test/system/st_rados_delete_pool.h \
- test/system/st_rados_notify.h \
- test/system/st_rados_watch.h \
- test/system/systest_runnable.h \
- test/system/systest_settings.h \
- test/unit.h \
- os/ObjectMap.h \
- os/DBObjectMap.h \
- os/KeyValueDB.h \
- os/LevelDBStore.h \
- common/cmdparse.h
+ sample.ceph.conf\
+ bash_completion/ceph \
+ bash_completion/rados \
+ bash_completion/rbd \
+ bash_completion/radosgw-admin \
+ mount/canonicalize.c \
+ mount/mtab.c \
+ objclass/objclass.h
+
+
+# coverage
+
+shell_scripts += ceph-coverage
+bin_SCRIPTS += ceph-coverage
+
if ENABLE_COVERAGE
COV_DIR = $(DESTDIR)$(libdir)/ceph/coverage
@@ -2387,13 +379,6 @@ uninstall-local: uninstall-coverage
-rmdir -p $(DESTDIR)$(localstatedir)/log/ceph
-rmdir -p $(DESTDIR)$(localstatedir)/lib/ceph/tmp
-# if we are doing a debug build, tell make to actually build the debug
-# targets
-if WITH_DEBUG
-bin_PROGRAMS += $(bin_DEBUGPROGRAMS)
-endif
-
-
#
# coverity rules expect:
# - cov-build to be in the path
diff --git a/src/arch/Makefile.am b/src/arch/Makefile.am
new file mode 100644
index 00000000000..27342078150
--- /dev/null
+++ b/src/arch/Makefile.am
@@ -0,0 +1,11 @@
+libarch_la_SOURCES = \
+ arch/intel.c \
+ arch/neon.c \
+ arch/probe.cc
+
+noinst_LTLIBRARIES += libarch.la
+
+noinst_HEADERS += \
+ arch/intel.h \
+ arch/neon.h \
+ arch/probe.h
diff --git a/src/arch/intel.c b/src/arch/intel.c
index 0513da53c23..8b2d2ccab12 100644
--- a/src/arch/intel.c
+++ b/src/arch/intel.c
@@ -4,8 +4,7 @@
int ceph_arch_intel_sse42 = 0;
-/* this probably isn't specific enough for x86_64? fix me someday */
-#ifdef __LP64__
+#ifdef __x86_64__
/* intel cpu? */
static void do_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx,
@@ -35,7 +34,7 @@ int ceph_arch_intel_probe(void)
return 0;
}
-#else // __LP64__
+#else // __x86_64__
int ceph_arch_intel_probe(void)
{
@@ -43,4 +42,4 @@ int ceph_arch_intel_probe(void)
return 0;
}
-#endif // __LP64__
+#endif // __x86_64__
diff --git a/src/arch/neon.c b/src/arch/neon.c
new file mode 100644
index 00000000000..32c1f621ef7
--- /dev/null
+++ b/src/arch/neon.c
@@ -0,0 +1,51 @@
+#include "arch/probe.h"
+
+/* flags we export */
+int ceph_arch_neon = 0;
+
+#include <stdio.h>
+
+#if __linux__
+
+#include <elf.h>
+#include <link.h> // ElfW macro
+
+#if __arm__
+#include <asm/hwcap.h>
+#endif // __arm__
+
+static unsigned long get_auxval(unsigned long type)
+{
+ unsigned long result = 0;
+ FILE *f = fopen("/proc/self/auxv", "r");
+ if (f) {
+ ElfW(auxv_t) entry;
+ while (fread(&entry, sizeof(entry), 1, f)) {
+ if (entry.a_type == type) {
+ result = entry.a_un.a_val;
+ break;
+ }
+ }
+ fclose(f);
+ }
+ return result;
+}
+
+static unsigned long get_hwcap(void)
+{
+ return get_auxval(AT_HWCAP);
+}
+
+#endif // __linux__
+
+int ceph_arch_neon_probe(void)
+{
+#if __arm__ && __linux__
+ ceph_arch_neon = (get_hwcap() & HWCAP_NEON) == HWCAP_NEON;
+#else
+ if (0)
+ get_hwcap(); // make compiler shut up
+#endif
+ return 0;
+}
+
diff --git a/src/arch/neon.h b/src/arch/neon.h
new file mode 100644
index 00000000000..0c8aacf5e87
--- /dev/null
+++ b/src/arch/neon.h
@@ -0,0 +1,16 @@
+#ifndef CEPH_ARCH_NEON_H
+#define CEPH_ARCH_NEON_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int ceph_arch_neon; /* true if we have ARM NEON abilities */
+
+extern int ceph_arch_neon_probe(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/arch/probe.cc b/src/arch/probe.cc
index 9f8bc9d2d0f..8648e54d945 100644
--- a/src/arch/probe.cc
+++ b/src/arch/probe.cc
@@ -4,6 +4,7 @@
#include "arch/probe.h"
#include "arch/intel.h"
+#include "arch/neon.h"
int ceph_arch_probe(void)
{
@@ -11,6 +12,7 @@ int ceph_arch_probe(void)
return 1;
ceph_arch_intel_probe();
+ ceph_arch_neon_probe();
ceph_arch_probed = 1;
return 1;
diff --git a/src/auth/AuthMethodList.h b/src/auth/AuthMethodList.h
index 7b21b325aeb..b5aae0bba49 100644
--- a/src/auth/AuthMethodList.h
+++ b/src/auth/AuthMethodList.h
@@ -15,7 +15,8 @@
#ifndef CEPH_AUTHMETHODLIST_H
#define CEPH_AUTHMETHODLIST_H
-#include "include/inttypes.h"
+#include "include/int_types.h"
+
#include <list>
#include <set>
#include <string>
diff --git a/src/auth/Makefile.am b/src/auth/Makefile.am
new file mode 100644
index 00000000000..f7f3b386df5
--- /dev/null
+++ b/src/auth/Makefile.am
@@ -0,0 +1,46 @@
+libauth_la_SOURCES = \
+ auth/AuthAuthorizeHandler.cc \
+ auth/AuthClientHandler.cc \
+ auth/AuthSessionHandler.cc \
+ auth/AuthServiceHandler.cc \
+ auth/AuthMethodList.cc \
+ auth/cephx/CephxAuthorizeHandler.cc \
+ auth/cephx/CephxClientHandler.cc \
+ auth/cephx/CephxProtocol.cc \
+ auth/cephx/CephxServiceHandler.cc \
+ auth/cephx/CephxSessionHandler.cc \
+ auth/cephx/CephxKeyServer.cc \
+ auth/none/AuthNoneAuthorizeHandler.cc \
+ auth/unknown/AuthUnknownAuthorizeHandler.cc \
+ auth/Crypto.cc \
+ auth/KeyRing.cc \
+ auth/RotatingKeyRing.cc
+noinst_LTLIBRARIES += libauth.la
+
+noinst_HEADERS += \
+ auth/cephx/CephxAuthorizeHandler.h \
+ auth/cephx/CephxKeyServer.h \
+ auth/cephx/CephxProtocol.h \
+ auth/cephx/CephxClientHandler.h \
+ auth/cephx/CephxServiceHandler.h \
+ auth/cephx/CephxSessionHandler.h \
+ auth/none/AuthNoneAuthorizeHandler.h \
+ auth/none/AuthNoneClientHandler.h \
+ auth/none/AuthNoneServiceHandler.h \
+ auth/none/AuthNoneSessionHandler.h \
+ auth/none/AuthNoneProtocol.h \
+ auth/unknown/AuthUnknownAuthorizeHandler.h \
+ auth/unknown/AuthUnknownClientHandler.h \
+ auth/unknown/AuthUnknownServiceHandler.h \
+ auth/unknown/AuthUnknownSessionHandler.h \
+ auth/unknown/AuthUnknownProtocol.h \
+ auth/Auth.h \
+ auth/AuthMethodList.h \
+ auth/AuthClientHandler.h \
+ auth/AuthServiceHandler.h \
+ auth/AuthSessionHandler.h \
+ auth/AuthAuthorizeHandler.h \
+ auth/KeyRing.h \
+ auth/RotatingKeyRing.h \
+ auth/Crypto.h
+
diff --git a/src/auth/cephx/CephxKeyServer.cc b/src/auth/cephx/CephxKeyServer.cc
index e0c8174a2a1..e57b5575142 100644
--- a/src/auth/cephx/CephxKeyServer.cc
+++ b/src/auth/cephx/CephxKeyServer.cc
@@ -163,7 +163,7 @@ bool KeyServer::_check_rotating_secrets()
ldout(cct, 10) << __func__ << " added " << added << dendl;
data.rotating_ver++;
//data.next_rotating_time = ceph_clock_now(cct);
- //data.next_rotating_time += MIN(g_conf->auth_mon_ticket_ttl, g_conf->auth_service_ticket_ttl);
+ //data.next_rotating_time += MIN(cct->_conf->auth_mon_ticket_ttl, cct->_conf->auth_service_ticket_ttl);
_dump_rotating_secrets();
return true;
}
@@ -191,7 +191,7 @@ int KeyServer::_rotate_secret(uint32_t service_id)
RotatingSecrets& r = data.rotating_secrets[service_id];
int added = 0;
utime_t now = ceph_clock_now(cct);
- double ttl = service_id == CEPH_ENTITY_TYPE_AUTH ? g_conf->auth_mon_ticket_ttl : g_conf->auth_service_ticket_ttl;
+ double ttl = service_id == CEPH_ENTITY_TYPE_AUTH ? cct->_conf->auth_mon_ticket_ttl : cct->_conf->auth_service_ticket_ttl;
while (r.need_new_secrets(now)) {
ExpiringCryptoKey ek;
@@ -424,7 +424,7 @@ int KeyServer::_build_session_auth_info(uint32_t service_id, CephXServiceTicketI
{
info.service_id = service_id;
info.ticket = auth_ticket_info.ticket;
- info.ticket.init_timestamps(ceph_clock_now(cct), g_conf->auth_service_ticket_ttl);
+ info.ticket.init_timestamps(ceph_clock_now(cct), cct->_conf->auth_service_ticket_ttl);
generate_secret(info.session_key);
diff --git a/src/ceph-create-keys b/src/ceph-create-keys
index 176b06e7a38..0359228d5f8 100755
--- a/src/ceph-create-keys
+++ b/src/ceph-create-keys
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
import argparse
import errno
import json
diff --git a/src/ceph-disk b/src/ceph-disk
index ddaa605ebb8..64d944d9db0 100755
--- a/src/ceph-disk
+++ b/src/ceph-disk
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
import argparse
import errno
@@ -570,7 +570,7 @@ def get_fsid(cluster):
fsid = get_conf(cluster=cluster, variable='fsid')
if fsid is None:
raise Error('getting cluster uuid from configuration failed')
- return fsid
+ return fsid.lower()
def get_or_create_dmcrypt_key(
@@ -671,6 +671,7 @@ def mount(
subprocess.check_call(
args=[
'mount',
+ '-t', fstype,
'-o', options,
'--',
dev,
@@ -887,15 +888,12 @@ def prepare_journal_dev(
def prepare_journal_file(
- journal,
- journal_size):
+ journal):
if not os.path.exists(journal):
- LOG.debug('Creating journal file %s with size %dM', journal, journal_size)
+ LOG.debug('Creating journal file %s with size 0 (ceph-osd will resize and allocate)', journal)
with file(journal, 'wb') as journal_file:
- journal_file.truncate(journal_size * 1048576)
-
- # FIXME: should we resize an existing journal file?
+ pass
LOG.debug('Journal is file %s', journal)
LOG.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
@@ -920,13 +918,13 @@ def prepare_journal(
if not os.path.exists(journal):
if force_dev:
raise Error('Journal does not exist; not a block device', journal)
- return prepare_journal_file(journal, journal_size)
+ return prepare_journal_file(journal)
jmode = os.stat(journal).st_mode
if stat.S_ISREG(jmode):
if force_dev:
raise Error('Journal is not a block device', journal)
- return prepare_journal_file(journal, journal_size)
+ return prepare_journal_file(journal)
if stat.S_ISBLK(jmode):
if force_file:
@@ -1603,6 +1601,7 @@ def find_cluster_by_uuid(_uuid):
Find a cluster name by searching /etc/ceph/*.conf for a conf file
with the right uuid.
"""
+ _uuid = _uuid.lower()
no_fsid = []
if not os.path.exists('/etc/ceph'):
return None
@@ -1610,11 +1609,15 @@ def find_cluster_by_uuid(_uuid):
if not conf_file.endswith('.conf'):
continue
cluster = conf_file[:-5]
- fsid = get_conf(cluster, 'fsid')
- if fsid is None:
+ try:
+ fsid = get_fsid(cluster)
+ except Error as e:
+ if e.message != 'getting cluster uuid from configuration failed':
+ raise e
no_fsid.append(cluster)
- elif fsid == _uuid:
- return cluster
+ else:
+ if fsid == _uuid:
+ return cluster
# be tolerant of /etc/ceph/ceph.conf without an fsid defined.
if len(no_fsid) == 1 and no_fsid[0] == 'ceph':
LOG.warning('No fsid defined in /etc/ceph/ceph.conf; using anyway')
diff --git a/src/ceph-post-file.in b/src/ceph-post-file.in
index 27fea287fc6..ba366dbfd4a 100755
--- a/src/ceph-post-file.in
+++ b/src/ceph-post-file.in
@@ -97,10 +97,10 @@ nonce=`uuidgen`
# stick the user info in the dir too
dir="${id}_${user}_${nonce}"
-t1=$(tempfile) || exit
-t2=$(tempfile) || exit
-t3=$(tempfile) || exit
-t4=$(tempfile) || exit
+t1=$(mktemp) || exit
+t2=$(mktemp) || exit
+t3=$(mktemp) || exit
+t4=$(mktemp) || exit
trap "rm -f -- '$t1' '$t2' '$t3' '$t4'" EXIT
cat > $t1 <<EOF
mkdir post/$dir
diff --git a/src/ceph-rest-api b/src/ceph-rest-api
index ae5245b4f76..772b3d20fcd 100755
--- a/src/ceph-rest-api
+++ b/src/ceph-rest-api
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
# vim: ts=4 sw=4 smarttab expandtab
import argparse
diff --git a/src/ceph.in b/src/ceph.in
index ac356d80e75..075ec80c20b 100755
--- a/src/ceph.in
+++ b/src/ceph.in
@@ -35,7 +35,6 @@ if MYDIR.endswith('src') and \
if 'LD_LIBRARY_PATH' in os.environ:
if MYLIBPATH not in os.environ['LD_LIBRARY_PATH']:
os.environ['LD_LIBRARY_PATH'] += ':' + MYLIBPATH
- os.environ['PATH'] += ':' + MYDIR
print >> sys.stderr, DEVMODEMSG
os.execvp('python', ['python'] + sys.argv)
else:
@@ -43,6 +42,8 @@ if MYDIR.endswith('src') and \
print >> sys.stderr, DEVMODEMSG
os.execvp('python', ['python'] + sys.argv)
sys.path.insert(0, os.path.join(MYDIR, 'pybind'))
+ if MYDIR not in os.environ['PATH']:
+ os.environ['PATH'] += ':' + MYDIR
import argparse
import errno
@@ -475,6 +476,9 @@ def complete(sigdict, args, target):
###
def main():
+ ceph_args = os.environ.get('CEPH_ARGS')
+ if ceph_args:
+ sys.argv.extend(ceph_args.split())
parser, parsed_args, childargs = parse_cmdargs()
@@ -555,7 +559,6 @@ def main():
cluster_handle = rados.Rados(name=name, clustername=clustername,
conf_defaults=conf_defaults, conffile=conffile)
- cluster_handle.conf_parse_env()
retargs = cluster_handle.conf_parse_argv(childargs)
#tmp = childargs
childargs = retargs
@@ -641,7 +644,7 @@ def main():
if parsed_args.output_file:
try:
outf = open(parsed_args.output_file, 'w')
- except:
+ except Exception as e:
print >> sys.stderr, \
'Can\'t open output file {0}: {1}'.\
format(parsed_args.output_file, e)
diff --git a/src/ceph_osd.cc b/src/ceph_osd.cc
index d8590bff817..2388762f1df 100644
--- a/src/ceph_osd.cc
+++ b/src/ceph_osd.cc
@@ -170,7 +170,7 @@ int main(int argc, const char **argv)
if (mc.get_monmap_privately() < 0)
return -1;
- int err = OSD::mkfs(g_conf->osd_data, g_conf->osd_journal, mc.monmap.fsid, whoami);
+ int err = OSD::mkfs(g_ceph_context, g_conf->osd_data, g_conf->osd_journal, mc.monmap.fsid, whoami);
if (err < 0) {
derr << TEXT_RED << " ** ERROR: error creating empty object store in "
<< g_conf->osd_data << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl;
@@ -213,7 +213,7 @@ int main(int argc, const char **argv)
exit(0);
if (mkjournal) {
common_init_finish(g_ceph_context);
- int err = OSD::mkjournal(g_conf->osd_data, g_conf->osd_journal);
+ int err = OSD::mkjournal(g_ceph_context, g_conf->osd_data, g_conf->osd_journal);
if (err < 0) {
derr << TEXT_RED << " ** ERROR: error creating fresh journal " << g_conf->osd_journal
<< " for object store " << g_conf->osd_data
@@ -226,7 +226,7 @@ int main(int argc, const char **argv)
}
if (flushjournal) {
common_init_finish(g_ceph_context);
- int err = OSD::flushjournal(g_conf->osd_data, g_conf->osd_journal);
+ int err = OSD::flushjournal(g_ceph_context, g_conf->osd_data, g_conf->osd_journal);
if (err < 0) {
derr << TEXT_RED << " ** ERROR: error flushing journal " << g_conf->osd_journal
<< " for object store " << g_conf->osd_data
@@ -240,7 +240,7 @@ int main(int argc, const char **argv)
}
if (dump_journal) {
common_init_finish(g_ceph_context);
- int err = OSD::dump_journal(g_conf->osd_data, g_conf->osd_journal, cout);
+ int err = OSD::dump_journal(g_ceph_context, g_conf->osd_data, g_conf->osd_journal, cout);
if (err < 0) {
derr << TEXT_RED << " ** ERROR: error dumping journal " << g_conf->osd_journal
<< " for object store " << g_conf->osd_data
@@ -316,31 +316,34 @@ int main(int argc, const char **argv)
<< TEXT_NORMAL << dendl;
}
- Messenger *client_messenger = Messenger::create(g_ceph_context,
- entity_name_t::OSD(whoami), "client",
- getpid());
- Messenger *cluster_messenger = Messenger::create(g_ceph_context,
- entity_name_t::OSD(whoami), "cluster",
+ Messenger *ms_public = Messenger::create(g_ceph_context,
+ entity_name_t::OSD(whoami), "client",
+ getpid());
+ Messenger *ms_cluster = Messenger::create(g_ceph_context,
+ entity_name_t::OSD(whoami), "cluster",
+ getpid());
+ Messenger *ms_hbclient = Messenger::create(g_ceph_context,
+ entity_name_t::OSD(whoami), "hbclient",
+ getpid());
+ Messenger *ms_hb_back_server = Messenger::create(g_ceph_context,
+ entity_name_t::OSD(whoami), "hb_back_server",
getpid());
- Messenger *messenger_hbclient = Messenger::create(g_ceph_context,
- entity_name_t::OSD(whoami), "hbclient",
- getpid());
- Messenger *messenger_hb_back_server = Messenger::create(g_ceph_context,
- entity_name_t::OSD(whoami), "hb_back_server",
- getpid());
- Messenger *messenger_hb_front_server = Messenger::create(g_ceph_context,
+ Messenger *ms_hb_front_server = Messenger::create(g_ceph_context,
entity_name_t::OSD(whoami), "hb_front_server",
getpid());
- cluster_messenger->set_cluster_protocol(CEPH_OSD_PROTOCOL);
- messenger_hbclient->set_cluster_protocol(CEPH_OSD_PROTOCOL);
- messenger_hb_back_server->set_cluster_protocol(CEPH_OSD_PROTOCOL);
- messenger_hb_front_server->set_cluster_protocol(CEPH_OSD_PROTOCOL);
+ Messenger *ms_objecter = Messenger::create(g_ceph_context,
+ entity_name_t::OSD(whoami), "ms_objecter",
+ getpid());
+ ms_cluster->set_cluster_protocol(CEPH_OSD_PROTOCOL);
+ ms_hbclient->set_cluster_protocol(CEPH_OSD_PROTOCOL);
+ ms_hb_back_server->set_cluster_protocol(CEPH_OSD_PROTOCOL);
+ ms_hb_front_server->set_cluster_protocol(CEPH_OSD_PROTOCOL);
cout << "starting osd." << whoami
- << " at " << client_messenger->get_myaddr()
+ << " at " << ms_public->get_myaddr()
<< " osd_data " << g_conf->osd_data
<< " " << ((g_conf->osd_journal.empty()) ?
- "(no journal)" : g_conf->osd_journal)
+ "(no journal)" : g_conf->osd_journal)
<< std::endl;
boost::scoped_ptr<Throttle> client_byte_throttler(
@@ -356,40 +359,42 @@ int main(int argc, const char **argv)
CEPH_FEATURE_PGID64 |
CEPH_FEATURE_MSG_AUTH;
- client_messenger->set_default_policy(Messenger::Policy::stateless_server(supported, 0));
- client_messenger->set_policy_throttlers(entity_name_t::TYPE_CLIENT,
- client_byte_throttler.get(),
- client_msg_throttler.get());
- client_messenger->set_policy(entity_name_t::TYPE_MON,
+ ms_public->set_default_policy(Messenger::Policy::stateless_server(supported, 0));
+ ms_public->set_policy_throttlers(entity_name_t::TYPE_CLIENT,
+ client_byte_throttler.get(),
+ client_msg_throttler.get());
+ ms_public->set_policy(entity_name_t::TYPE_MON,
Messenger::Policy::lossy_client(supported,
CEPH_FEATURE_UID |
CEPH_FEATURE_PGID64 |
CEPH_FEATURE_OSDENC));
//try to poison pill any OSD connections on the wrong address
- client_messenger->set_policy(entity_name_t::TYPE_OSD,
- Messenger::Policy::stateless_server(0,0));
+ ms_public->set_policy(entity_name_t::TYPE_OSD,
+ Messenger::Policy::stateless_server(0,0));
- cluster_messenger->set_default_policy(Messenger::Policy::stateless_server(0, 0));
- cluster_messenger->set_policy(entity_name_t::TYPE_MON, Messenger::Policy::lossy_client(0,0));
- cluster_messenger->set_policy(entity_name_t::TYPE_OSD,
- Messenger::Policy::lossless_peer(supported,
- CEPH_FEATURE_UID |
- CEPH_FEATURE_PGID64 |
- CEPH_FEATURE_OSDENC));
- cluster_messenger->set_policy(entity_name_t::TYPE_CLIENT,
+ ms_cluster->set_default_policy(Messenger::Policy::stateless_server(0, 0));
+ ms_cluster->set_policy(entity_name_t::TYPE_MON, Messenger::Policy::lossy_client(0,0));
+ ms_cluster->set_policy(entity_name_t::TYPE_OSD,
+ Messenger::Policy::lossless_peer(supported,
+ CEPH_FEATURE_UID |
+ CEPH_FEATURE_PGID64 |
+ CEPH_FEATURE_OSDENC));
+ ms_cluster->set_policy(entity_name_t::TYPE_CLIENT,
+ Messenger::Policy::stateless_server(0, 0));
+
+ ms_hbclient->set_policy(entity_name_t::TYPE_OSD,
+ Messenger::Policy::lossy_client(0, 0));
+ ms_hb_back_server->set_policy(entity_name_t::TYPE_OSD,
Messenger::Policy::stateless_server(0, 0));
+ ms_hb_front_server->set_policy(entity_name_t::TYPE_OSD,
+ Messenger::Policy::stateless_server(0, 0));
- messenger_hbclient->set_policy(entity_name_t::TYPE_OSD,
- Messenger::Policy::lossy_client(0, 0));
- messenger_hb_back_server->set_policy(entity_name_t::TYPE_OSD,
- Messenger::Policy::stateless_server(0, 0));
- messenger_hb_front_server->set_policy(entity_name_t::TYPE_OSD,
- Messenger::Policy::stateless_server(0, 0));
+ ms_objecter->set_default_policy(Messenger::Policy::lossy_client(0, CEPH_FEATURE_OSDREPLYMUX));
- r = client_messenger->bind(g_conf->public_addr);
+ r = ms_public->bind(g_conf->public_addr);
if (r < 0)
exit(1);
- r = cluster_messenger->bind(g_conf->cluster_addr);
+ r = ms_cluster->bind(g_conf->cluster_addr);
if (r < 0)
exit(1);
@@ -400,7 +405,7 @@ int main(int argc, const char **argv)
if (hb_back_addr.is_ip())
hb_back_addr.set_port(0);
}
- r = messenger_hb_back_server->bind(hb_back_addr);
+ r = ms_hb_back_server->bind(hb_back_addr);
if (r < 0)
exit(1);
@@ -408,15 +413,17 @@ int main(int argc, const char **argv)
entity_addr_t hb_front_addr = g_conf->public_addr;
if (hb_front_addr.is_ip())
hb_front_addr.set_port(0);
- r = messenger_hb_front_server->bind(hb_front_addr);
+ r = ms_hb_front_server->bind(hb_front_addr);
if (r < 0)
exit(1);
+ ms_objecter->bind(g_conf->public_addr);
+
// Set up crypto, daemonize, etc.
global_init_daemonize(g_ceph_context, 0);
common_init_finish(g_ceph_context);
- if (g_conf->filestore_update_to >= (int)FileStore::on_disk_version) {
+ if (g_conf->filestore_update_to >= (int)FileStore::target_version) {
int err = OSD::convertfs(g_conf->osd_data, g_conf->osd_journal);
if (err < 0) {
derr << TEXT_RED << " ** ERROR: error converting store " << g_conf->osd_data
@@ -430,8 +437,13 @@ int main(int argc, const char **argv)
return -1;
global_init_chdir(g_ceph_context);
- osd = new OSD(whoami, cluster_messenger, client_messenger,
- messenger_hbclient, messenger_hb_front_server, messenger_hb_back_server,
+ osd = new OSD(g_ceph_context, whoami,
+ ms_cluster,
+ ms_public,
+ ms_hbclient,
+ ms_hb_front_server,
+ ms_hb_back_server,
+ ms_objecter,
&mc,
g_conf->osd_data, g_conf->osd_journal);
@@ -445,11 +457,12 @@ int main(int argc, const char **argv)
// Now close the standard file descriptors
global_init_shutdown_stderr(g_ceph_context);
- client_messenger->start();
- messenger_hbclient->start();
- messenger_hb_front_server->start();
- messenger_hb_back_server->start();
- cluster_messenger->start();
+ ms_public->start();
+ ms_hbclient->start();
+ ms_hb_front_server->start();
+ ms_hb_back_server->start();
+ ms_cluster->start();
+ ms_objecter->start();
// start osd
err = osd->init();
@@ -465,14 +478,17 @@ int main(int argc, const char **argv)
register_async_signal_handler_oneshot(SIGINT, handle_osd_signal);
register_async_signal_handler_oneshot(SIGTERM, handle_osd_signal);
+ osd->final_init();
+
if (g_conf->inject_early_sigterm)
kill(getpid(), SIGTERM);
- client_messenger->wait();
- messenger_hbclient->wait();
- messenger_hb_front_server->wait();
- messenger_hb_back_server->wait();
- cluster_messenger->wait();
+ ms_public->wait();
+ ms_hbclient->wait();
+ ms_hb_front_server->wait();
+ ms_hb_back_server->wait();
+ ms_cluster->wait();
+ ms_objecter->wait();
unregister_async_signal_handler(SIGHUP, sighup_handler);
unregister_async_signal_handler(SIGINT, handle_osd_signal);
@@ -481,11 +497,12 @@ int main(int argc, const char **argv)
// done
delete osd;
- delete client_messenger;
- delete messenger_hbclient;
- delete messenger_hb_front_server;
- delete messenger_hb_back_server;
- delete cluster_messenger;
+ delete ms_public;
+ delete ms_hbclient;
+ delete ms_hb_front_server;
+ delete ms_hb_back_server;
+ delete ms_cluster;
+ delete ms_objecter;
client_byte_throttler.reset();
client_msg_throttler.reset();
g_ceph_context->put();
diff --git a/src/client/Client.cc b/src/client/Client.cc
index 77fd2084cf1..60a5e4550b8 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -148,9 +148,12 @@ Client::Client(Messenger *m, MonClient *mc)
timer(m->cct, client_lock),
ino_invalidate_cb(NULL),
ino_invalidate_cb_handle(NULL),
+ dentry_invalidate_cb(NULL),
+ dentry_invalidate_cb_handle(NULL),
getgroups_cb(NULL),
getgroups_cb_handle(NULL),
async_ino_invalidator(m->cct),
+ async_dentry_invalidator(m->cct),
tick_event(NULL),
monclient(mc), messenger(m), whoami(m->get_myname().num()),
initialized(false), mounted(false), unmounting(false),
@@ -410,11 +413,17 @@ void Client::shutdown()
admin_socket->unregister_command("dump_cache");
if (ino_invalidate_cb) {
- ldout(cct, 10) << "shutdown stopping invalidator finisher" << dendl;
+ ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
async_ino_invalidator.wait_for_empty();
async_ino_invalidator.stop();
}
+ if (dentry_invalidate_cb) {
+ ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
+ async_dentry_invalidator.wait_for_empty();
+ async_dentry_invalidator.stop();
+ }
+
objectcacher->stop(); // outside of client_lock! this does a join.
client_lock.Lock();
@@ -1532,7 +1541,7 @@ void Client::_closed_mds_session(MetaSession *s)
signal_context_list(s->waiting_for_open);
mount_cond.Signal();
remove_session_caps(s);
- kick_requests(s, true);
+ kick_requests_closed(s);
mds_sessions.erase(s->mds_num);
delete s;
}
@@ -1905,7 +1914,7 @@ void Client::handle_mds_map(MMDSMap* m)
if (newstate >= MDSMap::STATE_ACTIVE) {
if (oldstate < MDSMap::STATE_ACTIVE) {
- kick_requests(p->second, false);
+ kick_requests(p->second);
kick_flushing_caps(p->second);
signal_context_list(p->second->waiting_for_open);
kick_maxsize_requests(p->second);
@@ -1989,25 +1998,16 @@ void Client::send_reconnect(MetaSession *session)
}
-void Client::kick_requests(MetaSession *session, bool signal)
+void Client::kick_requests(MetaSession *session)
{
ldout(cct, 10) << "kick_requests for mds." << session->mds_num << dendl;
-
for (map<tid_t, MetaRequest*>::iterator p = mds_requests.begin();
p != mds_requests.end();
- ++p)
+ ++p) {
if (p->second->mds == session->mds_num) {
- if (signal) {
- // only signal caller if there is a caller
- // otherwise, let resend_unsafe handle it
- if (p->second->caller_cond) {
- p->second->kick = true;
- p->second->caller_cond->Signal();
- }
- } else {
- send_request(p->second, session);
- }
+ send_request(p->second, session);
}
+ }
}
void Client::resend_unsafe_requests(MetaSession *session)
@@ -2018,6 +2018,25 @@ void Client::resend_unsafe_requests(MetaSession *session)
send_request(*iter, session);
}
+void Client::kick_requests_closed(MetaSession *session)
+{
+ ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl;
+ for (map<tid_t, MetaRequest*>::iterator p = mds_requests.begin();
+ p != mds_requests.end();
+ ++p) {
+ if (p->second->mds == session->mds_num) {
+ if (p->second->caller_cond) {
+ p->second->kick = true;
+ p->second->caller_cond->Signal();
+ }
+ p->second->item.remove_myself();
+ p->second->unsafe_item.remove_myself();
+ }
+ }
+ assert(session->requests.empty());
+ assert(session->unsafe_requests.empty());
+}
+
@@ -3551,6 +3570,45 @@ void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCa
m->put();
}
+class C_Client_DentryInvalidate : public Context {
+private:
+ Client *client;
+ vinodeno_t dirino;
+ vinodeno_t ino;
+ string name;
+public:
+ C_Client_DentryInvalidate(Client *c, Dentry *dn) :
+ client(c), dirino(dn->dir->parent_inode->vino()),
+ ino(dn->inode->vino()), name(dn->name) { }
+ void finish(int r) {
+ client->_async_dentry_invalidate(dirino, ino, name);
+ }
+};
+
+void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
+{
+ ldout(cct, 10) << "_async_dentry_invalidate '" << name << "' ino " << ino
+ << " in dir " << dirino << dendl;
+ dentry_invalidate_cb(dentry_invalidate_cb_handle, dirino, ino, name);
+}
+
+void Client::_schedule_invalidate_dentry_callback(Dentry *dn)
+{
+ if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
+ async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn));
+}
+
+void Client::_invalidate_inode_parents(Inode *in)
+{
+ set<Dentry*>::iterator q = in->dn_set.begin();
+ while (q != in->dn_set.end()) {
+ Dentry *dn = *q++;
+ // FIXME: we play lots of unlink/link tricks when handling MDS replies,
+ // so in->dn_set doesn't always reflect the state of kernel's dcache.
+ _schedule_invalidate_dentry_callback(dn);
+ unlink(dn, false);
+ }
+}
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
@@ -3578,8 +3636,12 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient
in->uid = m->head.uid;
in->gid = m->head.gid;
}
+ bool deleted_inode = false;
if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
in->nlink = m->head.nlink;
+ if (in->nlink == 0 &&
+ (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
+ deleted_inode = true;
}
if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
m->xattrbl.length() &&
@@ -3633,6 +3695,10 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient
if (new_caps)
signal_cond_list(in->waitfor_caps);
+ // may drop inode's last ref
+ if (deleted_inode)
+ _invalidate_inode_parents(in);
+
m->put();
}
@@ -6319,6 +6385,17 @@ void Client::ll_register_ino_invalidate_cb(client_ino_callback_t cb, void *handl
async_ino_invalidator.start();
}
+void Client::ll_register_dentry_invalidate_cb(client_dentry_callback_t cb, void *handle)
+{
+ Mutex::Locker l(client_lock);
+ ldout(cct, 10) << "ll_register_dentry_invalidate_cb cb " << (void*)cb << " p " << (void*)handle << dendl;
+ if (cb == NULL)
+ return;
+ dentry_invalidate_cb = cb;
+ dentry_invalidate_cb_handle = handle;
+ async_dentry_invalidator.start();
+}
+
void Client::ll_register_getgroups_cb(client_getgroups_callback_t cb, void *handle)
{
Mutex::Locker l(client_lock);
diff --git a/src/client/Client.h b/src/client/Client.h
index c7c9cef0e0c..df59f235de4 100644
--- a/src/client/Client.h
+++ b/src/client/Client.h
@@ -120,6 +120,9 @@ struct MetaRequest;
typedef void (*client_ino_callback_t)(void *handle, vinodeno_t ino, int64_t off, int64_t len);
+typedef void (*client_dentry_callback_t)(void *handle, vinodeno_t dirino,
+ vinodeno_t ino, string& name);
+
typedef int (*client_getgroups_callback_t)(void *handle, uid_t uid, gid_t **sgids);
// ========================================================
@@ -211,10 +214,14 @@ class Client : public Dispatcher {
client_ino_callback_t ino_invalidate_cb;
void *ino_invalidate_cb_handle;
+ client_dentry_callback_t dentry_invalidate_cb;
+ void *dentry_invalidate_cb_handle;
+
client_getgroups_callback_t getgroups_cb;
void *getgroups_cb_handle;
Finisher async_ino_invalidator;
+ Finisher async_dentry_invalidator;
Context *tick_event;
utime_t last_cap_renew;
@@ -270,7 +277,8 @@ public:
void connect_mds_targets(int mds);
void send_request(MetaRequest *request, MetaSession *session);
MClientRequest *build_client_request(MetaRequest *request);
- void kick_requests(MetaSession *session, bool signal);
+ void kick_requests(MetaSession *session);
+ void kick_requests_closed(MetaSession *session);
void handle_client_request_forward(MClientRequestForward *reply);
void handle_client_reply(MClientReply *reply);
@@ -357,6 +365,7 @@ protected:
friend class C_Client_PutInode; // calls put_inode()
friend class C_Client_CacheInvalidate; // calls ino_invalidate_cb
+ friend class C_Client_DentryInvalidate; // calls dentry_invalidate_cb
//int get_cache_size() { return lru.lru_get_size(); }
//void set_cache_size(int m) { lru.lru_set_max(m); }
@@ -459,6 +468,10 @@ protected:
void finish_cap_snap(Inode *in, CapSnap *capsnap, int used);
void _flushed_cap_snap(Inode *in, snapid_t seq);
+ void _schedule_invalidate_dentry_callback(Dentry *dn);
+ void _async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name);
+ void _invalidate_inode_parents(Inode *in);
+
void _schedule_invalidate_callback(Inode *in, int64_t off, int64_t len, bool keep_caps);
void _invalidate_inode_cache(Inode *in, bool keep_caps);
void _invalidate_inode_cache(Inode *in, int64_t off, int64_t len, bool keep_caps);
@@ -735,6 +748,8 @@ public:
void ll_register_ino_invalidate_cb(client_ino_callback_t cb, void *handle);
+ void ll_register_dentry_invalidate_cb(client_dentry_callback_t cb, void *handle);
+
void ll_register_getgroups_cb(client_getgroups_callback_t cb, void *handle);
};
diff --git a/src/client/Makefile.am b/src/client/Makefile.am
new file mode 100644
index 00000000000..53107eba517
--- /dev/null
+++ b/src/client/Makefile.am
@@ -0,0 +1,35 @@
+libclient_la_SOURCES = \
+ client/Client.cc \
+ client/Inode.cc \
+ client/Dentry.cc \
+ client/MetaRequest.cc \
+ client/ClientSnapRealm.cc \
+ client/MetaSession.cc \
+ client/Trace.cc
+libclient_la_LIBADD = $(LIBOSDC) $(LIBEDIT_LIBS)
+noinst_LTLIBRARIES += libclient.la
+
+noinst_HEADERS += \
+ client/Client.h \
+ client/Dentry.h \
+ client/Dir.h \
+ client/Fh.h \
+ client/Inode.h \
+ client/MetaRequest.h \
+ client/MetaSession.h \
+ client/ClientSnapRealm.h \
+ client/SyntheticClient.h \
+ client/Trace.h \
+ client/ioctl.h \
+ client/ObjecterWriteback.h
+
+if WITH_FUSE
+libclient_fuse_la_SOURCES = client/fuse_ll.cc
+libclient_fuse_la_LIBADD = libclient.la -lfuse
+noinst_LTLIBRARIES += libclient_fuse.la
+noinst_HEADERS += client/fuse_ll.h
+endif
+
+ceph_test_ioctls_SOURCES = client/test_ioctls.c
+bin_DEBUGPROGRAMS += ceph_test_ioctls
+
diff --git a/src/client/SyntheticClient.cc b/src/client/SyntheticClient.cc
index cb211f5461b..6b2c7b10565 100644
--- a/src/client/SyntheticClient.cc
+++ b/src/client/SyntheticClient.cc
@@ -267,7 +267,7 @@ void parse_syn_options(vector<const char*>& args)
}
-SyntheticClient::SyntheticClient(Client *client, int w)
+SyntheticClient::SyntheticClient(Client *client, int w)
{
this->client = client;
whoami = w;
@@ -282,7 +282,7 @@ SyntheticClient::SyntheticClient(Client *client, int w)
this->iargs = syn_iargs;
this->sargs = syn_sargs;
- run_start = ceph_clock_now(g_ceph_context);
+ run_start = ceph_clock_now(client->cct);
}
@@ -332,7 +332,7 @@ int SyntheticClient::run()
return -1;
}
- //run_start = ceph_clock_now(g_ceph_context);
+ //run_start = ceph_clock_now(client->cct);
run_until = utime_t(0,0);
dout(5) << "run" << dendl;
@@ -442,7 +442,7 @@ int SyntheticClient::run()
iargs.pop_front();
if (iarg1 && run_me()) {
dout(2) << "sleepuntil " << iarg1 << dendl;
- utime_t at = ceph_clock_now(g_ceph_context) - run_start;
+ utime_t at = ceph_clock_now(client->cct) - run_start;
if (at.sec() < iarg1)
sleep(iarg1 - at.sec());
}
@@ -797,14 +797,14 @@ int SyntheticClient::run()
if (iarg1 == 0) iarg1 = 1; // play trace at least once!
for (int i=0; i<iarg1; i++) {
- utime_t start = ceph_clock_now(g_ceph_context);
+ utime_t start = ceph_clock_now(client->cct);
if (time_to_stop()) break;
play_trace(t, prefix, !playdata);
if (time_to_stop()) break;
if (iarg1 > 1) clean_dir(prefix); // clean only if repeat
- utime_t lat = ceph_clock_now(g_ceph_context);
+ utime_t lat = ceph_clock_now(client->cct);
lat -= start;
dout(0) << " trace " << tfile << " loop " << (i+1) << "/" << iarg1 << " done in " << (double)lat << " seconds" << dendl;
@@ -1012,7 +1012,7 @@ int SyntheticClient::play_trace(Trace& t, string& prefix, bool metadata_only)
char buf[1024];
char buf2[1024];
- utime_t start = ceph_clock_now(g_ceph_context);
+ utime_t start = ceph_clock_now(client->cct);
hash_map<int64_t, int64_t> open_files;
hash_map<int64_t, dir_result_t*> open_dirs;
@@ -1046,7 +1046,7 @@ int SyntheticClient::play_trace(Trace& t, string& prefix, bool metadata_only)
Cond cond;
bool ack;
bool safe;
- C_GatherBuilder safeg(g_ceph_context, new C_SafeCond(&lock, &cond, &safe));
+ C_GatherBuilder safeg(client->cct, new C_SafeCond(&lock, &cond, &safe));
Context *safegref = safeg.new_sub(); // take a ref
while (!t.end()) {
@@ -1436,7 +1436,7 @@ int SyntheticClient::play_trace(Trace& t, string& prefix, bool metadata_only)
bufferlist bl;
bl.push_back(bp);
SnapContext snapc;
- client->objecter->write(oid, oloc, off, len, snapc, bl, ceph_clock_now(g_ceph_context), 0,
+ client->objecter->write(oid, oloc, off, len, snapc, bl, ceph_clock_now(client->cct), 0,
new C_SafeCond(&lock, &cond, &ack),
safeg.new_sub());
safeg.activate();
@@ -1452,7 +1452,7 @@ int SyntheticClient::play_trace(Trace& t, string& prefix, bool metadata_only)
object_locator_t oloc(CEPH_DATA_RULE);
lock.Lock();
SnapContext snapc;
- client->objecter->zero(oid, oloc, off, len, snapc, ceph_clock_now(g_ceph_context), 0,
+ client->objecter->zero(oid, oloc, off, len, snapc, ceph_clock_now(client->cct), 0,
new C_SafeCond(&lock, &cond, &ack),
safeg.new_sub());
safeg.activate();
@@ -1762,9 +1762,9 @@ int SyntheticClient::read_dirs(const char *basedir, int dirs, int files, int dep
dout(3) << "read_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << dendl;
list<string> contents;
- utime_t s = ceph_clock_now(g_ceph_context);
+ utime_t s = ceph_clock_now(client->cct);
int r = client->getdir(basedir, contents);
- utime_t e = ceph_clock_now(g_ceph_context);
+ utime_t e = ceph_clock_now(client->cct);
e -= s;
if (r < 0) {
dout(0) << "read_dirs couldn't readdir " << basedir << ", stopping" << dendl;
@@ -1773,12 +1773,12 @@ int SyntheticClient::read_dirs(const char *basedir, int dirs, int files, int dep
for (int i=0; i<files; i++) {
snprintf(d, sizeof(d), "%s/file.%d", basedir, i);
- utime_t s = ceph_clock_now(g_ceph_context);
+ utime_t s = ceph_clock_now(client->cct);
if (client->lstat(d, &st) < 0) {
dout(2) << "read_dirs failed stat on " << d << ", stopping" << dendl;
return -1;
}
- utime_t e = ceph_clock_now(g_ceph_context);
+ utime_t e = ceph_clock_now(client->cct);
e -= s;
}
@@ -1816,7 +1816,7 @@ int SyntheticClient::make_files(int num, int count, int priv, bool more)
// files
struct stat st;
- utime_t start = ceph_clock_now(g_ceph_context);
+ utime_t start = ceph_clock_now(client->cct);
for (int c=0; c<count; c++) {
for (int n=0; n<num; n++) {
snprintf(d, sizeof(d), "dir.%d.run%d/file.client%d.%d", priv ? whoami:0, c, whoami, n);
@@ -1833,7 +1833,7 @@ int SyntheticClient::make_files(int num, int count, int priv, bool more)
if (time_to_stop()) return 0;
}
}
- utime_t end = ceph_clock_now(g_ceph_context);
+ utime_t end = ceph_clock_now(client->cct);
end -= start;
dout(0) << "makefiles time is " << end << " or " << ((double)end / (double)num) <<" per file" << dendl;
@@ -1851,24 +1851,24 @@ int SyntheticClient::link_test()
client->mkdir("orig", 0755);
client->mkdir("copy", 0755);
- utime_t start = ceph_clock_now(g_ceph_context);
+ utime_t start = ceph_clock_now(client->cct);
for (int i=0; i<num; i++) {
snprintf(d, sizeof(d), "orig/file.%d", i);
client->mknod(d, 0755);
}
- utime_t end = ceph_clock_now(g_ceph_context);
+ utime_t end = ceph_clock_now(client->cct);
end -= start;
dout(0) << "orig " << end << dendl;
// link
- start = ceph_clock_now(g_ceph_context);
+ start = ceph_clock_now(client->cct);
for (int i=0; i<num; i++) {
snprintf(d, sizeof(d), "orig/file.%d", i);
snprintf(e, sizeof(e), "copy/file.%d", i);
client->link(d, e);
}
- end = ceph_clock_now(g_ceph_context);
+ end = ceph_clock_now(client->cct);
end -= start;
dout(0) << "copy " << end << dendl;
@@ -1982,7 +1982,7 @@ int SyntheticClient::write_file(string& fn, int size, loff_t wrsize) // size i
return fd;
}
- utime_t from = ceph_clock_now(g_ceph_context);
+ utime_t from = ceph_clock_now(client->cct);
utime_t start = from;
uint64_t bytes = 0, total = 0;
@@ -2010,7 +2010,7 @@ int SyntheticClient::write_file(string& fn, int size, loff_t wrsize) // size i
bytes += wrsize;
total += wrsize;
- utime_t now = ceph_clock_now(g_ceph_context);
+ utime_t now = ceph_clock_now(client->cct);
if (now - from >= 1.0) {
double el = now - from;
dout(0) << "write " << (bytes / el / 1048576.0) << " MB/sec" << dendl;
@@ -2021,7 +2021,7 @@ int SyntheticClient::write_file(string& fn, int size, loff_t wrsize) // size i
client->fsync(fd, true);
- utime_t stop = ceph_clock_now(g_ceph_context);
+ utime_t stop = ceph_clock_now(client->cct);
double el = stop - start;
dout(0) << "write total " << (total / el / 1048576.0) << " MB/sec ("
<< total << " bytes in " << el << " seconds)" << dendl;
@@ -2098,7 +2098,7 @@ int SyntheticClient::read_file(const std::string& fn, int size,
return fd;
}
- utime_t from = ceph_clock_now(g_ceph_context);
+ utime_t from = ceph_clock_now(client->cct);
utime_t start = from;
uint64_t bytes = 0, total = 0;
@@ -2114,7 +2114,7 @@ int SyntheticClient::read_file(const std::string& fn, int size,
bytes += rdsize;
total += rdsize;
- utime_t now = ceph_clock_now(g_ceph_context);
+ utime_t now = ceph_clock_now(client->cct);
if (now - from >= 1.0) {
double el = now - from;
dout(0) << "read " << (bytes / el / 1048576.0) << " MB/sec" << dendl;
@@ -2146,7 +2146,7 @@ int SyntheticClient::read_file(const std::string& fn, int size,
dout(0) << " + " << (bad-1) << " other bad 16-byte bits in this block" << dendl;
}
- utime_t stop = ceph_clock_now(g_ceph_context);
+ utime_t stop = ceph_clock_now(client->cct);
double el = stop - start;
dout(0) << "read total " << (total / el / 1048576.0) << " MB/sec ("
<< total << " bytes in " << el << " seconds)" << dendl;
@@ -2181,7 +2181,7 @@ public:
int SyntheticClient::create_objects(int nobj, int osize, int inflight)
{
// divy up
- int numc = g_conf->num_client ? g_conf->num_client : 1;
+ int numc = client->cct->_conf->num_client ? client->cct->_conf->num_client : 1;
int start, inc, end;
@@ -2226,9 +2226,9 @@ int SyntheticClient::create_objects(int nobj, int osize, int inflight)
}
dout(10) << "writing " << oid << dendl;
- starts.push_back(ceph_clock_now(g_ceph_context));
+ starts.push_back(ceph_clock_now(client->cct));
client->client_lock.Lock();
- client->objecter->write(oid, oloc, 0, osize, snapc, bl, ceph_clock_now(g_ceph_context), 0,
+ client->objecter->write(oid, oloc, 0, osize, snapc, bl, ceph_clock_now(client->cct), 0,
new C_Ref(lock, cond, &unack),
new C_Ref(lock, cond, &unsafe));
client->client_lock.Unlock();
@@ -2240,7 +2240,7 @@ int SyntheticClient::create_objects(int nobj, int osize, int inflight)
}
lock.Unlock();
- utime_t lat = ceph_clock_now(g_ceph_context);
+ utime_t lat = ceph_clock_now(client->cct);
lat -= starts.front();
starts.pop_front();
}
@@ -2323,7 +2323,7 @@ int SyntheticClient::object_rw(int nobj, int osize, int wrpc,
SnapContext snapc;
client->client_lock.Lock();
- utime_t start = ceph_clock_now(g_ceph_context);
+ utime_t start = ceph_clock_now(client->cct);
if (write) {
dout(10) << "write to " << oid << dendl;
@@ -2339,7 +2339,7 @@ int SyntheticClient::object_rw(int nobj, int osize, int wrpc,
op.op.op = CEPH_OSD_OP_STARTSYNC;
m.ops.push_back(op);
}
- client->objecter->mutate(oid, oloc, m, snapc, ceph_clock_now(g_ceph_context), 0,
+ client->objecter->mutate(oid, oloc, m, snapc, ceph_clock_now(client->cct), 0,
NULL, new C_Ref(lock, cond, &unack));
/*client->objecter->write(oid, layout, 0, osize, snapc, bl, 0,
new C_Ref(lock, cond, &unack),
@@ -2359,7 +2359,7 @@ int SyntheticClient::object_rw(int nobj, int osize, int wrpc,
}
lock.Unlock();
- utime_t lat = ceph_clock_now(g_ceph_context);
+ utime_t lat = ceph_clock_now(client->cct);
lat -= start;
if (client->logger) {
if (write)
@@ -3292,7 +3292,7 @@ void SyntheticClient::import_find(const char *base, const char *find, bool data)
if (sp < 0) dirnum++;
//dout(0) << "leading dir " << filename << " " << dirnum << dendl;
- if (dirnum % g_conf->num_client != client->get_nodeid()) {
+ if (dirnum % client->cct->_conf->num_client != client->get_nodeid()) {
dout(20) << "skipping leading dir " << dirnum << " " << filename << dendl;
continue;
}
diff --git a/src/client/SyntheticClient.h b/src/client/SyntheticClient.h
index 3bbcb73cb1a..f9f4e3de0fb 100644
--- a/src/client/SyntheticClient.h
+++ b/src/client/SyntheticClient.h
@@ -205,7 +205,7 @@ class SyntheticClient {
}
bool time_to_stop() {
- utime_t now = ceph_clock_now(g_ceph_context);
+ utime_t now = ceph_clock_now(client->cct);
if (0) cout << "time_to_stop .. now " << now
<< " until " << run_until
<< " start " << run_start
@@ -271,6 +271,7 @@ class SyntheticClient {
void mksnap(const char *base, const char *name);
void rmsnap(const char *base, const char *name);
void mksnapfile(const char *dir);
+
};
#endif
diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc
index 0c78557f041..88f727e454e 100644
--- a/src/client/fuse_ll.cc
+++ b/src/client/fuse_ll.cc
@@ -12,7 +12,7 @@
*
*/
-#define FUSE_USE_VERSION 26
+#define FUSE_USE_VERSION 30
#include <fuse/fuse.h>
#include <fuse/fuse_lowlevel.h>
@@ -330,7 +330,7 @@ static void fuse_ll_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *
if (r == 0) {
fi->fh = (long)fh;
#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8)
- if (g_conf->fuse_use_invalidate_cb)
+ if (cfuse->client->cct->_conf->fuse_use_invalidate_cb)
fi->keep_cache = 1;
#endif
fuse_reply_open(req, fi);
@@ -551,7 +551,7 @@ static int getgroups_cb(void *handle, uid_t uid, gid_t **sgids)
}
#endif
-static void invalidate_cb(void *handle, vinodeno_t vino, int64_t off, int64_t len)
+static void ino_invalidate_cb(void *handle, vinodeno_t vino, int64_t off, int64_t len)
{
#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8)
CephFuse::Handle *cfuse = (CephFuse::Handle *)handle;
@@ -560,6 +560,19 @@ static void invalidate_cb(void *handle, vinodeno_t vino, int64_t off, int64_t le
#endif
}
+static void dentry_invalidate_cb(void *handle, vinodeno_t dirino,
+ vinodeno_t ino, string& name)
+{
+ CephFuse::Handle *cfuse = (CephFuse::Handle *)handle;
+ fuse_ino_t fdirino = cfuse->make_fake_ino(dirino.ino, dirino.snapid);
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 9)
+ fuse_ino_t fino = cfuse->make_fake_ino(ino.ino, ino.snapid);
+ fuse_lowlevel_notify_delete(cfuse->ch, fdirino, fino, name.c_str(), name.length());
+#elif FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8)
+ fuse_lowlevel_notify_inval_entry(cfuse->ch, fdirino, name.c_str(), name.length());
+#endif
+}
+
static void do_init(void *data, fuse_conn_info *bar)
{
CephFuse::Handle *cfuse = (CephFuse::Handle *)data;
@@ -673,24 +686,24 @@ int CephFuse::Handle::init(int argc, const char *argv[])
newargv[newargc++] = argv[0];
newargv[newargc++] = "-f"; // stay in foreground
- if (g_conf->fuse_allow_other) {
+ if (client->cct->_conf->fuse_allow_other) {
newargv[newargc++] = "-o";
newargv[newargc++] = "allow_other";
}
- if (g_conf->fuse_default_permissions) {
+ if (client->cct->_conf->fuse_default_permissions) {
newargv[newargc++] = "-o";
newargv[newargc++] = "default_permissions";
}
- if (g_conf->fuse_big_writes) {
+ if (client->cct->_conf->fuse_big_writes) {
newargv[newargc++] = "-o";
newargv[newargc++] = "big_writes";
}
- if (g_conf->fuse_atomic_o_trunc) {
+ if (client->cct->_conf->fuse_atomic_o_trunc) {
newargv[newargc++] = "-o";
newargv[newargc++] = "atomic_o_trunc";
}
- if (g_conf->fuse_debug)
+ if (client->cct->_conf->fuse_debug)
newargv[newargc++] = "-d";
for (int argctr = 1; argctr < argc; argctr++)
@@ -743,9 +756,10 @@ int CephFuse::Handle::init(int argc, const char *argv[])
client->ll_register_getgroups_cb(getgroups_cb, this);
*/
+ client->ll_register_dentry_invalidate_cb(dentry_invalidate_cb, this);
- if (g_conf->fuse_use_invalidate_cb)
- client->ll_register_ino_invalidate_cb(invalidate_cb, this);
+ if (client->cct->_conf->fuse_use_invalidate_cb)
+ client->ll_register_ino_invalidate_cb(ino_invalidate_cb, this);
done:
fuse_opt_free_args(&args);
diff --git a/src/client/hadoop/CephFSInterface.cc b/src/client/hadoop/CephFSInterface.cc
deleted file mode 100644
index d5a3c8f4fcd..00000000000
--- a/src/client/hadoop/CephFSInterface.cc
+++ /dev/null
@@ -1,993 +0,0 @@
-// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-#include "CephFSInterface.h"
-#include "include/cephfs/libcephfs.h"
-#include "common/ceph_argparse.h"
-#include "common/config.h"
-#include "msg/SimpleMessenger.h"
-
-#include <arpa/inet.h>
-#include <sys/stat.h>
-#include <sys/statvfs.h>
-
-#define dout_subsys ceph_subsys_hadoop
-
-union ceph_mount_union_t {
- struct ceph_mount_info *cmount;
- jlong cjlong;
-};
-
-static void set_ceph_mount_info(JNIEnv *env, jobject obj, struct ceph_mount_info *cmount)
-{
- jclass cls = env->GetObjectClass(obj);
- if (cls == NULL)
- return;
- jfieldID fid = env->GetFieldID(cls, "cluster", "J");
- if (fid == NULL)
- return;
- ceph_mount_union_t ceph_mount_union;
- ceph_mount_union.cjlong = 0;
- ceph_mount_union.cmount = cmount;
- env->SetLongField(obj, fid, ceph_mount_union.cjlong);
-}
-
-static struct ceph_mount_info *get_ceph_mount_t(JNIEnv *env, jobject obj)
-{
- jclass cls = env->GetObjectClass(obj);
- jfieldID fid = env->GetFieldID(cls, "cluster", "J");
- if (fid == NULL)
- return NULL;
- ceph_mount_union_t ceph_mount_union;
- ceph_mount_union.cjlong = env->GetLongField(obj, fid);
- return ceph_mount_union.cmount;
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_initializeClient
- * Signature: (Ljava/lang/String;I)Z
- *
- * Performs any necessary setup to allow general use of the filesystem.
- * Inputs:
- * jstring args -- a command-line style input of Ceph config params
- * jint block_size -- the size in bytes to use for blocks
- * Returns: true on success, false otherwise
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1initializeClient
- (JNIEnv *env, jobject obj, jstring j_args, jint block_size)
-{
- // Convert Java argument string to argv
- const char *c_args = env->GetStringUTFChars(j_args, 0);
- if (c_args == NULL)
- return false; //out of memory!
- string cppargs(c_args);
- char b[cppargs.length()+1];
- strcpy(b, cppargs.c_str());
- env->ReleaseStringUTFChars(j_args, c_args);
- std::vector<const char*> args;
- char *p = b;
- while (*p) {
- args.push_back(p);
- while (*p && *p != ' ')
- p++;
- if (!*p)
- break;
- *p++ = 0;
- while (*p && *p == ' ')
- p++;
- }
-
- // parse the arguments
- bool set_local_writes = false;
- std::string mount_root, val;
- for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
- if (ceph_argparse_witharg(args, i, &val, "mount_root", (char*)NULL)) {
- mount_root = val;
- } else if (ceph_argparse_flag(args, i, "set_local_pg", (char*)NULL)) {
- set_local_writes = true;
- } else {
- ++i;
- }
- }
-
- // connect to the cmount
- struct ceph_mount_info *cmount;
- int ret = ceph_create(&cmount, NULL);
- if (ret)
- return false;
- ceph_conf_read_file(cmount, NULL); // read config file from the default location
- ceph_conf_parse_argv(cmount, args.size(), &args[0]);
- CephContext *cct = ceph_get_mount_context(cmount);
- ldout(cct, 3) << "CephFSInterface: mounting filesystem...:" << dendl;
-
- ret = ceph_mount(cmount, mount_root.c_str());
- if (ret)
- return false;
-
- ceph_localize_reads(cmount, true);
- ceph_set_default_file_stripe_unit(cmount, block_size);
- ceph_set_default_object_size(cmount, block_size);
-
- if (set_local_writes) {
- ceph_set_default_preferred_pg(cmount, ceph_get_local_osd(cmount));
- }
-
- set_ceph_mount_info(env, obj, cmount);
- return true;
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_getcwd
- * Signature: (J)Ljava/lang/String;
- *
- * Returns the current working directory.(absolute) as a jstring
- */
-JNIEXPORT jstring JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1getcwd
- (JNIEnv *env, jobject obj)
-{
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- CephContext *cct = ceph_get_mount_context(cmount);
- ldout(cct, 10) << "CephFSInterface: In getcwd" << dendl;
- jstring j_path = env->NewStringUTF(ceph_getcwd(cmount));
- return j_path;
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_setcwd
- * Signature: (Ljava/lang/String;)Z
- *
- * Changes the working directory.
- * Inputs:
- * jstring j_path: The path (relative or absolute) to switch to
- * Returns: true on success, false otherwise.
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1setcwd
-(JNIEnv *env, jobject obj, jstring j_path)
-{
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- CephContext *cct = ceph_get_mount_context(cmount);
- ldout(cct, 10) << "CephFSInterface: In setcwd" << dendl;
-
- const char *c_path = env->GetStringUTFChars(j_path, 0);
- if (c_path == NULL)
- return false;
- int ret = ceph_chdir(cmount, c_path);
- env->ReleaseStringUTFChars(j_path, c_path);
- return ret ? JNI_FALSE : JNI_TRUE;
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_rmdir
- * Signature: (Ljava/lang/String;)Z
- *
- * Given a path to a directory, removes the directory.if empty.
- * Inputs:
- * jstring j_path: The path (relative or absolute) to the directory
- * Returns: true on successful delete; false otherwise
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1rmdir
- (JNIEnv *env, jobject obj, jstring j_path)
-{
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- CephContext *cct = ceph_get_mount_context(cmount);
- ldout(cct, 10) << "CephFSInterface: In rmdir" << dendl;
-
- const char *c_path = env->GetStringUTFChars(j_path, 0);
- if(c_path == NULL)
- return false;
- int ret = ceph_rmdir(cmount, c_path);
- env->ReleaseStringUTFChars(j_path, c_path);
- return ret ? JNI_FALSE : JNI_TRUE;
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_unlink
- * Signature: (Ljava/lang/String;)Z
- * Given a path, unlinks it.
- * Inputs:
- * jstring j_path: The path (relative or absolute) to the file or empty dir
- * Returns: true if the unlink occurred, false otherwise.
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1unlink
- (JNIEnv *env, jobject obj, jstring j_path)
-{
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- CephContext *cct = ceph_get_mount_context(cmount);
- const char *c_path = env->GetStringUTFChars(j_path, 0);
- if (c_path == NULL)
- return false;
- ldout(cct, 10) << "CephFSInterface: In unlink for path " << c_path << ":" << dendl;
- int ret = ceph_unlink(cmount, c_path);
- env->ReleaseStringUTFChars(j_path, c_path);
- return ret ? JNI_FALSE : JNI_TRUE;
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_rename
- * Signature: (Ljava/lang/String;Ljava/lang/String;)Z
- * Changes a given path name to a new name.
- * Inputs:
- * jstring j_from: The path whose name you want to change.
- * jstring j_to: The new name for the path.
- * Returns: true if the rename occurred, false otherwise
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1rename
- (JNIEnv *env, jobject obj, jstring j_from, jstring j_to)
-{
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- CephContext *cct = ceph_get_mount_context(cmount);
- ldout(cct, 10) << "CephFSInterface: In rename" << dendl;
- const char *c_from = env->GetStringUTFChars(j_from, 0);
- if (c_from == NULL)
- return false;
- const char *c_to = env->GetStringUTFChars(j_to, 0);
- if (c_to == NULL) {
- env->ReleaseStringUTFChars(j_from, c_from);
- return false;
- }
- struct stat stbuf;
- int ret = ceph_lstat(cmount, c_to, &stbuf);
- if (ret != -ENOENT) {
- // Hadoop doesn't want to overwrite files in a rename.
- env->ReleaseStringUTFChars(j_from, c_from);
- env->ReleaseStringUTFChars(j_to, c_to);
- return JNI_FALSE;
- }
-
- ret = ceph_rename(cmount, c_from, c_to);
- return ret ? JNI_FALSE : JNI_TRUE;
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_exists
- * Signature: (Ljava/lang/String;)Z
- * Returns true if it the input path exists, false
- * if it does not or there is an unexpected failure.
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1exists
-(JNIEnv *env, jobject obj, jstring j_path)
-{
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- CephContext *cct = ceph_get_mount_context(cmount);
- ldout(cct, 10) << "CephFSInterface: In exists" << dendl;
-
- struct stat stbuf;
-
- const char *c_path = env->GetStringUTFChars(j_path, 0);
- if (c_path == NULL)
- return false;
- ldout(cct, 10) << "Attempting lstat with file " << c_path << ":" << dendl;
-
- int ret = ceph_lstat(cmount, c_path, &stbuf);
- ldout(cct, 10) << "result is " << ret << dendl;
- env->ReleaseStringUTFChars(j_path, c_path);
- if (ret < 0) {
- ldout(cct, 10) << "Returning false (file does not exist)" << dendl;
- return JNI_FALSE;
- }
- else {
- ldout(cct, 10) << "Returning true (file exists)" << dendl;
- return JNI_TRUE;
- }
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_getblocksize
- * Signature: (Ljava/lang/String;)J
- * Get the block size for a given path.
- * Input:
- * j_string j_path: The path (relative or absolute) you want
- * the block size for.
- * Returns: block size (as a long) if the path exists, otherwise a negative
- * number corresponding to the standard C++ error codes (which are positive).
- */
-JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1getblocksize
- (JNIEnv *env, jobject obj, jstring j_path)
-{
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- CephContext *cct = ceph_get_mount_context(cmount);
- ldout(cct, 10) << "In getblocksize" << dendl;
-
- //struct stat stbuf;
-
- jlong result;
-
- const char *c_path = env->GetStringUTFChars(j_path, 0);
- if (c_path == NULL)
- return -ENOMEM;
- // we need to open the file to retrieve the stripe size
- ldout(cct, 10) << "CephFSInterface: getblocksize: opening file" << dendl;
- int fh = ceph_open(cmount, c_path, O_RDONLY, 0);
- env->ReleaseStringUTFChars(j_path, c_path);
- if (fh < 0)
- return fh;
-
- result = ceph_get_file_stripe_unit(cmount, fh);
-
- int close_result = ceph_close(cmount, fh);
- if (close_result < 0)
- return close_result;
-
- return result;
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_isfile
- * Signature: (Ljava/lang/String;)Z
- * Returns true if the given path is a file; false otherwise.
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1isfile
- (JNIEnv *env, jobject obj, jstring j_path)
-{
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- CephContext *cct = ceph_get_mount_context(cmount);
- ldout(cct, 10) << "In isfile" << dendl;
-
- struct stat stbuf;
-
- const char *c_path = env->GetStringUTFChars(j_path, 0);
- if (c_path == NULL)
- return false;
- int ret = ceph_lstat(cmount, c_path, &stbuf);
- env->ReleaseStringUTFChars(j_path, c_path);
-
- // if the stat call failed, it's definitely not a file...
- if (ret < 0)
- return false;
-
- // check the stat result
- return !!S_ISREG(stbuf.st_mode);
-}
-
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_isdirectory
- * Signature: (Ljava/lang/String;)Z
- * Returns true if the given path is a directory, false otherwise.
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1isdirectory
- (JNIEnv *env, jobject obj, jstring j_path)
-{
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- CephContext *cct = ceph_get_mount_context(cmount);
- ldout(cct, 10) << "In isdirectory" << dendl;
-
- struct stat stbuf;
-
- const char *c_path = env->GetStringUTFChars(j_path, 0);
- if (c_path == NULL)
- return false;
- int result = ceph_lstat(cmount, c_path, &stbuf);
- env->ReleaseStringUTFChars(j_path, c_path);
-
- // if the stat call failed, it's definitely not a directory...
- if (result < 0)
- return JNI_FALSE;
-
- // check the stat result
- return !!S_ISDIR(stbuf.st_mode);
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_getdir
- * Signature: (Ljava/lang/String;)[Ljava/lang/String;
- * Get the contents of a given directory.
- * Inputs:
- * jstring j_path: The path (relative or absolute) to the directory.
- * Returns: A Java String[] of the contents of the directory, or
- * NULL if there is an error (ie, path is not a dir). This listing
- * will not contain . or .. entries.
- */
-JNIEXPORT jobjectArray JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1getdir
-(JNIEnv *env, jobject obj, jstring j_path)
-{
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- CephContext *cct = ceph_get_mount_context(cmount);
- ldout(cct, 10) << "In getdir" << dendl;
-
- // get the directory listing
- list<string> contents;
- const char *c_path = env->GetStringUTFChars(j_path, 0);
- if (c_path == NULL) return NULL;
- struct ceph_dir_result *dirp;
- int r;
- r = ceph_opendir(cmount, c_path, &dirp);
- if (r<0) {
- env->ReleaseStringUTFChars(j_path, c_path);
- return NULL;
- }
- int buflen = 100; //good default?
- char *buf = new char[buflen];
- string *ent;
- int bufpos;
- while (1) {
- r = ceph_getdnames(cmount, dirp, buf, buflen);
- if (r==-ERANGE) { //expand the buffer
- delete [] buf;
- buflen *= 2;
- buf = new char[buflen];
- continue;
- }
- if (r<=0) break;
-
- //if we make it here, we got at least one name
- bufpos = 0;
- while (bufpos<r) {//make new strings and add them to listing
- ent = new string(buf+bufpos);
- if (ent->compare(".") && ent->compare(".."))
- //we DON'T want to include dot listings; Hadoop gets confused
- contents.push_back(*ent);
- bufpos+=ent->size()+1;
- delete ent;
- }
- }
- delete [] buf;
- ceph_closedir(cmount, dirp);
- env->ReleaseStringUTFChars(j_path, c_path);
-
- if (r < 0) return NULL;
-
- // Create a Java String array of the size of the directory listing
- jclass stringClass = env->FindClass("java/lang/String");
- if (stringClass == NULL) {
- ldout(cct, 0) << "ERROR: java String class not found; dying a horrible, painful death" << dendl;
- assert(0);
- }
- jobjectArray dirListingStringArray = (jobjectArray) env->NewObjectArray(contents.size(), stringClass, NULL);
- if(dirListingStringArray == NULL) return NULL;
-
- // populate the array with the elements of the directory list
- int i = 0;
- for (list<string>::iterator it = contents.begin();
- it != contents.end();
- ++it) {
- env->SetObjectArrayElement(dirListingStringArray, i,
- env->NewStringUTF(it->c_str()));
- ++i;
- }
-
- return dirListingStringArray;
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_mkdirs
- * Signature: (Ljava/lang/String;I)I
- * Create the specified directory and any required intermediate ones with the
- * given mode.
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1mkdirs
-(JNIEnv *env, jobject obj, jstring j_path, jint mode)
-{
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- CephContext *cct = ceph_get_mount_context(cmount);
- ldout(cct, 10) << "In Hadoop mk_dirs" << dendl;
-
- //get c-style string and make the call, clean up the string...
- jint result;
- const char *c_path = env->GetStringUTFChars(j_path, 0);
- if (c_path == NULL)
- return -ENOMEM;
- result = ceph_mkdirs(cmount, c_path, mode);
- env->ReleaseStringUTFChars(j_path, c_path);
-
- //...and return
- return result;
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_open_for_append
- * Signature: (Ljava/lang/String;)I
- * Open a file to append. If the file does not exist, it will be created.
- * Opening a dir is possible but may have bad results.
- * Inputs:
- * jstring j_path: The path to open.
- * Returns: a jint filehandle, or a number<0 if an error occurs.
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1open_1for_1append
-(JNIEnv *env, jobject obj, jstring j_path)
-{
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- CephContext *cct = ceph_get_mount_context(cmount);
- ldout(cct, 10) << "In hadoop open_for_append" << dendl;
-
- jint result;
-
- const char *c_path = env->GetStringUTFChars(j_path, 0);
- if (c_path == NULL)
- return -ENOMEM;
- result = ceph_open(cmount, c_path, O_WRONLY|O_CREAT|O_APPEND, 0);
- env->ReleaseStringUTFChars(j_path, c_path);
-
- return result;
-}
-
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_open_for_read
- * Signature: (Ljava/lang/String;)I
- * Open a file for reading.
- * Opening a dir is possible but may have bad results.
- * Inputs:
- * jstring j_path: The path to open.
- * Returns: a jint filehandle, or a number<0 if an error occurs.
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1open_1for_1read
- (JNIEnv *env, jobject obj, jstring j_path)
-{
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- CephContext *cct = ceph_get_mount_context(cmount);
- ldout(cct, 10) << "In open_for_read" << dendl;
-
- jint result;
-
- // open as read-only: flag = O_RDONLY
- const char *c_path = env->GetStringUTFChars(j_path, 0);
- if (c_path == NULL)
- return -ENOMEM;
- result = ceph_open(cmount, c_path, O_RDONLY, 0);
- env->ReleaseStringUTFChars(j_path, c_path);
-
- // returns file handle, or -1 on failure
- return result;
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_open_for_overwrite
- * Signature: (Ljava/lang/String;)I
- * Opens a file for overwriting; creates it if necessary.
- * Opening a dir is possible but may have bad results.
- * Inputs:
- * jstring j_path: The path to open.
- * jint mode: The mode to open with.
- * Returns: a jint filehandle, or a number<0 if an error occurs.
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1open_1for_1overwrite
- (JNIEnv *env, jobject obj, jstring j_path, jint mode)
-{
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- CephContext *cct = ceph_get_mount_context(cmount);
- ldout(cct, 10) << "In open_for_overwrite" << dendl;
-
- jint result;
- const char *c_path = env->GetStringUTFChars(j_path, 0);
- if (c_path == NULL)
- return -ENOMEM;
- result = ceph_open(cmount, c_path, O_WRONLY|O_CREAT|O_TRUNC, mode);
- env->ReleaseStringUTFChars(j_path, c_path);
-
- // returns file handle, or -1 on failure
- return result;
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_close
- * Signature: (I)I
- * Closes a given filehandle.
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1close
-(JNIEnv *env, jobject obj, jint fh)
-{
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- CephContext *cct = ceph_get_mount_context(cmount);
- ldout(cct, 10) << "In CephTalker::ceph_close" << dendl;
- return ceph_close(cmount, fh);
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_setPermission
- * Signature: (Ljava/lang/String;I)Z
- * Change the mode on a path.
- * Inputs:
- * jstring j_path: The path to change mode on.
- * jint j_new_mode: The mode to apply.
- * Returns: true if the mode is properly applied, false if there
- * is any error.
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1setPermission
-(JNIEnv *env, jobject obj, jstring j_path, jint j_new_mode)
-{
- const char *c_path = env->GetStringUTFChars(j_path, 0);
- if (c_path == NULL)
- return false;
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- int result = ceph_chmod(cmount, c_path, j_new_mode);
- env->ReleaseStringUTFChars(j_path, c_path);
-
- return (result==0);
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_kill_client
- * Signature: (J)Z
- *
- * Closes the Ceph client. This should be called before shutting down
- * (multiple times is okay but redundant).
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1kill_1client
- (JNIEnv *env, jobject obj)
-{
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- if (!cmount)
- return true;
- ceph_shutdown(cmount);
- set_ceph_mount_info(env, obj, NULL);
- return true;
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_stat
- * Signature: (Ljava/lang/String;Lorg/apache/hadoop/fs/ceph/CephFileSystem/Stat;)Z
- * Get the statistics on a path returned in a custom format defined
- * in CephTalker.
- * Inputs:
- * jstring j_path: The path to stat.
- * jobject j_stat: The stat object to fill.
- * Returns: true if the stat is successful, false otherwise.
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1stat
-(JNIEnv *env, jobject obj, jstring j_path, jobject j_stat)
-{
- //setup variables
- struct stat st;
- const char *c_path = env->GetStringUTFChars(j_path, 0);
- if (c_path == NULL) return false;
-
- jclass cls = env->GetObjectClass(j_stat);
- if (cls == NULL) return false;
- jfieldID c_size_id = env->GetFieldID(cls, "size", "J");
- if (c_size_id == NULL) return false;
- jfieldID c_dir_id = env->GetFieldID(cls, "is_dir", "Z");
- if (c_dir_id == NULL) return false;
- jfieldID c_block_id = env->GetFieldID(cls, "block_size", "J");
- if (c_block_id == NULL) return false;
- jfieldID c_mod_id = env->GetFieldID(cls, "mod_time", "J");
- if (c_mod_id == NULL) return false;
- jfieldID c_access_id = env->GetFieldID(cls, "access_time", "J");
- if (c_access_id == NULL) return false;
- jfieldID c_mode_id = env->GetFieldID(cls, "mode", "I");
- if (c_mode_id == NULL) return false;
- //do actual lstat
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- int r = ceph_lstat(cmount, c_path, &st);
- env->ReleaseStringUTFChars(j_path, c_path);
-
- if (r < 0) return false; //fail out; file DNE or Ceph broke
-
- //put variables from struct stat into Java
- env->SetLongField(j_stat, c_size_id, st.st_size);
- env->SetBooleanField(j_stat, c_dir_id, (0 != S_ISDIR(st.st_mode)));
- env->SetLongField(j_stat, c_block_id, st.st_blksize);
-
- long long java_mtime(st.st_mtim.tv_sec);
- java_mtime *= 1000;
- java_mtime += st.st_mtim.tv_nsec / 1000;
- env->SetLongField(j_stat, c_mod_id, java_mtime);
-
- long long java_atime(st.st_atim.tv_sec);
- java_atime *= 1000;
- java_atime += st.st_atim.tv_nsec / 1000;
- env->SetLongField(j_stat, c_access_id, java_atime);
-
- env->SetIntField(j_stat, c_mode_id, (int)st.st_mode);
-
- //return happy
- return true;
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_statfs
- * Signature: (Ljava/lang/String;Lorg/apache/hadoop/fs/ceph/CephFileSystem/CephStat;)I
- * Statfs a filesystem in a custom format defined in CephTalker.
- * Inputs:
- * jstring j_path: A path on the filesystem that you wish to stat.
- * jobject j_ceph_stat: The CephStat object to fill.
- * Returns: true if successful and the CephStat is filled; false otherwise.
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1statfs
-(JNIEnv *env, jobject obj, jstring j_path, jobject j_cephstat)
-{
- //setup variables
- struct statvfs stbuf;
- const char *c_path = env->GetStringUTFChars(j_path, 0);
- if (c_path == NULL)
- return -ENOMEM;
- jclass cls = env->GetObjectClass(j_cephstat);
- if (cls == NULL)
- return 1; //JVM error of some kind
- jfieldID c_capacity_id = env->GetFieldID(cls, "capacity", "J");
- jfieldID c_used_id = env->GetFieldID(cls, "used", "J");
- jfieldID c_remaining_id = env->GetFieldID(cls, "remaining", "J");
-
- //do the statfs
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- int r = ceph_statfs(cmount, c_path, &stbuf);
- env->ReleaseStringUTFChars(j_path, c_path);
- if (r != 0)
- return r; //something broke
-
- //place info into Java; convert from bytes to kilobytes
- env->SetLongField(j_cephstat, c_capacity_id,
- (long)stbuf.f_blocks*stbuf.f_bsize/1024);
- env->SetLongField(j_cephstat, c_used_id,
- (long)(stbuf.f_blocks-stbuf.f_bavail)*stbuf.f_bsize/1024);
- env->SetLongField(j_cephstat, c_remaining_id,
- (long)stbuf.f_bavail*stbuf.f_bsize/1024);
- return r;
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_replication
- * Signature: (Ljava/lang/String;)I
- * Check how many times a path should be replicated (if it is
- * degraded it may not actually be replicated this often).
- * Inputs:
- * jstring j_path: The path to check.
- * Returns: an int containing the number of times replicated.
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1replication
-(JNIEnv *env, jobject obj, jstring j_path)
-{
- //get c-string of path, send off to libceph, release c-string, return
- const char *c_path = env->GetStringUTFChars(j_path, 0);
- if (c_path == NULL)
- return -ENOMEM;
- ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- int fh = 0;
- fh = ceph_open(cmount, c_path, O_RDONLY, 0);
- env->ReleaseStringUTFChars(j_path, c_path);
- if (fh < 0) {
- return fh;
- }
- int replication = ceph_get_file_replication(cmount, fh);
- ceph_close(cmount, fh);
- return replication;
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_hosts
- * Signature: (IJ)[Ljava/lang/String;
- * Find the IP:port addresses of the primary OSD for a given file and offset.
- * Inputs:
- * jint j_fh: The filehandle for the file.
- * jlong j_offset: The offset to get the location of.
- * Returns: a jstring of the location as IP, or NULL if there is an error.
- */
-JNIEXPORT jobjectArray JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1hosts
-(JNIEnv *env, jobject obj, jint j_fh, jlong j_offset)
-{
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- struct sockaddr_storage *ss;
- char address[30];
- jobjectArray addr_array;
- jclass string_cls;
- jstring j_addr;
- int r, n = 3; /* initial guess at # of replicas */
-
- for (;;) {
- ss = new struct sockaddr_storage[n];
- r = ceph_get_file_stripe_address(cmount, j_fh, j_offset, ss, n);
- if (r < 0) {
- if (r == -ERANGE) {
- delete [] ss;
- n *= 2;
- continue;
- }
- return NULL;
- }
- n = r;
- break;
- }
-
- /* TODO: cache this */
- string_cls = env->FindClass("java/lang/String");
- if (!string_cls)
- goto out;
-
- addr_array = env->NewObjectArray(n, string_cls, NULL);
- if (!addr_array)
- goto out;
-
- for (r = 0; r < n; r++) {
- /* Hadoop only deals with IPv4 */
- if (ss[r].ss_family != AF_INET)
- goto out;
-
- memset(address, 0, sizeof(address));
-
- inet_ntop(ss[r].ss_family, &((struct sockaddr_in *)&ss[r])->sin_addr,
- address, sizeof(address));
-
- j_addr = env->NewStringUTF(address);
-
- env->SetObjectArrayElement(addr_array, r, j_addr);
- if (env->ExceptionOccurred())
- goto out;
-
- env->DeleteLocalRef(j_addr);
- }
-
- delete [] ss;
- return addr_array;
-
-out:
- delete [] ss;
- return NULL;
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_setTimes
- * Signature: (Ljava/lang/String;JJ)I
- * Set the mtime and atime for a given path.
- * Inputs:
- * jstring j_path: The path to set the times for.
- * jlong mtime: The mtime to set, in millis since epoch (-1 to not set).
- * jlong atime: The atime to set, in millis since epoch (-1 to not set)
- * Returns: 0 if successful, an error code otherwise.
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1setTimes
-(JNIEnv *env, jobject obj, jstring j_path, jlong mtime, jlong atime)
-{
- const char *c_path = env->GetStringUTFChars(j_path, 0);
- if(c_path == NULL) return -ENOMEM;
-
- //build the mask for ceph_setattr
- int mask = 0;
- if (mtime!=-1) mask = CEPH_SETATTR_MTIME;
- if (atime!=-1) mask |= CEPH_SETATTR_ATIME;
- //build a struct stat and fill it in!
- //remember to convert from millis to seconds and microseconds
- struct stat attr;
- attr.st_mtim.tv_sec = mtime / 1000;
- attr.st_mtim.tv_nsec = (mtime % 1000) * 1000000;
- attr.st_atim.tv_sec = atime / 1000;
- attr.st_atim.tv_nsec = (atime % 1000) * 1000000;
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- return ceph_setattr(cmount, c_path, &attr, mask);
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_read
- * Signature: (JI[BII)I
- * Reads into the given byte array from the current position.
- * Inputs:
- * jint fh: the filehandle to read from
- * jbyteArray j_buffer: the byte array to read into
- * jint buffer_offset: where in the buffer to start writing
- * jint length: how much to read.
- * There'd better be enough space in the buffer to write all
- * the data from the given offset!
- * Returns: the number of bytes read on success (as jint),
- * or an error code otherwise.
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1read
- (JNIEnv *env, jobject obj, jint fh, jbyteArray j_buffer, jint buffer_offset, jint length)
-{
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- CephContext *cct = ceph_get_mount_context(cmount);
- ldout(cct, 10) << "In read" << dendl;
-
-
- // Make sure to convert the Hadoop read arguments into a
- // more ceph-friendly form
- jint result;
-
- // Step 1: get a pointer to the buffer.
- jbyte *j_buffer_ptr = env->GetByteArrayElements(j_buffer, NULL);
- if (j_buffer_ptr == NULL) return -ENOMEM;
- char *c_buffer = (char*) j_buffer_ptr;
-
- // Step 2: pointer arithmetic to start in the right buffer position
- c_buffer += (int)buffer_offset;
-
- // Step 3: do the read
- result = ceph_read(cmount, (int)fh, c_buffer, length, -1);
-
- // Step 4: release the pointer to the buffer
- env->ReleaseByteArrayElements(j_buffer, j_buffer_ptr, 0);
-
- return result;
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_seek_from_start
- * Signature: (JIJ)J
- * Seeks to the given position in the given file.
- * Inputs:
- * jint fh: The filehandle to seek in.
- * jlong pos: The position to seek to.
- * Returns: the new position (as a jlong) of the filehandle on success,
- * or a negative error code on failure.
- */
-JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1seek_1from_1start
- (JNIEnv *env, jobject obj, jint fh, jlong pos)
-{
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- CephContext *cct = ceph_get_mount_context(cmount);
- ldout(cct, 10) << "In CephTalker::seek_from_start" << dendl;
- return ceph_lseek(cmount, fh, pos, SEEK_SET);
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_getpos
- * Signature: (I)J
- *
- * Get the current position in a file (as a jlong) of a given filehandle.
- * Returns: jlong current file position on success, or a
- * negative error code on failure.
- */
-JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1getpos
- (JNIEnv *env, jobject obj, jint fh)
-{
- // seek a distance of 0 to get current offset
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- CephContext *cct = ceph_get_mount_context(cmount);
- ldout(cct, 10) << "In CephTalker::ceph_getpos" << dendl;
- return ceph_lseek(cmount, fh, 0, SEEK_CUR);
-}
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_write
- * Signature: (I[BII)I
- * Write the given buffer contents to the given filehandle.
- * Inputs:
- * jint fh: The filehandle to write to.
- * jbyteArray j_buffer: The buffer to write from
- * jint buffer_offset: The position in the buffer to write from
- * jint length: The number of (sequential) bytes to write.
- * Returns: jint, on success the number of bytes written, on failure
- * a negative error code.
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1write
- (JNIEnv *env, jobject obj, jint fh, jbyteArray j_buffer, jint buffer_offset, jint length)
-{
- struct ceph_mount_info *cmount = get_ceph_mount_t(env, obj);
- CephContext *cct = ceph_get_mount_context(cmount);
- ldout(cct, 10) << "In write" << dendl;
-
- // IMPORTANT NOTE: Hadoop write arguments are a bit different from POSIX so we
- // have to convert. The write is *always* from the current position in the file,
- // and buffer_offset is the location in the *buffer* where we start writing.
- jint result;
-
- // Step 1: get a pointer to the buffer.
- jbyte *j_buffer_ptr = env->GetByteArrayElements(j_buffer, NULL);
- if (j_buffer_ptr == NULL)
- return -ENOMEM;
- char *c_buffer = (char*) j_buffer_ptr;
-
- // Step 2: pointer arithmetic to start in the right buffer position
- c_buffer += (int)buffer_offset;
-
- // Step 3: do the write
- result = ceph_write(cmount, (int)fh, c_buffer, length, -1);
-
- // Step 4: release the pointer to the buffer
- env->ReleaseByteArrayElements(j_buffer, j_buffer_ptr, 0);
-
- return result;
-}
diff --git a/src/client/hadoop/CephFSInterface.h b/src/client/hadoop/CephFSInterface.h
deleted file mode 100644
index 6939b3a501d..00000000000
--- a/src/client/hadoop/CephFSInterface.h
+++ /dev/null
@@ -1,236 +0,0 @@
-// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-/* BE CAREFUL EDITING THIS FILE - it is a compilation of JNI
- machine-generated headers */
-
-#include <jni.h>
-
-#ifndef _Included_org_apache_hadoop_fs_ceph_CephTalker
-#define _Included_org_apache_hadoop_fs_ceph_CephTalker
-#ifdef __cplusplus
-extern "C" {
-#endif
- //these constants are machine-generated to match Java constants in the source
-#undef org_apache_hadoop_fs_ceph_CephFileSystem_DEFAULT_BLOCK_SIZE
-#define org_apache_hadoop_fs_ceph_CephFileSystem_DEFAULT_BLOCK_SIZE 8388608LL
-#undef org_apache_hadoop_fs_ceph_CephInputStream_SKIP_BUFFER_SIZE
-#define org_apache_hadoop_fs_ceph_CephInputStream_SKIP_BUFFER_SIZE 2048L
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_initializeClient
- * Signature: (Ljava/lang/String;I)Z
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1initializeClient
- (JNIEnv *, jobject, jstring, jint);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_getcwd
- * Signature: ()Ljava/lang/String;
- */
-JNIEXPORT jstring JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1getcwd
- (JNIEnv *, jobject);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_setcwd
- * Signature: (Ljava/lang/String;)Z
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1setcwd
- (JNIEnv *, jobject, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_rmdir
- * Signature: (Ljava/lang/String;)Z
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1rmdir
- (JNIEnv *, jobject, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_unlink
- * Signature: (Ljava/lang/String;)Z
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1unlink
- (JNIEnv *, jobject, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_rename
- * Signature: (Ljava/lang/String;Ljava/lang/String;)Z
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1rename
- (JNIEnv *, jobject, jstring, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_exists
- * Signature: (Ljava/lang/String;)Z
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1exists
- (JNIEnv *, jobject, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_getblocksize
- * Signature: (Ljava/lang/String;)J
- */
-JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1getblocksize
- (JNIEnv *, jobject, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_isdirectory
- * Signature: (Ljava/lang/String;)Z
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1isdirectory
- (JNIEnv *, jobject, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_isfile
- * Signature: (Ljava/lang/String;)Z
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1isfile
- (JNIEnv *, jobject, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_getdir
- * Signature: (Ljava/lang/String;)[Ljava/lang/String;
- */
-JNIEXPORT jobjectArray JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1getdir
- (JNIEnv *, jobject, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_mkdirs
- * Signature: (Ljava/lang/String;I)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1mkdirs
- (JNIEnv *, jobject, jstring, jint);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_open_for_append
- * Signature: (Ljava/lang/String;)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1open_1for_1append
- (JNIEnv *, jobject, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_open_for_read
- * Signature: (Ljava/lang/String;)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1open_1for_1read
- (JNIEnv *, jobject, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_open_for_overwrite
- * Signature: (Ljava/lang/String;I)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1open_1for_1overwrite
- (JNIEnv *, jobject, jstring, jint);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_close
- * Signature: (I)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1close
- (JNIEnv *, jobject, jint);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_setPermission
- * Signature: (Ljava/lang/String;I)Z
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1setPermission
- (JNIEnv *, jobject, jstring, jint);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_kill_client
- * Signature: ()Z
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1kill_1client
- (JNIEnv *, jobject);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_stat
- * Signature: (Ljava/lang/String;Lorg/apache/hadoop/fs/ceph/CephFileSystem/Stat;)Z
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1stat
- (JNIEnv *, jobject, jstring, jobject);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_statfs
- * Signature: (Ljava/lang/String;Lorg/apache/hadoop/fs/ceph/CephFileSystem/CephStat;)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1statfs
-(JNIEnv * env, jobject obj, jstring j_path, jobject j_cephstat);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_replication
- * Signature: (Ljava/lang/String;)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1replication
- (JNIEnv *, jobject, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_hosts
- * Signature: (IJ)[Ljava/lang/String;
- */
-JNIEXPORT jobjectArray JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1hosts
- (JNIEnv *, jobject, jint, jlong);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_setTimes
- * Signature: (Ljava/lang/String;JJ)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1setTimes
- (JNIEnv *, jobject, jstring, jlong, jlong);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_read
- * Signature: (I[BII)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1read
- (JNIEnv *, jobject, jint, jbyteArray, jint, jint);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_seek_from_start
- * Signature: (IJ)J
- */
-JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1seek_1from_1start
- (JNIEnv *, jobject, jint, jlong);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_getpos
- * Signature: (I)J
- */
-JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1getpos
- (JNIEnv *, jobject, jint);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_write
- * Signature: (I[BII)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1write
- (JNIEnv *, jobject, jint, jbyteArray, jint, jint);
-
-#ifdef __cplusplus
-}
-#endif
-#endif
diff --git a/src/client/hadoop/HADOOP-ceph.patch b/src/client/hadoop/HADOOP-ceph.patch
deleted file mode 100644
index 84cdb370f77..00000000000
--- a/src/client/hadoop/HADOOP-ceph.patch
+++ /dev/null
@@ -1,2234 +0,0 @@
-diff --git a/src/core/core-default.xml b/src/core/core-default.xml
-index 8bc3b99..26543bc 100644
---- a/src/core/core-default.xml
-+++ b/src/core/core-default.xml
-@@ -210,6 +210,12 @@
- </property>
-
- <property>
-+ <name>fs.ceph.impl</name>
-+ <value>org.apache.hadoop.fs.ceph.CephFileSystem</value>
-+ <description>The file system for ceph: uris.</description>
-+</property>
-+
-+<property>
- <name>fs.har.impl.disable.cache</name>
- <value>true</value>
- <description>Don't cache 'har' filesystem instances.</description>
-diff --git a/src/core/org/apache/hadoop/fs/ceph/CephFS.java b/src/core/org/apache/hadoop/fs/ceph/CephFS.java
-new file mode 100644
-index 0000000..5d51eb2
---- /dev/null
-+++ b/src/core/org/apache/hadoop/fs/ceph/CephFS.java
-@@ -0,0 +1,250 @@
-+// -*- mode:Java; tab-width:2; c-basic-offset:2; indent-tabs-mode:t -*-
-+
-+/**
-+ *
-+ * Licensed under the Apache License, Version 2.0
-+ * (the "License"); you may not use this file except in compliance with
-+ * the License. You may obtain a copy of the License at
-+ *
-+ * http://www.apache.org/licenses/LICENSE-2.0
-+ *
-+ * Unless required by applicable law or agreed to in writing, software
-+ * distributed under the License is distributed on an "AS IS" BASIS,
-+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-+ * implied. See the License for the specific language governing
-+ * permissions and limitations under the License.
-+ *
-+ *
-+ * Abstract base class for communicating with a Ceph filesystem and its
-+ * C++ codebase from Java, or pretending to do so (for unit testing purposes).
-+ * As only the Ceph package should be using this directly, all methods
-+ * are protected.
-+ */
-+package org.apache.hadoop.fs.ceph;
-+
-+import org.apache.hadoop.conf.Configuration;
-+
-+abstract class CephFS {
-+
-+ protected static final int ENOTDIR = 20;
-+ protected static final int EEXIST = 17;
-+ protected static final int ENOENT = 2;
-+
-+ /*
-+ * Performs any necessary setup to allow general use of the filesystem.
-+ * Inputs:
-+ * String argsuments -- a command-line style input of Ceph config params
-+ * int block_size -- the size in bytes to use for blocks
-+ * Returns: true on success, false otherwise
-+ */
-+ abstract protected boolean ceph_initializeClient(String arguments, int block_size);
-+
-+ /*
-+ * Returns the current working directory (absolute) as a String
-+ */
-+ abstract protected String ceph_getcwd();
-+
-+ /*
-+ * Changes the working directory.
-+ * Inputs:
-+ * String path: The path (relative or absolute) to switch to
-+ * Returns: true on success, false otherwise.
-+ */
-+ abstract protected boolean ceph_setcwd(String path);
-+
-+ /*
-+ * Given a path to a directory, removes the directory if empty.
-+ * Inputs:
-+ * jstring j_path: The path (relative or absolute) to the directory
-+ * Returns: true on successful delete; false otherwise
-+ */
-+ abstract protected boolean ceph_rmdir(String path);
-+
-+ /*
-+ * Given a path, unlinks it.
-+ * Inputs:
-+ * String path: The path (relative or absolute) to the file or empty dir
-+ * Returns: true if the unlink occurred, false otherwise.
-+ */
-+ abstract protected boolean ceph_unlink(String path);
-+
-+ /*
-+ * Changes a given path name to a new name, assuming new_path doesn't exist.
-+ * Inputs:
-+ * jstring j_from: The path whose name you want to change.
-+ * jstring j_to: The new name for the path.
-+ * Returns: true if the rename occurred, false otherwise
-+ */
-+ abstract protected boolean ceph_rename(String old_path, String new_path);
-+
-+ /*
-+ * Returns true if it the input path exists, false
-+ * if it does not or there is an unexpected failure.
-+ */
-+ abstract protected boolean ceph_exists(String path);
-+
-+ /*
-+ * Get the block size for a given path.
-+ * Input:
-+ * String path: The path (relative or absolute) you want
-+ * the block size for.
-+ * Returns: block size if the path exists, otherwise a negative number
-+ * corresponding to the standard C++ error codes (which are positive).
-+ */
-+ abstract protected long ceph_getblocksize(String path);
-+
-+ /*
-+ * Returns true if the given path is a directory, false otherwise.
-+ */
-+ abstract protected boolean ceph_isdirectory(String path);
-+
-+ /*
-+ * Returns true if the given path is a file; false otherwise.
-+ */
-+ abstract protected boolean ceph_isfile(String path);
-+
-+ /*
-+ * Get the contents of a given directory.
-+ * Inputs:
-+ * String path: The path (relative or absolute) to the directory.
-+ * Returns: A Java String[] of the contents of the directory, or
-+ * NULL if there is an error (ie, path is not a dir). This listing
-+ * will not contain . or .. entries.
-+ */
-+ abstract protected String[] ceph_getdir(String path);
-+
-+ /*
-+ * Create the specified directory and any required intermediate ones with the
-+ * given mode.
-+ */
-+ abstract protected int ceph_mkdirs(String path, int mode);
-+
-+ /*
-+ * Open a file to append. If the file does not exist, it will be created.
-+ * Opening a dir is possible but may have bad results.
-+ * Inputs:
-+ * String path: The path to open.
-+ * Returns: an int filehandle, or a number<0 if an error occurs.
-+ */
-+ abstract protected int ceph_open_for_append(String path);
-+
-+ /*
-+ * Open a file for reading.
-+ * Opening a dir is possible but may have bad results.
-+ * Inputs:
-+ * String path: The path to open.
-+ * Returns: an int filehandle, or a number<0 if an error occurs.
-+ */
-+ abstract protected int ceph_open_for_read(String path);
-+
-+ /*
-+ * Opens a file for overwriting; creates it if necessary.
-+ * Opening a dir is possible but may have bad results.
-+ * Inputs:
-+ * String path: The path to open.
-+ * int mode: The mode to open with.
-+ * Returns: an int filehandle, or a number<0 if an error occurs.
-+ */
-+ abstract protected int ceph_open_for_overwrite(String path, int mode);
-+
-+ /*
-+ * Closes the given file. Returns 0 on success, or a negative
-+ * error code otherwise.
-+ */
-+ abstract protected int ceph_close(int filehandle);
-+
-+ /*
-+ * Change the mode on a path.
-+ * Inputs:
-+ * String path: The path to change mode on.
-+ * int mode: The mode to apply.
-+ * Returns: true if the mode is properly applied, false if there
-+ * is any error.
-+ */
-+ abstract protected boolean ceph_setPermission(String path, int mode);
-+
-+ /*
-+ * Closes the Ceph client. This should be called before shutting down
-+ * (multiple times is okay but redundant).
-+ */
-+ abstract protected boolean ceph_kill_client();
-+
-+ /*
-+ * Get the statistics on a path returned in a custom format defined
-+ * in CephFileSystem.
-+ * Inputs:
-+ * String path: The path to stat.
-+ * Stat fill: The stat object to fill.
-+ * Returns: true if the stat is successful, false otherwise.
-+ */
-+ abstract protected boolean ceph_stat(String path, CephFileSystem.Stat fill);
-+
-+ /*
-+ * Check how many times a file should be replicated. If it is,
-+ * degraded it may not actually be replicated this often.
-+ * Inputs:
-+ * int fh: a file descriptor
-+ * Returns: an int containing the number of times replicated.
-+ */
-+ abstract protected int ceph_replication(String path);
-+
-+ /*
-+ * Find the IP address of the primary OSD for a given file and offset.
-+ * Inputs:
-+ * int fh: The filehandle for the file.
-+ * long offset: The offset to get the location of.
-+ * Returns: an array of String of the location as IP, or NULL if there is an error.
-+ */
-+ abstract protected String[] ceph_hosts(int fh, long offset);
-+
-+ /*
-+ * Set the mtime and atime for a given path.
-+ * Inputs:
-+ * String path: The path to set the times for.
-+ * long mtime: The mtime to set, in millis since epoch (-1 to not set).
-+ * long atime: The atime to set, in millis since epoch (-1 to not set)
-+ * Returns: 0 if successful, an error code otherwise.
-+ */
-+ abstract protected int ceph_setTimes(String path, long mtime, long atime);
-+
-+ /*
-+ * Get the current position in a file (as a long) of a given filehandle.
-+ * Returns: (long) current file position on success, or a
-+ * negative error code on failure.
-+ */
-+ abstract protected long ceph_getpos(int fh);
-+
-+ /*
-+ * Write the given buffer contents to the given filehandle.
-+ * Inputs:
-+ * int fh: The filehandle to write to.
-+ * byte[] buffer: The buffer to write from
-+ * int buffer_offset: The position in the buffer to write from
-+ * int length: The number of (sequential) bytes to write.
-+ * Returns: int, on success the number of bytes written, on failure
-+ * a negative error code.
-+ */
-+ abstract protected int ceph_write(int fh, byte[] buffer, int buffer_offset, int length);
-+
-+ /*
-+ * Reads into the given byte array from the current position.
-+ * Inputs:
-+ * int fh: the filehandle to read from
-+ * byte[] buffer: the byte array to read into
-+ * int buffer_offset: where in the buffer to start writing
-+ * int length: how much to read.
-+ * There'd better be enough space in the buffer to write all
-+ * the data from the given offset!
-+ * Returns: the number of bytes read on success (as an int),
-+ * or an error code otherwise. */
-+ abstract protected int ceph_read(int fh, byte[] buffer, int buffer_offset, int length);
-+
-+ /*
-+ * Seeks to the given position in the given file.
-+ * Inputs:
-+ * int fh: The filehandle to seek in.
-+ * long pos: The position to seek to.
-+ * Returns: the new position (as a long) of the filehandle on success,
-+ * or a negative error code on failure. */
-+ abstract protected long ceph_seek_from_start(int fh, long pos);
-+}
-diff --git a/src/core/org/apache/hadoop/fs/ceph/CephFaker.java b/src/core/org/apache/hadoop/fs/ceph/CephFaker.java
-new file mode 100644
-index 0000000..c598f53
---- /dev/null
-+++ b/src/core/org/apache/hadoop/fs/ceph/CephFaker.java
-@@ -0,0 +1,483 @@
-+// -*- mode:Java; tab-width:2; c-basic-offset:2; indent-tabs-mode:t -*-
-+
-+/**
-+ *
-+ * Licensed under the Apache License, Version 2.0
-+ * (the "License"); you may not use this file except in compliance with
-+ * the License. You may obtain a copy of the License at
-+ *
-+ * http://www.apache.org/licenses/LICENSE-2.0
-+ *
-+ * Unless required by applicable law or agreed to in writing, software
-+ * distributed under the License is distributed on an "AS IS" BASIS,
-+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-+ * implied. See the License for the specific language governing
-+ * permissions and limitations under the License.
-+ *
-+ *
-+ * This uses the local Filesystem but pretends to be communicating
-+ * with a Ceph deployment, for unit testing the CephFileSystem.
-+ */
-+
-+package org.apache.hadoop.fs.ceph;
-+
-+
-+import java.net.URI;
-+import java.util.Hashtable;
-+import java.io.Closeable;
-+import java.io.FileNotFoundException;
-+import java.io.IOException;
-+
-+import org.apache.commons.logging.Log;
-+import org.apache.commons.logging.LogFactory;
-+import org.apache.hadoop.conf.Configuration;
-+import org.apache.hadoop.fs.BlockLocation;
-+import org.apache.hadoop.fs.FileStatus;
-+import org.apache.hadoop.fs.FileSystem;
-+import org.apache.hadoop.fs.FSDataInputStream;
-+import org.apache.hadoop.fs.FSDataOutputStream;
-+import org.apache.hadoop.fs.Path;
-+import org.apache.hadoop.fs.permission.FsPermission;
-+
-+
-+class CephFaker extends CephFS {
-+ private static final Log LOG = LogFactory.getLog(CephFaker.class);
-+ FileSystem localFS;
-+ String localPrefix;
-+ int blockSize;
-+ Configuration conf;
-+ Hashtable<Integer, Object> files;
-+ Hashtable<Integer, String> filenames;
-+ int fileCount = 0;
-+ boolean initialized = false;
-+
-+ public CephFaker(Configuration con, Log log) {
-+ conf = con;
-+ files = new Hashtable<Integer, Object>();
-+ filenames = new Hashtable<Integer, String>();
-+ }
-+
-+ protected boolean ceph_initializeClient(String args, int block_size) {
-+ if (!initialized) {
-+ // let's remember the default block_size
-+ blockSize = block_size;
-+
-+ /* for a real Ceph deployment, this starts up the client,
-+ * sets debugging levels, etc. We just need to get the
-+ * local FileSystem to use, and we'll ignore any
-+ * command-line arguments. */
-+ try {
-+ localFS = FileSystem.getLocal(conf);
-+ localFS.initialize(URI.create("file://localhost"), conf);
-+ localFS.setVerifyChecksum(false);
-+ String testDir = conf.get("hadoop.tmp.dir");
-+
-+ localPrefix = localFS.getWorkingDirectory().toString();
-+ int testDirLoc = localPrefix.indexOf(testDir) - 1;
-+
-+ if (-2 == testDirLoc) {
-+ testDirLoc = localPrefix.length();
-+ }
-+ localPrefix = localPrefix.substring(0, testDirLoc) + "/"
-+ + conf.get("hadoop.tmp.dir");
-+
-+ localFS.setWorkingDirectory(
-+ new Path(localPrefix + "/user/" + System.getProperty("user.name")));
-+ // I don't know why, but the unit tests expect the default
-+ // working dir to be /user/username, so satisfy them!
-+ // debug("localPrefix is " + localPrefix, INFO);
-+ } catch (IOException e) {
-+ return false;
-+ }
-+ initialized = true;
-+ }
-+ return true;
-+ }
-+
-+ protected String ceph_getcwd() {
-+ return sanitize_path(localFS.getWorkingDirectory().toString());
-+ }
-+
-+ protected boolean ceph_setcwd(String path) {
-+ localFS.setWorkingDirectory(new Path(prepare_path(path)));
-+ return true;
-+ }
-+
-+ // the caller is responsible for ensuring empty dirs
-+ protected boolean ceph_rmdir(String pth) {
-+ Path path = new Path(prepare_path(pth));
-+ boolean ret = false;
-+
-+ try {
-+ if (localFS.listStatus(path).length <= 1) {
-+ ret = localFS.delete(path, true);
-+ }
-+ } catch (IOException e) {}
-+ return ret;
-+ }
-+
-+ // this needs to work on (empty) directories too
-+ protected boolean ceph_unlink(String path) {
-+ path = prepare_path(path);
-+ boolean ret = false;
-+
-+ if (ceph_isdirectory(path)) {
-+ ret = ceph_rmdir(path);
-+ } else {
-+ try {
-+ ret = localFS.delete(new Path(path), false);
-+ } catch (IOException e) {}
-+ }
-+ return ret;
-+ }
-+
-+ protected boolean ceph_rename(String oldName, String newName) {
-+ oldName = prepare_path(oldName);
-+ newName = prepare_path(newName);
-+ try {
-+ Path parent = new Path(newName).getParent();
-+ Path newPath = new Path(newName);
-+
-+ if (localFS.exists(parent) && !localFS.exists(newPath)) {
-+ return localFS.rename(new Path(oldName), newPath);
-+ }
-+ return false;
-+ } catch (IOException e) {
-+ return false;
-+ }
-+ }
-+
-+ protected boolean ceph_exists(String path) {
-+ path = prepare_path(path);
-+ boolean ret = false;
-+
-+ try {
-+ ret = localFS.exists(new Path(path));
-+ } catch (IOException e) {}
-+ return ret;
-+ }
-+
-+ protected long ceph_getblocksize(String path) {
-+ path = prepare_path(path);
-+ try {
-+ FileStatus status = localFS.getFileStatus(new Path(path));
-+
-+ return status.getBlockSize();
-+ } catch (FileNotFoundException e) {
-+ return -CephFS.ENOENT;
-+ } catch (IOException e) {
-+ return -1; // just fail generically
-+ }
-+ }
-+
-+ protected boolean ceph_isdirectory(String path) {
-+ path = prepare_path(path);
-+ try {
-+ FileStatus status = localFS.getFileStatus(new Path(path));
-+
-+ return status.isDir();
-+ } catch (IOException e) {
-+ return false;
-+ }
-+ }
-+
-+ protected boolean ceph_isfile(String path) {
-+ path = prepare_path(path);
-+ boolean ret = false;
-+
-+ try {
-+ FileStatus status = localFS.getFileStatus(new Path(path));
-+
-+ ret = !status.isDir();
-+ } catch (Exception e) {}
-+ return ret;
-+ }
-+
-+ protected String[] ceph_getdir(String path) {
-+ path = prepare_path(path);
-+ if (!ceph_isdirectory(path)) {
-+ return null;
-+ }
-+ try {
-+ FileStatus[] stats = localFS.listStatus(new Path(path));
-+ String[] names = new String[stats.length];
-+ String name;
-+
-+ for (int i = 0; i < stats.length; ++i) {
-+ name = stats[i].getPath().toString();
-+ names[i] = name.substring(name.lastIndexOf(Path.SEPARATOR) + 1);
-+ }
-+ return names;
-+ } catch (IOException e) {}
-+ return null;
-+ }
-+
-+ protected int ceph_mkdirs(String path, int mode) {
-+ path = prepare_path(path);
-+ // debug("ceph_mkdirs on " + path, INFO);
-+ try {
-+ if (localFS.mkdirs(new Path(path), new FsPermission((short) mode))) {
-+ return 0;
-+ }
-+ } catch (IOException e) {}
-+ if (ceph_isdirectory(path)) { // apparently it already existed
-+ return -EEXIST;
-+ } else if (ceph_isfile(path)) {
-+ return -ENOTDIR;
-+ }
-+ return -1;
-+ }
-+
-+ /*
-+ * Unlike a real Ceph deployment, you can't do opens on a directory.
-+ * Since that has unpredictable behavior and you shouldn't do it anyway,
-+ * it's okay.
-+ */
-+ protected int ceph_open_for_append(String path) {
-+ path = prepare_path(path);
-+ FSDataOutputStream stream;
-+
-+ try {
-+ stream = localFS.append(new Path(path));
-+ files.put(new Integer(fileCount), stream);
-+ filenames.put(new Integer(fileCount), path);
-+ return fileCount++;
-+ } catch (IOException e) {}
-+ return -1; // failure
-+ }
-+
-+ protected int ceph_open_for_read(String path) {
-+ path = prepare_path(path);
-+ FSDataInputStream stream;
-+
-+ try {
-+ stream = localFS.open(new Path(path));
-+ files.put(new Integer(fileCount), stream);
-+ filenames.put(new Integer(fileCount), path);
-+ LOG.info("ceph_open_for_read fh:" + fileCount + ", pathname:" + path);
-+ return fileCount++;
-+ } catch (IOException e) {}
-+ return -1; // failure
-+ }
-+
-+ protected int ceph_open_for_overwrite(String path, int mode) {
-+ path = prepare_path(path);
-+ FSDataOutputStream stream;
-+
-+ try {
-+ stream = localFS.create(new Path(path));
-+ files.put(new Integer(fileCount), stream);
-+ filenames.put(new Integer(fileCount), path);
-+ LOG.info("ceph_open_for_overwrite fh:" + fileCount + ", pathname:" + path);
-+ return fileCount++;
-+ } catch (IOException e) {}
-+ return -1; // failure
-+ }
-+
-+ protected int ceph_close(int filehandle) {
-+ LOG.info("ceph_close(filehandle " + filehandle + ")");
-+ try {
-+ ((Closeable) files.get(new Integer(filehandle))).close();
-+ if (null == files.get(new Integer(filehandle))) {
-+ return -ENOENT; // this isn't quite the right error code,
-+ // but the important part is it's negative
-+ }
-+ return 0; // hurray, success
-+ } catch (NullPointerException ne) {
-+ LOG.warn("ceph_close caught NullPointerException!" + ne);
-+ } // err, how?
-+ catch (IOException ie) {
-+ LOG.warn("ceph_close caught IOException!" + ie);
-+ }
-+ return -1; // failure
-+ }
-+
-+ protected boolean ceph_setPermission(String pth, int mode) {
-+ pth = prepare_path(pth);
-+ Path path = new Path(pth);
-+ boolean ret = false;
-+
-+ try {
-+ localFS.setPermission(path, new FsPermission((short) mode));
-+ ret = true;
-+ } catch (IOException e) {}
-+ return ret;
-+ }
-+
-+ // rather than try and match a Ceph deployment's behavior exactly,
-+ // just make bad things happen if they try and call methods after this
-+ protected boolean ceph_kill_client() {
-+ // debug("ceph_kill_client", INFO);
-+ localFS.setWorkingDirectory(new Path(localPrefix));
-+ // debug("working dir is now " + localFS.getWorkingDirectory(), INFO);
-+ try {
-+ localFS.close();
-+ } catch (Exception e) {}
-+ localFS = null;
-+ files = null;
-+ filenames = null;
-+ return true;
-+ }
-+
-+ protected boolean ceph_stat(String pth, CephFileSystem.Stat fill) {
-+ pth = prepare_path(pth);
-+ Path path = new Path(pth);
-+ boolean ret = false;
-+
-+ try {
-+ FileStatus status = localFS.getFileStatus(path);
-+
-+ fill.size = status.getLen();
-+ fill.is_dir = status.isDir();
-+ fill.block_size = status.getBlockSize();
-+ fill.mod_time = status.getModificationTime();
-+ fill.access_time = status.getAccessTime();
-+ fill.mode = status.getPermission().toShort();
-+ ret = true;
-+ } catch (IOException e) {}
-+ return ret;
-+ }
-+
-+ protected int ceph_replication(String path) {
-+ path = prepare_path(path);
-+ int ret = -1; // -1 for failure
-+
-+ try {
-+ ret = localFS.getFileStatus(new Path(path)).getReplication();
-+ } catch (IOException e) {}
-+ return ret;
-+ }
-+
-+ protected String[] ceph_hosts(int fh, long offset) {
-+ String[] ret = null;
-+
-+ try {
-+ BlockLocation[] locs = localFS.getFileBlockLocations(
-+ localFS.getFileStatus(new Path(filenames.get(new Integer(fh)))),
-+ offset, 1);
-+
-+ ret = locs[0].getNames();
-+ } catch (IOException e) {} catch (NullPointerException f) {}
-+ return ret;
-+ }
-+
-+ protected int ceph_setTimes(String pth, long mtime, long atime) {
-+ pth = prepare_path(pth);
-+ Path path = new Path(pth);
-+ int ret = -1; // generic fail
-+
-+ try {
-+ localFS.setTimes(path, mtime, atime);
-+ ret = 0;
-+ } catch (IOException e) {}
-+ return ret;
-+ }
-+
-+ protected long ceph_getpos(int fh) {
-+ long ret = -1; // generic fail
-+
-+ try {
-+ Object stream = files.get(new Integer(fh));
-+
-+ if (stream instanceof FSDataInputStream) {
-+ ret = ((FSDataInputStream) stream).getPos();
-+ } else if (stream instanceof FSDataOutputStream) {
-+ ret = ((FSDataOutputStream) stream).getPos();
-+ }
-+ } catch (IOException e) {} catch (NullPointerException f) {}
-+ return ret;
-+ }
-+
-+ protected int ceph_write(int fh, byte[] buffer,
-+ int buffer_offset, int length) {
-+ LOG.info(
-+ "ceph_write fh:" + fh + ", buffer_offset:" + buffer_offset + ", length:"
-+ + length);
-+ long ret = -1; // generic fail
-+
-+ try {
-+ FSDataOutputStream os = (FSDataOutputStream) files.get(new Integer(fh));
-+
-+ LOG.info("ceph_write got outputstream");
-+ long startPos = os.getPos();
-+
-+ os.write(buffer, buffer_offset, length);
-+ ret = os.getPos() - startPos;
-+ } catch (IOException e) {
-+ LOG.warn("ceph_write caught IOException!");
-+ } catch (NullPointerException f) {
-+ LOG.warn("ceph_write caught NullPointerException!");
-+ }
-+ return (int) ret;
-+ }
-+
-+ protected int ceph_read(int fh, byte[] buffer,
-+ int buffer_offset, int length) {
-+ long ret = -1; // generic fail
-+
-+ try {
-+ FSDataInputStream is = (FSDataInputStream) files.get(new Integer(fh));
-+ long startPos = is.getPos();
-+
-+ is.read(buffer, buffer_offset, length);
-+ ret = is.getPos() - startPos;
-+ } catch (IOException e) {} catch (NullPointerException f) {}
-+ return (int) ret;
-+ }
-+
-+ protected long ceph_seek_from_start(int fh, long pos) {
-+ LOG.info("ceph_seek_from_start(fh " + fh + ", pos " + pos + ")");
-+ long ret = -1; // generic fail
-+
-+ try {
-+ LOG.info("ceph_seek_from_start filename is " + filenames.get(new Integer(fh)));
-+ if (null == files.get(new Integer(fh))) {
-+ LOG.warn("ceph_seek_from_start: is is null!");
-+ }
-+ FSDataInputStream is = (FSDataInputStream) files.get(new Integer(fh));
-+
-+ LOG.info("ceph_seek_from_start retrieved is!");
-+ is.seek(pos);
-+ ret = is.getPos();
-+ } catch (IOException e) {
-+ LOG.warn("ceph_seek_from_start caught IOException!");
-+ } catch (NullPointerException f) {
-+ LOG.warn("ceph_seek_from_start caught NullPointerException!");
-+ }
-+ return (int) ret;
-+ }
-+
-+ /*
-+ * We need to remove the localFS file prefix before returning to Ceph
-+ */
-+ private String sanitize_path(String path) {
-+ // debug("sanitize_path(" + path + ")", INFO);
-+ /* if (path.startsWith("file:"))
-+ path = path.substring("file:".length()); */
-+ if (path.startsWith(localPrefix)) {
-+ path = path.substring(localPrefix.length());
-+ if (path.length() == 0) { // it was a root path
-+ path = "/";
-+ }
-+ }
-+ // debug("sanitize_path returning " + path, INFO);
-+ return path;
-+ }
-+
-+ /*
-+ * If it's an absolute path we need to shove the
-+ * test dir onto the front as a prefix.
-+ */
-+ private String prepare_path(String path) {
-+ // debug("prepare_path(" + path + ")", INFO);
-+ if (path.startsWith("/")) {
-+ path = localPrefix + path;
-+ } else if (path.equals("..")) {
-+ if (ceph_getcwd().equals("/")) {
-+ path = ".";
-+ } // you can't go up past root!
-+ }
-+ // debug("prepare_path returning" + path, INFO);
-+ return path;
-+ }
-+}
-diff --git a/src/core/org/apache/hadoop/fs/ceph/CephFileSystem.java b/src/core/org/apache/hadoop/fs/ceph/CephFileSystem.java
-new file mode 100644
-index 0000000..95f2223
---- /dev/null
-+++ b/src/core/org/apache/hadoop/fs/ceph/CephFileSystem.java
-@@ -0,0 +1,804 @@
-+// -*- mode:Java; tab-width:2; c-basic-offset:2; indent-tabs-mode:t -*-
-+
-+/**
-+ *
-+ * Licensed under the Apache License, Version 2.0
-+ * (the "License"); you may not use this file except in compliance with
-+ * the License. You may obtain a copy of the License at
-+ *
-+ * http://www.apache.org/licenses/LICENSE-2.0
-+ *
-+ * Unless required by applicable law or agreed to in writing, software
-+ * distributed under the License is distributed on an "AS IS" BASIS,
-+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-+ * implied. See the License for the specific language governing
-+ * permissions and limitations under the License.
-+ *
-+ *
-+ * Implements the Hadoop FS interfaces to allow applications to store
-+ * files in Ceph.
-+ */
-+package org.apache.hadoop.fs.ceph;
-+
-+
-+import java.io.IOException;
-+import java.io.FileNotFoundException;
-+import java.io.OutputStream;
-+import java.net.URI;
-+import java.net.InetAddress;
-+import java.util.EnumSet;
-+import java.lang.Math;
-+import java.util.ArrayList;
-+
-+import org.apache.commons.logging.Log;
-+import org.apache.commons.logging.LogFactory;
-+import org.apache.hadoop.conf.Configuration;
-+import org.apache.hadoop.fs.BlockLocation;
-+import org.apache.hadoop.fs.FSDataInputStream;
-+import org.apache.hadoop.fs.FSInputStream;
-+import org.apache.hadoop.fs.FSDataOutputStream;
-+import org.apache.hadoop.fs.FileSystem;
-+import org.apache.hadoop.fs.FileUtil;
-+import org.apache.hadoop.fs.Path;
-+import org.apache.hadoop.fs.permission.FsPermission;
-+import org.apache.hadoop.util.Progressable;
-+import org.apache.hadoop.fs.FileStatus;
-+import org.apache.hadoop.net.DNS;
-+
-+
-+/**
-+ * <p>
-+ * A {@link FileSystem} backed by <a href="http://ceph.newdream.net">Ceph</a>.
-+ * This will not start a Ceph instance; one must already be running.
-+ * </p>
-+ * Configuration of the CephFileSystem is handled via a few Hadoop
-+ * Configuration properties: <br>
-+ * fs.ceph.monAddr -- the ip address/port of the monitor to connect to. <br>
-+ * fs.ceph.libDir -- the directory that libcephfs and libhadoopceph are
-+ * located in. This assumes Hadoop is being run on a linux-style machine
-+ * with names like libcephfs.so.
-+ * fs.ceph.commandLine -- if you prefer you can fill in this property
-+ * just as you would when starting Ceph up from the command line. Specific
-+ * properties override any configuration specified here.
-+ * <p>
-+ * You can also enable debugging of the CephFileSystem and Ceph itself: <br>
-+ * fs.ceph.debug -- if 'true' will print out method enter/exit messages,
-+ * plus a little more.
-+ * fs.ceph.clientDebug/fs.ceph.messengerDebug -- will print out debugging
-+ * from the respective Ceph system of at least that importance.
-+ */
-+public class CephFileSystem extends FileSystem {
-+ private static final Log LOG = LogFactory.getLog(CephFileSystem.class);
-+ private URI uri;
-+
-+ private Path workingDir;
-+ private final Path root;
-+ private CephFS ceph = null;
-+
-+ private static String CEPH_NAMESERVER;
-+ private static final String CEPH_NAMESERVER_KEY = "fs.ceph.nameserver";
-+ private static final String CEPH_NAMESERVER_DEFAULT = "localhost";
-+
-+ /**
-+ * Create a new CephFileSystem.
-+ * Only the root path is set up here; no Ceph client is attached, so
-+ * {@code initialize} will create a CephTalker when it finds none.
-+ */
-+ public CephFileSystem() {
-+ root = new Path("/");
-+ }
-+
-+ /**
-+  * Testing constructor: uses the supplied CephFS implementation instead
-+  * of the CephTalker that {@code initialize} would otherwise create
-+  * (and which expects a real Ceph instance to talk to).
-+  * @param ceph_fs the CephFS implementation to route all calls through.
-+  */
-+ public CephFileSystem(CephFS ceph_fs) {
-+   ceph = ceph_fs;
-+   root = new Path("/");
-+ }
-+
-+ /**
-+  * Return the URI this CephFileSystem was initialized with
-+  * (scheme and authority only).
-+  * @return the URI; null until {@code initialize} has run.
-+  */
-+ public URI getUri() {
-+   final URI current = uri;
-+   LOG.debug("getUri:exit with return " + current);
-+   return current;
-+ }
-+
-+ /**
-+ * Should be called after constructing a CephFileSystem but before calling
-+ * any other methods.
-+ * Starts up the connection to Ceph, reads in configuraton options, etc.
-+ * @param uri The URI for this filesystem.
-+ * @param conf The Hadoop Configuration to retrieve properties from.
-+ * @throws IOException if necessary properties are unset.
-+ */
-+ @Override
-+ public void initialize(URI uri, Configuration conf) throws IOException {
-+ super.initialize(uri, conf);
-+ setConf(conf);
-+ this.uri = URI.create(uri.getScheme() + "://" + uri.getAuthority());
-+ if (ceph == null) {
-+ ceph = new CephTalker(conf, LOG);
-+ }
-+
-+ CEPH_NAMESERVER = conf.get(CEPH_NAMESERVER_KEY, CEPH_NAMESERVER_DEFAULT);
-+
-+ // build up the arguments for Ceph
-+ String arguments = "CephFSInterface";
-+
-+ arguments += conf.get("fs.ceph.commandLine", "");
-+ if (conf.get("fs.ceph.clientDebug") != null) {
-+ arguments += " --debug_client ";
-+ arguments += conf.get("fs.ceph.clientDebug");
-+ }
-+ if (conf.get("fs.ceph.messengerDebug") != null) {
-+ arguments += " --debug_ms ";
-+ arguments += conf.get("fs.ceph.messengerDebug");
-+ }
-+ if (conf.get("fs.ceph.monAddr") != null) {
-+ arguments += " -m ";
-+ arguments += conf.get("fs.ceph.monAddr");
-+ }
-+ arguments += " --client-readahead-max-periods="
-+ + conf.get("fs.ceph.readahead", "1");
-+ // make sure they gave us a ceph monitor address or conf file
-+ LOG.info("initialize:Ceph initialization arguments: " + arguments);
-+ if ((conf.get("fs.ceph.monAddr") == null) && (arguments.indexOf("-m") == -1)
-+ && (arguments.indexOf("-c") == -1)) {
-+ LOG.fatal("initialize:You need to specify a Ceph monitor address.");
-+ throw new IOException(
-+ "You must specify a Ceph monitor address or config file!");
-+ }
-+ // Initialize the client
-+ if (!ceph.ceph_initializeClient(arguments,
-+ conf.getInt("fs.ceph.blockSize", 1 << 26))) {
-+ LOG.fatal("initialize:Ceph initialization failed!");
-+ throw new IOException("Ceph initialization failed!");
-+ }
-+ LOG.info("initialize:Ceph initialized client. Setting cwd to /");
-+ ceph.ceph_setcwd("/");
-+ LOG.debug("initialize:exit");
-+
-+ this.workingDir = getHomeDirectory();
-+ }
-+
-+ /**
-+ * Close down the CephFileSystem. Runs the base-class close method
-+ * first and then kills the Ceph client itself.
-+ * @throws IOException if the base-class close fails; in that case the
-+ * Ceph client is not killed, because the call below is never reached.
-+ */
-+ @Override
-+ public void close() throws IOException {
-+ LOG.debug("close:enter");
-+ super.close(); // this method does stuff, make sure it's run!
-+ LOG.trace("close: Calling ceph_kill_client from Java");
-+ ceph.ceph_kill_client();
-+ LOG.debug("close:exit");
-+ }
-+
-+ /**
-+  * Open an existing file for appending.
-+  * @param file The file to append onto.
-+  * @param bufferSize Size of the Java-side buffer handed to the
-+  * CephOutputStream (Ceph does its own buffering as well).
-+  * @param progress Optional Progressable; it is poked before and after the
-+  * open call. May be null.
-+  * @return An FSDataOutputStream that connects to the file on Ceph.
-+  * @throws IOException If Ceph reports an error opening the file.
-+  */
-+ public FSDataOutputStream append(Path file, int bufferSize,
-+     Progressable progress) throws IOException {
-+   LOG.debug("append:enter with path " + file + " bufferSize " + bufferSize);
-+   final Path target = makeAbsolute(file);
-+   final boolean reporting = (progress != null);
-+
-+   if (reporting) {
-+     progress.progress();
-+   }
-+   LOG.trace("append: Entering ceph_open_for_append from Java");
-+   final int handle = ceph.ceph_open_for_append(getCephPath(target));
-+   LOG.trace("append: Returned to Java");
-+   if (reporting) {
-+     progress.progress();
-+   }
-+
-+   // a negative handle is Ceph's way of reporting an error in open
-+   if (handle < 0) {
-+     throw new IOException(
-+         "append: Open for append failed on path \"" + target + "\"");
-+   }
-+
-+   final CephOutputStream sink = new CephOutputStream(getConf(), ceph, handle,
-+       bufferSize);
-+   LOG.debug("append:exit");
-+   return new FSDataOutputStream(sink, statistics);
-+ }
-+
-+ /**
-+ * Get the current working directory for this file system, i.e. the
-+ * directory that relative paths are resolved against by makeAbsolute.
-+ * @return the directory Path (set to the home directory by initialize).
-+ */
-+ public Path getWorkingDirectory() {
-+ return workingDir;
-+ }
-+
-+ /**
-+ * Set the current working directory for the given file system. All relative
-+ * paths will be resolved relative to it.
-+ * A relative {@code dir} is itself resolved against the previous working
-+ * directory. No check is made that the directory exists.
-+ *
-+ * @param dir The directory to change to.
-+ */
-+ @Override
-+ public void setWorkingDirectory(Path dir) {
-+ workingDir = makeAbsolute(dir);
-+ }
-+
-+ /**
-+  * Reduce a (possibly fully qualified) absolute path to its bare path
-+  * component, which is the form the Ceph native calls are given.
-+  * @param path an absolute Path.
-+  * @return the path portion of the Path's URI.
-+  * @throws IllegalArgumentException if the path is not absolute.
-+  */
-+ private String getCephPath(Path path) {
-+   if (path.isAbsolute()) {
-+     return path.toUri().getPath();
-+   }
-+   throw new IllegalArgumentException("Path must be absolute: " + path);
-+ }
-+
-+ /**
-+ * Check if a path exists.
-+ * Overriden because it's moderately faster than the generic implementation.
-+ * @param path The file to check existence on.
-+ * @return true if the file exists, false otherwise.
-+ */
-+ @Override
-+ public boolean exists(Path path) throws IOException {
-+ LOG.debug("exists:enter with path " + path);
-+ boolean result;
-+ Path abs_path = makeAbsolute(path);
-+
-+ if (abs_path.equals(root)) {
-+ result = true;
-+ } else {
-+ LOG.trace(
-+ "exists:Calling ceph_exists from Java on path " + abs_path.toString());
-+ result = ceph.ceph_exists(getCephPath(abs_path));
-+ LOG.trace("exists:Returned from ceph_exists to Java");
-+ }
-+ LOG.debug("exists:exit with value " + result);
-+ return result;
-+ }
-+
-+ /**
-+  * Create a directory and any nonexistent parents by delegating to
-+  * ceph_mkdirs.
-+  * @param path The directory path to create
-+  * @param perms The permissions to apply to the created directories.
-+  * @return true if ceph_mkdirs reported success (0), false for any other
-+  * result except ENOTDIR.
-+  * @throws IOException if the path is a child of a file (ENOTDIR).
-+  */
-+ @Override
-+ public boolean mkdirs(Path path, FsPermission perms) throws IOException {
-+   LOG.debug("mkdirs:enter with path " + path);
-+   final Path target = makeAbsolute(path);
-+
-+   LOG.trace("mkdirs:calling ceph_mkdirs from Java");
-+   final int rc = ceph.ceph_mkdirs(getCephPath(target), (int) perms.toShort());
-+
-+   if (rc == 0) {
-+     LOG.debug("mkdirs:exiting succesfully");
-+     return true;
-+   }
-+
-+   LOG.warn(
-+       "mkdirs: make directory " + target + "Failing with result " + rc);
-+   if (rc == -ceph.ENOTDIR) {
-+     throw new IOException("Parent path is not a directory");
-+   }
-+   return false;
-+ }
-+
-+ /**
-+ * Check if a path is a file. This is moderately faster than the
-+ * generic implementation.
-+ * @param path The path to check.
-+ * @return true if the path is definitely a file, false otherwise.
-+ */
-+ @Override
-+ public boolean isFile(Path path) throws IOException {
-+ LOG.debug("isFile:enter with path " + path);
-+ Path abs_path = makeAbsolute(path);
-+ boolean result;
-+
-+ if (abs_path.equals(root)) {
-+ result = false;
-+ } else {
-+ LOG.trace("isFile:entering ceph_isfile from Java");
-+ result = ceph.ceph_isfile(getCephPath(abs_path));
-+ }
-+ LOG.debug("isFile:exit with result " + result);
-+ return result;
-+ }
-+
-+ /**
-+  * Get stat information on a file. The owner is reported as the current
-+  * "user.name" system property and the group is left null, as
-+  * Ceph's support for these is a bit different than HDFS'.
-+  * @param path The path to stat.
-+  * @return FileStatus object containing the stat information.
-+  * @throws FileNotFoundException if ceph_stat fails for the path.
-+  */
-+ public FileStatus getFileStatus(Path path) throws IOException {
-+   LOG.debug("getFileStatus:enter with path " + path);
-+   final Path target = makeAbsolute(path);
-+   final Stat info = new Stat();
-+
-+   LOG.trace("getFileStatus: calling ceph_stat from Java");
-+   final boolean statted = ceph.ceph_stat(getCephPath(target), info);
-+
-+   if (!statted) { // fail out
-+     throw new FileNotFoundException(
-+         "org.apache.hadoop.fs.ceph.CephFileSystem: File " + path
-+         + " does not exist or could not be accessed");
-+   }
-+
-+   // sadly, Ceph doesn't really do uids/gids just yet, but
-+   // everything else is filled
-+   final FileStatus status = new FileStatus(info.size, info.is_dir,
-+       ceph.ceph_replication(getCephPath(target)), info.block_size,
-+       info.mod_time, info.access_time,
-+       new FsPermission((short) info.mode), System.getProperty("user.name"), null,
-+       path.makeQualified(this));
-+
-+   LOG.debug("getFileStatus:exit");
-+   return status;
-+ }
-+
-+ /**
-+  * Get the FileStatus for each listing in a directory.
-+  * @param path The directory to get listings from.
-+  * @return FileStatus[] containing one FileStatus for each directory listing;
-+  * a one-element array if path is a file; null if no listing could be
-+  * obtained and path is not a file.
-+  */
-+ public FileStatus[] listStatus(Path path) throws IOException {
-+   LOG.debug("listStatus:enter with path " + path);
-+   final Path[] entries = listPaths(makeAbsolute(path));
-+
-+   if (entries == null) {
-+     // no directory listing: a plain file lists as itself
-+     if (isFile(path)) {
-+       return new FileStatus[] { getFileStatus(path) };
-+     }
-+     return null;
-+   }
-+
-+   final FileStatus[] result = new FileStatus[entries.length];
-+   int slot = 0;
-+
-+   for (Path entry : entries) {
-+     result[slot++] = getFileStatus(entry);
-+   }
-+   LOG.debug("listStatus:exit");
-+   return result;
-+ }
-+
-+ /**
-+ * Set the permission bits of a path.
-+ * NOTE(review): whatever ceph_setPermission returns is ignored here, so a
-+ * failure is not reported to the caller -- confirm that is intended.
-+ * @param p The path to change.
-+ * @param permission The permissions to apply, passed to Ceph as a short.
-+ */
-+ @Override
-+ public void setPermission(Path p, FsPermission permission) throws IOException {
-+ LOG.debug(
-+ "setPermission:enter with path " + p + " and permissions " + permission);
-+ Path abs_path = makeAbsolute(p);
-+
-+ LOG.trace("setPermission:calling ceph_setpermission from Java");
-+ ceph.ceph_setPermission(getCephPath(abs_path), permission.toShort());
-+ LOG.debug("setPermission:exit");
-+ }
-+
-+ /**
-+  * Set access/modification times of a file.
-+  * @param p The path
-+  * @param mtime Set modification time in number of millis since Jan 1, 1970.
-+  * @param atime Set access time in number of millis since Jan 1, 1970.
-+  * @throws IOException if Ceph returns a negative result code.
-+  */
-+ @Override
-+ public void setTimes(Path p, long mtime, long atime) throws IOException {
-+   LOG.debug(
-+       "setTimes:enter with path " + p + " mtime:" + mtime + " atime:" + atime);
-+   final Path target = makeAbsolute(p);
-+
-+   LOG.trace("setTimes:calling ceph_setTimes from Java");
-+   final int rc = ceph.ceph_setTimes(getCephPath(target), mtime, atime);
-+
-+   if (rc >= 0) {
-+     LOG.debug("setTimes:exit");
-+     return;
-+   }
-+   throw new IOException(
-+       "Failed to set times on path " + target + " Error code: " + rc);
-+ }
-+
-+ /**
-+ * Create a new file and open an FSDataOutputStream that's connected to it.
-+ * The work happens in four steps: an existence test, creation of the
-+ * parent directories (only when the file does not exist yet), opening the
-+ * file for overwrite, and wrapping the handle in a stream.
-+ * @param path The file to create.
-+ * @param permission The permissions to apply to the file; the same bits
-+ * are also passed to ceph_mkdirs for any parent directories it creates.
-+ * @param overwrite If true, overwrite any existing file with
-+ * this name; otherwise don't.
-+ * @param bufferSize Ceph does internal buffering, but you can buffer
-+ * in the Java code too if you like.
-+ * @param replication Ignored by Ceph. This can be
-+ * configured via Ceph configuration.
-+ * @param blockSize Ignored by Ceph. You can set client-wide block sizes
-+ * via the fs.ceph.blockSize param if you like.
-+ * @param progress A Progressable to report back to; may be null.
-+ * Reporting is limited but exists.
-+ * @return An FSDataOutputStream pointing to the created file.
-+ * @throws IOException if the path is an
-+ * existing directory, or the path exists but overwrite is false, or the
-+ * parent directories cannot be created, or there is a
-+ * failure in attempting to open for overwrite with Ceph.
-+ */
-+ public FSDataOutputStream create(Path path,
-+ FsPermission permission,
-+ boolean overwrite,
-+ int bufferSize,
-+ short replication,
-+ long blockSize,
-+ Progressable progress) throws IOException {
-+ LOG.debug("create:enter with path " + path);
-+ Path abs_path = makeAbsolute(path);
-+
-+ if (progress != null) {
-+ progress.progress();
-+ }
-+ // We ignore replication since that's not configurable here, and
-+ // progress reporting is quite limited.
-+ // Required semantics: if the file exists, overwrite if 'overwrite' is set;
-+ // otherwise, throw an exception
-+
-+ // Step 1: existence test
-+ boolean exists = exists(abs_path);
-+
-+ if (exists) {
-+ if (getFileStatus(abs_path).isDir()) {
-+ throw new IOException(
-+ "create: Cannot overwrite existing directory \"" + path.toString()
-+ + "\" with a file");
-+ }
-+ if (!overwrite) {
-+ throw new IOException(
-+ "createRaw: Cannot open existing file \"" + abs_path.toString()
-+ + "\" for writing without overwrite flag");
-+ }
-+ }
-+
-+ if (progress != null) {
-+ progress.progress();
-+ }
-+
-+ // Step 2: create any nonexistent directories in the path
-+ if (!exists) {
-+ Path parent = abs_path.getParent();
-+
-+ if (parent != null) { // if parent is root, we're done
-+ int r = ceph.ceph_mkdirs(getCephPath(parent), permission.toShort());
-+
-+ if (!(r == 0 || r == -ceph.EEXIST)) { // an already existing parent is fine
-+ throw new IOException("Error creating parent directory; code: " + r);
-+ }
-+ }
-+ if (progress != null) {
-+ progress.progress();
-+ }
-+ }
-+ // Step 3: open the file
-+ LOG.trace("calling ceph_open_for_overwrite from Java");
-+ int fh = ceph.ceph_open_for_overwrite(getCephPath(abs_path),
-+ (int) permission.toShort());
-+
-+ if (progress != null) {
-+ progress.progress();
-+ }
-+ LOG.trace("Returned from ceph_open_for_overwrite to Java with fh " + fh);
-+ if (fh < 0) { // negative handle means the open failed
-+ throw new IOException(
-+ "create: Open for overwrite failed on path \"" + path.toString()
-+ + "\"");
-+ }
-+
-+ // Step 4: create the stream
-+ OutputStream cephOStream = new CephOutputStream(getConf(), ceph, fh,
-+ bufferSize);
-+
-+ LOG.debug("create:exit");
-+ return new FSDataOutputStream(cephOStream, statistics);
-+ }
-+
-+ /**
-+  * Open a Ceph file and attach the file handle to an FSDataInputStream.
-+  * The file is stat'ed once after the open; if anything goes wrong after
-+  * the handle has been obtained, the handle is closed again before the
-+  * exception propagates.
-+  * @param path The file to open
-+  * @param bufferSize Ceph does internal buffering; but you can buffer in
-+  * the Java code too if you like.
-+  * @return FSDataInputStream reading from the given path.
-+  * @throws FileNotFoundException if the path does not exist or cannot be
-+  * stat'ed.
-+  * @throws IOException if the path is a directory, or there is an error
-+  * getting data to set up the FSDataInputStream.
-+  */
-+ public FSDataInputStream open(Path path, int bufferSize) throws IOException {
-+   LOG.debug("open:enter with path " + path);
-+   Path abs_path = makeAbsolute(path);
-+   String cephPath = getCephPath(abs_path);
-+
-+   int fh = ceph.ceph_open_for_read(cephPath);
-+
-+   if (fh < 0) { // uh-oh, something's bad!
-+     if (fh == -ceph.ENOENT) { // well that was a stupid open
-+       throw new FileNotFoundException(
-+           "open: absolute path \"" + abs_path.toString()
-+           + "\" does not exist");
-+     }
-+     // hrm...the file exists but we can't open it :(
-+     throw new IOException("open: Failed to open file " + abs_path.toString());
-+   }
-+
-+   // From here on we own an open Ceph handle; it must not leak on any
-+   // failure path, so it is released in the finally block unless it has
-+   // been handed over to the returned stream.
-+   boolean handedOver = false;
-+
-+   try {
-+     Stat lstat = new Stat();
-+
-+     LOG.trace("open:calling ceph_stat from Java");
-+     boolean statted = ceph.ceph_stat(cephPath, lstat);
-+
-+     LOG.trace("open:returned to Java");
-+     if (!statted) {
-+       throw new FileNotFoundException(
-+           "open: File " + abs_path.toString()
-+           + " does not exist or could not be accessed");
-+     }
-+     if (lstat.is_dir) { // yes, it is possible to open Ceph directories
-+       // but that doesn't mean you should in Hadoop!
-+       throw new IOException(
-+           "open: absolute path \"" + abs_path.toString() + "\" is a directory!");
-+     }
-+     long size = lstat.size;
-+
-+     if (size < 0) {
-+       throw new IOException(
-+           "Failed to get file size for file " + abs_path.toString()
-+           + " but succeeded in opening file. Something bizarre is going on.");
-+     }
-+     FSInputStream cephIStream = new CephInputStream(getConf(), ceph, fh, size,
-+         bufferSize);
-+     FSDataInputStream result = new FSDataInputStream(cephIStream);
-+
-+     handedOver = true;
-+     LOG.debug("open:exit");
-+     return result;
-+   } finally {
-+     if (!handedOver) {
-+       ceph.ceph_close(fh);
-+     }
-+   }
-+ }
-+
-+ /**
-+ * Rename a file or directory.
-+ * If the plain Ceph rename fails and {@code dst} is an existing directory,
-+ * the rename is retried with {@code src} moved into that directory under
-+ * its own name (by calling this method again with the new destination).
-+ * @param src The current path of the file/directory
-+ * @param dst The new name for the path.
-+ * @return true if the rename succeeded, false otherwise.
-+ */
-+ @Override
-+ public boolean rename(Path src, Path dst) throws IOException {
-+ LOG.debug("rename:enter with src:" + src + " and dest:" + dst);
-+ Path abs_src = makeAbsolute(src);
-+ Path abs_dst = makeAbsolute(dst);
-+
-+ LOG.trace("calling ceph_rename from Java");
-+ boolean result = ceph.ceph_rename(getCephPath(abs_src), getCephPath(abs_dst));
-+
-+ if (!result) {
-+ boolean isDir = false;
-+ try {
-+ isDir = getFileStatus(abs_dst).isDir();
-+ } catch (FileNotFoundException e) {} // dst does not exist: isDir stays false
-+ if (isDir) { // move the srcdir into destdir
-+ LOG.debug("ceph_rename failed but dst is a directory!");
-+ Path new_dst = new Path(abs_dst, abs_src.getName());
-+
-+ result = rename(abs_src, new_dst);
-+ LOG.debug(
-+ "attempt to move " + abs_src.toString() + " to "
-+ + new_dst.toString() + "has result:" + result);
-+ }
-+ }
-+ LOG.debug("rename:exit with result: " + result);
-+ return result;
-+ }
-+
-+ /*
-+ * Attempt to convert an IP into its hostname
-+ */
-+ private String[] ips2Hosts(String[] ips) {
-+ ArrayList<String> hosts = new ArrayList<String>();
-+ for (String ip : ips) {
-+ try {
-+ String host = DNS.reverseDns(InetAddress.getByName(ip), CEPH_NAMESERVER);
-+ if (host.charAt(host.length()-1) == '.') {
-+ host = host.substring(0, host.length()-1);
-+ }
-+ hosts.add(host); /* append */
-+ } catch (Exception e) {
-+ LOG.error("reverseDns ["+ip+"] failed: "+ e);
-+ }
-+ }
-+ return hosts.toArray(new String[hosts.size()]);
-+ }
-+
-+ /**
-+  * Get a BlockLocation object for each block in a file.
-+  *
-+  * Note that this doesn't include port numbers in the name field as
-+  * Ceph handles slow/down servers internally. This data should be used
-+  * only for selecting which servers to run which jobs on.
-+  *
-+  * Every block touched by the byte range [start, start + len) gets an
-+  * entry, including a partially covered first or last block. The reported
-+  * offsets are aligned to block boundaries.
-+  *
-+  * @param file A FileStatus object corresponding to the file you want locations for.
-+  * @param start The offset of the first part of the file you are interested in.
-+  * @param len The amount of the file past the offset you are interested in.
-+  * @return A BlockLocation[] where each object corresponds to a block within
-+  * the given range; null if the file cannot be opened.
-+  * @throws IllegalArgumentException if start or len is negative.
-+  * @throws IOException if Ceph reports a non-positive block size or the
-+  * range covers more blocks than fit in an array.
-+  */
-+ @Override
-+ public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len) throws IOException {
-+   Path abs_path = makeAbsolute(file.getPath());
-+   String cephPath = getCephPath(abs_path);
-+
-+   if (start < 0 || len < 0) {
-+     throw new IllegalArgumentException("Invalid start or len parameter");
-+   }
-+
-+   int fh = ceph.ceph_open_for_read(cephPath);
-+   if (fh < 0) {
-+     LOG.error("getFileBlockLocations:got error " + fh + ", exiting and returning null!");
-+     return null;
-+   }
-+
-+   // the handle must be released even if a lookup below throws
-+   try {
-+     long blockSize = ceph.ceph_getblocksize(cephPath);
-+     if (blockSize <= 0) {
-+       throw new IOException(
-+           "getFileBlockLocations: invalid block size " + blockSize
-+           + " for " + abs_path);
-+     }
-+
-+     // Count the blocks touched by [start, start + len) with integer
-+     // arithmetic: whole blocks needed for len bytes, plus one more if the
-+     // offset of start inside its block pushes the range over a boundary.
-+     long offsetInBlock = start % blockSize;
-+     long blocks = 0;
-+     if (len > 0) {
-+       blocks = (len - 1) / blockSize + 1;
-+       if ((len - 1) % blockSize + offsetInBlock >= blockSize) {
-+         blocks++;
-+       }
-+     }
-+     if (blocks > Integer.MAX_VALUE) {
-+       throw new IOException(
-+           "getFileBlockLocations: range of " + len + " bytes covers too many blocks");
-+     }
-+
-+     BlockLocation[] locations = new BlockLocation[(int) blocks];
-+
-+     for (int i = 0; i < locations.length; ++i) {
-+       long offset = start + i * blockSize;
-+       long blockStart = offset - offsetInBlock; // aligned start of block i
-+       String ips[] = ceph.ceph_hosts(fh, offset);
-+       String hosts[] = ips2Hosts(ips);
-+       locations[i] = new BlockLocation(null, hosts, blockStart, blockSize);
-+       LOG.debug("getFileBlockLocations: location[" + i + "]: " + locations[i]);
-+     }
-+     return locations;
-+   } finally {
-+     ceph.ceph_close(fh);
-+   }
-+ }
-+
-+ /**
-+ * Delete the given path without recursing into directories.
-+ * Equivalent to {@code delete(path, false)}.
-+ * @param path the path to delete.
-+ * @return true if the delete succeeded, false otherwise.
-+ */
-+ @Deprecated
-+ public boolean delete(Path path) throws IOException {
-+ return delete(path, false);
-+ }
-+
-+ /**
-+ * Delete the given path, and optionally its children.
-+ * Files are unlinked directly. Directories are emptied entry by entry
-+ * (each entry deleted recursively) and then removed with ceph_rmdir; the
-+ * first entry that cannot be deleted aborts the operation, leaving
-+ * whatever was already removed gone.
-+ * @param path the path to delete.
-+ * @param recursive If the path is a non-empty directory and this is false,
-+ * delete will throw an IOException. If path is a file this is ignored.
-+ * @return true if the delete succeeded, false otherwise (including if
-+ * path doesn't exist).
-+ * @throws IOException if you attempt to non-recursively delete a non-empty
-+ * directory, or you attempt to delete the root directory.
-+ */
-+ public boolean delete(Path path, boolean recursive) throws IOException {
-+ LOG.debug("delete:enter with path " + path + " and recursive=" + recursive);
-+ Path abs_path = makeAbsolute(path);
-+
-+ // sanity check
-+ if (abs_path.equals(root)) {
-+ throw new IOException("Error: deleting the root directory is a Bad Idea.");
-+ }
-+ if (!exists(abs_path)) {
-+ return false;
-+ }
-+
-+ // if the path is a file, try to delete it.
-+ if (isFile(abs_path)) {
-+ LOG.trace("delete:calling ceph_unlink from Java with path " + abs_path);
-+ boolean result = ceph.ceph_unlink(getCephPath(abs_path));
-+
-+ if (!result) {
-+ LOG.error(
-+ "delete: failed to delete file \"" + abs_path.toString() + "\".");
-+ }
-+ LOG.debug("delete:exit with success=" + result);
-+ return result;
-+ }
-+
-+ /* The path is a directory, so recursively try to delete its contents,
-+ and then delete the directory. */
-+ // get the entries; listPaths will remove . and .. for us
-+ Path[] contents = listPaths(abs_path);
-+
-+ if (contents == null) {
-+ LOG.error(
-+ "delete: Failed to read contents of directory \""
-+ + abs_path.toString() + "\" while trying to delete it, BAILING");
-+ return false;
-+ }
-+ if (!recursive && contents.length > 0) {
-+ throw new IOException("Directories must be deleted recursively!");
-+ }
-+ // delete the entries
-+ LOG.debug("delete: recursively calling delete on contents of " + abs_path);
-+ for (Path p : contents) {
-+ if (!delete(p, true)) {
-+ LOG.error(
-+ "delete: Failed to delete file \"" + p.toString()
-+ + "\" while recursively deleting \"" + abs_path.toString()
-+ + "\", BAILING");
-+ return false;
-+ }
-+ }
-+ // if we've come this far it's a now-empty directory, so delete it!
-+ boolean result = ceph.ceph_rmdir(getCephPath(abs_path));
-+
-+ if (!result) {
-+ LOG.error(
-+ "delete: failed to delete \"" + abs_path.toString() + "\", BAILING");
-+ }
-+ LOG.debug("delete:exit");
-+ return result;
-+ }
-+
-+ /**
-+ * Returns the default replication value of 1. This may
-+ * NOT be the actual value, as replication is controlled
-+ * by a separate Ceph configuration.
-+ * @return always 1.
-+ */
-+ @Override
-+ public short getDefaultReplication() {
-+ return 1;
-+ }
-+
-+ /**
-+ * Get the default block size, taken from the fs.ceph.blockSize
-+ * configuration property.
-+ * @return the default block size, in bytes, as a long; 1 << 26 (64 MiB)
-+ * when the property is unset.
-+ */
-+ @Override
-+ public long getDefaultBlockSize() {
-+ return getConf().getInt("fs.ceph.blockSize", 1 << 26);
-+ }
-+
-+ /**
-+  * Resolve a path against the working directory unless it is already
-+  * absolute. Any URI scheme on the path is left in place here; it is
-+  * stripped only where the bare path is needed (e.g. for the Ceph
-+  * native calls).
-+  */
-+ private Path makeAbsolute(Path path) {
-+   return path.isAbsolute() ? path : new Path(workingDir, path);
-+ }
-+
-+ private Path[] listPaths(Path path) throws IOException {
-+ LOG.debug("listPaths:enter with path " + path);
-+ String dirlist[];
-+
-+ Path abs_path = makeAbsolute(path);
-+
-+ // If it's a directory, get the listing. Otherwise, complain and give up.
-+ LOG.debug("calling ceph_getdir from Java with path " + abs_path);
-+ dirlist = ceph.ceph_getdir(getCephPath(abs_path));
-+ LOG.debug("returning from ceph_getdir to Java");
-+
-+ if (dirlist == null) {
-+ return null;
-+ }
-+
-+ // convert the strings to Paths
-+ Path[] paths = new Path[dirlist.length];
-+
-+ for (int i = 0; i < dirlist.length; ++i) {
-+ LOG.trace(
-+ "Raw enumeration of paths in \"" + abs_path.toString() + "\": \""
-+ + dirlist[i] + "\"");
-+ // convert each listing to an absolute path
-+ Path raw_path = new Path(dirlist[i]);
-+
-+ if (raw_path.isAbsolute()) {
-+ paths[i] = raw_path;
-+ } else {
-+ paths[i] = new Path(abs_path, raw_path);
-+ }
-+ }
-+ LOG.debug("listPaths:exit");
-+ return paths;
-+ }
-+
-+ /**
-+ * Plain holder for the result of a ceph_stat call. An instance is
-+ * passed to ceph_stat, and getFileStatus copies the fields below into a
-+ * Hadoop FileStatus.
-+ */
-+ static class Stat {
-+ public long size; // file length
-+ public boolean is_dir; // true if the path is a directory
-+ public long block_size; // block size reported for the file
-+ public long mod_time; // modification time
-+ public long access_time; // access time
-+ public int mode; // permission bits; narrowed to short for FsPermission
-+
-+ public Stat() {}
-+ }
-+}
-diff --git a/src/core/org/apache/hadoop/fs/ceph/CephInputStream.java b/src/core/org/apache/hadoop/fs/ceph/CephInputStream.java
-new file mode 100644
-index 0000000..d9668d0
---- /dev/null
-+++ b/src/core/org/apache/hadoop/fs/ceph/CephInputStream.java
-@@ -0,0 +1,254 @@
-+// -*- mode:Java; tab-width:2; c-basic-offset:2; indent-tabs-mode:t -*-
-+
-+/**
-+ *
-+ * Licensed under the Apache License, Version 2.0
-+ * (the "License"); you may not use this file except in compliance with
-+ * the License. You may obtain a copy of the License at
-+ *
-+ * http://www.apache.org/licenses/LICENSE-2.0
-+ *
-+ * Unless required by applicable law or agreed to in writing, software
-+ * distributed under the License is distributed on an "AS IS" BASIS,
-+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-+ * implied. See the License for the specific language governing
-+ * permissions and limitations under the License.
-+ *
-+ *
-+ * Implements the Hadoop FS interfaces to allow applications to store
-+ * files in Ceph.
-+ */
-+package org.apache.hadoop.fs.ceph;
-+
-+
-+import java.io.IOException;
-+
-+import org.apache.commons.logging.Log;
-+import org.apache.commons.logging.LogFactory;
-+import org.apache.hadoop.conf.Configuration;
-+import org.apache.hadoop.fs.FSInputStream;
-+
-+
-+/**
-+ * <p>
-+ * An {@link FSInputStream} for a CephFileSystem and corresponding
-+ * Ceph instance.
-+ */
-+public class CephInputStream extends FSInputStream {
-+ private static final Log LOG = LogFactory.getLog(CephInputStream.class);
-+ private boolean closed;
-+
-+ private int fileHandle;
-+
-+ private long fileLength;
-+
-+ private CephFS ceph;
-+
-+ private byte[] buffer;
-+ private int bufPos = 0;
-+ private int bufValid = 0;
-+ private long cephPos = 0;
-+
-+ /**
-+ * Create a new CephInputStream.
-+ * @param conf The system configuration. Unused.
-+ * @param cephfs The CephFS object all reads, seeks and the final close
-+ * are issued through.
-+ * @param fh The filehandle provided by Ceph to reference.
-+ * @param flength The current length of the file. If the length changes
-+ * you will need to close and re-open it to access the new data.
-+ * @param bufferSize Size in bytes of the internal read buffer allocated
-+ * by this stream.
-+ */
-+ public CephInputStream(Configuration conf, CephFS cephfs,
-+ int fh, long flength, int bufferSize) {
-+ // Whoever's calling the constructor is responsible for doing the actual ceph_open
-+ // call and providing the file handle.
-+ fileLength = flength;
-+ fileHandle = fh;
-+ closed = false;
-+ ceph = cephfs;
-+ buffer = new byte[bufferSize];
-+ LOG.debug(
-+ "CephInputStream constructor: initializing stream with fh " + fh
-+ + " and file length " + flength);
-+
-+ }
-+
-+ /** Ceph likes things to be closed before it shuts down,
-+ * so closing the IOStream stuff voluntarily in a finalizer is good.
-+ * Closes the stream if the owner never did; super.finalize() runs
-+ * whether or not that close throws.
-+ */
-+ protected void finalize() throws Throwable {
-+ try {
-+ if (!closed) {
-+ close();
-+ }
-+ } finally {
-+ super.finalize();
-+ }
-+ }
-+
-+ /**
-+ * Refill the internal buffer from Ceph, discarding whatever it held.
-+ * On success bufPos is reset to 0, bufValid is the number of bytes
-+ * fetched and cephPos advances by that amount.
-+ * @return true if at least one byte was read, false if Ceph returned 0.
-+ * @throws IOException if ceph_read returns a negative value; the buffer
-+ * is left empty and a best-effort seek back to cephPos is attempted.
-+ */
-+ private synchronized boolean fillBuffer() throws IOException {
-+ bufValid = ceph.ceph_read(fileHandle, buffer, 0, buffer.length);
-+ bufPos = 0;
-+ if (bufValid < 0) {
-+ int err = bufValid;
-+
-+ bufValid = 0;
-+ // attempt to reset to old position. If it fails, too bad.
-+ ceph.ceph_seek_from_start(fileHandle, cephPos);
-+ throw new IOException("Failed to fill read buffer! Error code:" + err);
-+ }
-+ cephPos += bufValid;
-+ return (bufValid != 0);
-+ }
-+
-+ /*
-+ * Get the current position of the stream: the position Ceph has read up
-+ * to (cephPos), minus the bytes in the buffer (bufValid), plus the bytes
-+ * of the buffer already handed out (bufPos).
-+ */
-+ public synchronized long getPos() throws IOException {
-+ return cephPos - bufValid + bufPos;
-+ }
-+
-+ /**
-+ * Find the number of bytes remaining in the file.
-+ */
-+ @Override
-+ public synchronized int available() throws IOException {
-+ return (int) (fileLength - getPos());
-+ }
-+
-+ /**
-+ * Move the stream to an absolute position in the file. The internal
-+ * buffer is always discarded, so the next read goes back to Ceph.
-+ * @param targetPos the position to seek to; must not exceed the file
-+ * length given at construction time.
-+ * @throws IOException if targetPos is past the end of the file, or if
-+ * Ceph returns a negative position. In the latter case cephPos is put
-+ * back to its previous value, but the buffer has already been dropped.
-+ */
-+ public synchronized void seek(long targetPos) throws IOException {
-+ LOG.trace(
-+ "CephInputStream.seek: Seeking to position " + targetPos + " on fd "
-+ + fileHandle);
-+ if (targetPos > fileLength) {
-+ throw new IOException(
-+ "CephInputStream.seek: failed seek to position " + targetPos
-+ + " on fd " + fileHandle + ": Cannot seek after EOF " + fileLength);
-+ }
-+ long oldPos = cephPos;
-+
-+ cephPos = ceph.ceph_seek_from_start(fileHandle, targetPos);
-+ bufValid = 0;
-+ bufPos = 0;
-+ if (cephPos < 0) {
-+ cephPos = oldPos;
-+ throw new IOException("Ceph failed to seek to new position!");
-+ }
-+ }
-+
-+ /**
-+ * Failovers are handled by the Ceph code at a very low level;
-+ * if there are issues that can be solved by changing sources
-+ * they'll be dealt with before anybody even tries to call this method!
-+ * @param targetPos ignored.
-+ * @return false, always; this method never switches sources.
-+ */
-+ public synchronized boolean seekToNewSource(long targetPos) {
-+ return false;
-+ }
-+
-+ /**
-+  * Read a byte from the file.
-+  * @return the next byte as a value in the range 0..255, or -1 if the end
-+  * of the file has been reached or no byte could be obtained.
-+  * @throws IOException if the stream is closed or the underlying read fails.
-+  */
-+ @Override
-+ public synchronized int read() throws IOException {
-+   LOG.trace(
-+       "CephInputStream.read: Reading a single byte from fd " + fileHandle
-+       + " by calling general read function");
-+
-+   if (getPos() >= fileLength) {
-+     return -1;
-+   }
-+
-+   byte result[] = new byte[1];
-+
-+   // Anything other than exactly one byte means there is nothing to hand
-+   // out: -1 signals EOF, and 0 means Ceph delivered no data. Returning
-+   // result[0] in the latter case would fabricate a 0 byte.
-+   if (read(result, 0, 1) != 1) {
-+     return -1;
-+   }
-+   // mask to map the signed byte onto 0..255
-+   return result[0] & 0xFF;
-+ }
-+
-+ /**
-+  * Read up to a specified number of bytes from the file into a byte[].
-+  * Bytes are served from the internal buffer, which is refilled from Ceph
-+  * as often as needed until the request is satisfied or Ceph has no more
-+  * data.
-+  * @param buf the byte array to read into.
-+  * @param off the offset in buf at which to start storing bytes
-+  * @param len the maximum number of bytes to read
-+  * @return the number of bytes actually read, or -1 if the stream is at
-+  * (or past) the end of the file or Ceph delivered no data at all for a
-+  * non-empty request.
-+  * @throws IOException if the stream is closed, on bad input (null buf or
-+  * indices out of bounds), or if the underlying Ceph read fails.
-+  */
-+ @Override
-+ public synchronized int read(byte buf[], int off, int len)
-+     throws IOException {
-+   LOG.trace(
-+       "CephInputStream.read: Reading " + len + " bytes from fd " + fileHandle);
-+
-+   if (closed) {
-+     throw new IOException(
-+         "CephInputStream.read: cannot read " + len + " bytes from fd "
-+         + fileHandle + ": stream closed");
-+   }
-+
-+   // ensure we're not past the end of the file
-+   if (getPos() >= fileLength) {
-+     LOG.debug(
-+         "CephInputStream.read: cannot read " + len + " bytes from fd "
-+         + fileHandle + ": current position is " + getPos()
-+         + " and file length is " + fileLength);
-+
-+     return -1;
-+   }
-+
-+   int totalRead = 0;
-+   int initialLen = len;
-+   int read;
-+
-+   do {
-+     // hand out whatever is still buffered, then refill if more is wanted
-+     read = Math.min(len, bufValid - bufPos);
-+     try {
-+       System.arraycopy(buffer, bufPos, buf, off, read);
-+     } catch (IndexOutOfBoundsException ie) {
-+       throw new IOException(
-+           "CephInputStream.read: Indices out of bounds:" + "read length is "
-+           + len + ", buffer offset is " + off + ", and buffer size is "
-+           + buf.length);
-+     } catch (ArrayStoreException ae) {
-+       throw new IOException(
-+           "Uh-oh, CephInputStream failed to do an array"
-+           + "copy due to type mismatch...");
-+     } catch (NullPointerException ne) {
-+       throw new IOException(
-+           "CephInputStream.read: cannot read " + len + "bytes from fd:"
-+           + fileHandle + ": buf is null");
-+     }
-+     bufPos += read;
-+     len -= read;
-+     off += read;
-+     totalRead += read;
-+   } while (len > 0 && fillBuffer());
-+
-+   LOG.trace(
-+       "CephInputStream.read: Reading " + initialLen + " bytes from fd "
-+       + fileHandle + ": succeeded in reading " + totalRead + " bytes");
-+
-+   // The InputStream contract only allows a 0 return for a 0-length
-+   // request. If bytes were wanted but Ceph had none (e.g. the file is
-+   // shorter than the length recorded at open), report end of stream so
-+   // that callers looping until -1 terminate.
-+   if (totalRead == 0 && initialLen > 0) {
-+     return -1;
-+   }
-+   return totalRead;
-+ }
-+
-+ /**
-+ * Close the CephInputStream and release the associated filehandle.
-+ */
-+ @Override
-+ public void close() throws IOException {
-+ LOG.trace("CephOutputStream.close:enter");
-+ if (!closed) {
-+ int result = ceph.ceph_close(fileHandle);
-+
-+ closed = true;
-+ if (result != 0) {
-+ throw new IOException(
-+ "Close somehow failed!"
-+ + "Don't try and use this stream again, though");
-+ }
-+ LOG.trace("CephOutputStream.close:exit");
-+ }
-+ }
-+}
-diff --git a/src/core/org/apache/hadoop/fs/ceph/CephOutputStream.java b/src/core/org/apache/hadoop/fs/ceph/CephOutputStream.java
-new file mode 100644
-index 0000000..4c50f88
---- /dev/null
-+++ b/src/core/org/apache/hadoop/fs/ceph/CephOutputStream.java
-@@ -0,0 +1,219 @@
-+// -*- mode:Java; tab-width:2; c-basic-offset:2; indent-tabs-mode:t -*-
-+
-+/**
-+ *
-+ * Licensed under the Apache License, Version 2.0
-+ * (the "License"); you may not use this file except in compliance with
-+ * the License. You may obtain a copy of the License at
-+ *
-+ * http://www.apache.org/licenses/LICENSE-2.0
-+ *
-+ * Unless required by applicable law or agreed to in writing, software
-+ * distributed under the License is distributed on an "AS IS" BASIS,
-+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-+ * implied. See the License for the specific language governing
-+ * permissions and limitations under the License.
-+ *
-+ *
-+ * Implements the Hadoop FS interfaces to allow applications to store
-+ * files in Ceph.
-+ */
-+
-+package org.apache.hadoop.fs.ceph;
-+
-+
-+import java.io.IOException;
-+import java.io.OutputStream;
-+
-+import org.apache.commons.logging.Log;
-+import org.apache.commons.logging.LogFactory;
-+import org.apache.hadoop.conf.Configuration;
-+import org.apache.hadoop.util.Progressable;
-+
-+
-+/**
-+ * <p>
-+ * An {@link OutputStream} for a CephFileSystem and corresponding
-+ * Ceph instance.
-+ */
-+public class CephOutputStream extends OutputStream {
-+ private static final Log LOG = LogFactory.getLog(CephOutputStream.class);
-+ private boolean closed;
-+
-+ private CephFS ceph;
-+
-+ private int fileHandle;
-+
-+ private byte[] buffer;
-+ private int bufUsed = 0;
-+
-+ /**
-+ * Construct the CephOutputStream.
-+ * @param conf The FileSystem configuration.
-+ * @param fh The Ceph filehandle to connect to.
-+ */
-+ public CephOutputStream(Configuration conf, CephFS cephfs,
-+ int fh, int bufferSize) {
-+ ceph = cephfs;
-+ fileHandle = fh;
-+ closed = false;
-+ buffer = new byte[bufferSize];
-+ }
-+
-+ /** Ceph likes things to be closed before it shuts down,
-+ *so closing the IOStream stuff voluntarily is good
-+ */
-+ protected void finalize() throws Throwable {
-+ try {
-+ if (!closed) {
-+ close();
-+ }
-+ } finally {
-+ super.finalize();
-+ }
-+ }
-+
-+ /**
-+ * Get the current position in the file.
-+ * @return The file offset in bytes.
-+ */
-+ public long getPos() throws IOException {
-+ return ceph.ceph_getpos(fileHandle);
-+ }
-+
-+ /**
-+ * Write a byte.
-+ * @param b The byte to write.
-+ * @throws IOException If you have closed the CephOutputStream or the
-+ * write fails.
-+ */
-+ @Override
-+ public synchronized void write(int b) throws IOException {
-+ LOG.trace(
-+ "CephOutputStream.write: writing a single byte to fd " + fileHandle);
-+
-+ if (closed) {
-+ throw new IOException(
-+ "CephOutputStream.write: cannot write " + "a byte to fd " + fileHandle
-+ + ": stream closed");
-+ }
-+ // Stick the byte in a buffer and write it
-+ byte buf[] = new byte[1];
-+
-+ buf[0] = (byte) b;
-+ write(buf, 0, 1);
-+ return;
-+ }
-+
-+ /**
-+ * Write a byte buffer into the Ceph file.
-+ * @param buf the byte array to write from
-+ * @param off the position in the file to start writing at.
-+ * @param len The number of bytes to actually write.
-+ * @throws IOException if you have closed the CephOutputStream, or
-+ * if buf is null or off + len > buf.length, or
-+ * if the write fails due to a Ceph error.
-+ */
-+ @Override
-+ public synchronized void write(byte buf[], int off, int len) throws IOException {
-+ LOG.trace(
-+ "CephOutputStream.write: writing " + len + " bytes to fd " + fileHandle);
-+ // make sure stream is open
-+ if (closed) {
-+ throw new IOException(
-+ "CephOutputStream.write: cannot write " + len + "bytes to fd "
-+ + fileHandle + ": stream closed");
-+ }
-+
-+ int result;
-+ int write;
-+
-+ while (len > 0) {
-+ write = Math.min(len, buffer.length - bufUsed);
-+ try {
-+ System.arraycopy(buf, off, buffer, bufUsed, write);
-+ } catch (IndexOutOfBoundsException ie) {
-+ throw new IOException(
-+ "CephOutputStream.write: Indices out of bounds: "
-+ + "write length is " + len + ", buffer offset is " + off
-+ + ", and buffer size is " + buf.length);
-+ } catch (ArrayStoreException ae) {
-+ throw new IOException(
-+ "Uh-oh, CephOutputStream failed to do an array"
-+ + " copy due to type mismatch...");
-+ } catch (NullPointerException ne) {
-+ throw new IOException(
-+ "CephOutputStream.write: cannot write " + len + "bytes to fd "
-+ + fileHandle + ": buffer is null");
-+ }
-+ bufUsed += write;
-+ len -= write;
-+ off += write;
-+ if (bufUsed == buffer.length) {
-+ result = ceph.ceph_write(fileHandle, buffer, 0, bufUsed);
-+ if (result < 0) {
-+ throw new IOException(
-+ "CephOutputStream.write: Buffered write of " + bufUsed
-+ + " bytes failed!");
-+ }
-+ if (result != bufUsed) {
-+ throw new IOException(
-+ "CephOutputStream.write: Wrote only " + result + " bytes of "
-+ + bufUsed + " in buffer! Data may be lost or written"
-+ + " twice to Ceph!");
-+ }
-+ bufUsed = 0;
-+ }
-+
-+ }
-+ return;
-+ }
-+
-+ /**
-+ * Flush the buffered data.
-+ * @throws IOException if you've closed the stream or the write fails.
-+ */
-+ @Override
-+ public synchronized void flush() throws IOException {
-+ if (!closed) {
-+ if (bufUsed == 0) {
-+ return;
-+ }
-+ int result = ceph.ceph_write(fileHandle, buffer, 0, bufUsed);
-+
-+ if (result < 0) {
-+ throw new IOException(
-+ "CephOutputStream.write: Write of " + bufUsed + "bytes to fd "
-+ + fileHandle + " failed");
-+ }
-+ if (result != bufUsed) {
-+ throw new IOException(
-+ "CephOutputStream.write: Write of " + bufUsed + "bytes to fd "
-+ + fileHandle + "was incomplete: only " + result + " of " + bufUsed
-+ + " bytes were written.");
-+ }
-+ bufUsed = 0;
-+ return;
-+ }
-+ }
-+
-+ /**
-+ * Close the CephOutputStream.
-+ * @throws IOException if Ceph somehow returns an error. In current code it can't.
-+ */
-+ @Override
-+ public synchronized void close() throws IOException {
-+ LOG.trace("CephOutputStream.close:enter");
-+ if (!closed) {
-+ flush();
-+ int result = ceph.ceph_close(fileHandle);
-+
-+ if (result != 0) {
-+ throw new IOException("Close failed!");
-+ }
-+
-+ closed = true;
-+ LOG.trace("CephOutputStream.close:exit");
-+ }
-+ }
-+}
-diff --git a/src/core/org/apache/hadoop/fs/ceph/CephTalker.java b/src/core/org/apache/hadoop/fs/ceph/CephTalker.java
-new file mode 100644
-index 0000000..569652f
---- /dev/null
-+++ b/src/core/org/apache/hadoop/fs/ceph/CephTalker.java
-@@ -0,0 +1,91 @@
-+// -*- mode:Java; tab-width:2; c-basic-offset:2; indent-tabs-mode:t -*-
-+
-+/**
-+ *
-+ * Licensed under the Apache License, Version 2.0
-+ * (the "License"); you may not use this file except in compliance with
-+ * the License. You may obtain a copy of the License at
-+ *
-+ * http://www.apache.org/licenses/LICENSE-2.0
-+ *
-+ * Unless required by applicable law or agreed to in writing, software
-+ * distributed under the License is distributed on an "AS IS" BASIS,
-+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-+ * implied. See the License for the specific language governing
-+ * permissions and limitations under the License.
-+ *
-+ *
-+ * Wraps a number of native function calls to communicate with the Ceph
-+ * filesystem.
-+ */
-+package org.apache.hadoop.fs.ceph;
-+
-+
-+import org.apache.hadoop.conf.Configuration;
-+import org.apache.commons.logging.Log;
-+
-+
-+class CephTalker extends CephFS {
-+ // JNI doesn't give us any way to store pointers, so use a long.
-+ // Here we're assuming pointers aren't longer than 8 bytes.
-+ long cluster;
-+
-+ // we write a constructor so we can load the libraries
-+ public CephTalker(Configuration conf, Log log) {
-+ System.load(conf.get("fs.ceph.libDir") + "/libcephfs.so");
-+ System.load(conf.get("fs.ceph.libDir") + "/libhadoopcephfs.so");
-+ cluster = 0;
-+ }
-+
-+ protected native boolean ceph_initializeClient(String arguments, int block_size);
-+
-+ protected native String ceph_getcwd();
-+
-+ protected native boolean ceph_setcwd(String path);
-+
-+ protected native boolean ceph_rmdir(String path);
-+
-+ protected native boolean ceph_unlink(String path);
-+
-+ protected native boolean ceph_rename(String old_path, String new_path);
-+
-+ protected native boolean ceph_exists(String path);
-+
-+ protected native long ceph_getblocksize(String path);
-+
-+ protected native boolean ceph_isdirectory(String path);
-+
-+ protected native boolean ceph_isfile(String path);
-+
-+ protected native String[] ceph_getdir(String path);
-+
-+ protected native int ceph_mkdirs(String path, int mode);
-+
-+ protected native int ceph_open_for_append(String path);
-+
-+ protected native int ceph_open_for_read(String path);
-+
-+ protected native int ceph_open_for_overwrite(String path, int mode);
-+
-+ protected native int ceph_close(int filehandle);
-+
-+ protected native boolean ceph_setPermission(String path, int mode);
-+
-+ protected native boolean ceph_kill_client();
-+
-+ protected native boolean ceph_stat(String path, CephFileSystem.Stat fill);
-+
-+ protected native int ceph_replication(String Path);
-+
-+ protected native String[] ceph_hosts(int fh, long offset);
-+
-+ protected native int ceph_setTimes(String path, long mtime, long atime);
-+
-+ protected native long ceph_getpos(int fh);
-+
-+ protected native int ceph_write(int fh, byte[] buffer, int buffer_offset, int length);
-+
-+ protected native int ceph_read(int fh, byte[] buffer, int buffer_offset, int length);
-+
-+ protected native long ceph_seek_from_start(int fh, long pos);
-+}
-diff --git a/src/mapred/org/apache/hadoop/filecache/TrackerDistributedCacheManager.java b/src/mapred/org/apache/hadoop/filecache/TrackerDistributedCacheManager.java
-index 9e22f1f..cd55361 100644
---- a/src/mapred/org/apache/hadoop/filecache/TrackerDistributedCacheManager.java
-+++ b/src/mapred/org/apache/hadoop/filecache/TrackerDistributedCacheManager.java
-@@ -386,10 +386,12 @@ public class TrackerDistributedCacheManager {
- if (modifiedTime != desiredTimestamp) {
- DateFormat df = DateFormat.getDateTimeInstance(DateFormat.SHORT,
- DateFormat.SHORT);
-+ /*
- throw new IOException("The distributed cache object " + source +
- " changed during the job from " +
- df.format(new Date(desiredTimestamp)) + " to " +
- df.format(new Date(modifiedTime)));
-+ */
- }
-
- Path parchive = null;
-diff --git a/src/test/commit-tests b/src/test/commit-tests
-index 1148c8b..85fa53d 100644
---- a/src/test/commit-tests
-+++ b/src/test/commit-tests
-@@ -53,6 +53,7 @@
- **/TestRPC.java
- **/TestS3Credentials.java
- **/TestS3FileSystem.java
-+**/TestCeph.java
- **/TestSaslRPC.java
- **/TestScriptBasedMapping.java
- **/TestSequenceFileSerialization.java
-diff --git a/src/test/org/apache/hadoop/fs/ceph/TestCeph.java b/src/test/org/apache/hadoop/fs/ceph/TestCeph.java
-new file mode 100644
-index 0000000..e46b0ee
---- /dev/null
-+++ b/src/test/org/apache/hadoop/fs/ceph/TestCeph.java
-@@ -0,0 +1,45 @@
-+// -*- mode:Java; tab-width:2; c-basic-offset:2; indent-tabs-mode:t -*-
-+
-+/**
-+ * Licensed to the Apache Software Foundation (ASF) under one
-+ * or more contributor license agreements. See the NOTICE file
-+ * distributed with this work for additional information
-+ * regarding copyright ownership. The ASF licenses this file
-+ * to you under the Apache License, Version 2.0 (the
-+ * "License"); you may not use this file except in compliance
-+ * with the License. You may obtain a copy of the License at
-+ *
-+ * http://www.apache.org/licenses/LICENSE-2.0
-+ *
-+ * Unless required by applicable law or agreed to in writing, software
-+ * distributed under the License is distributed on an "AS IS" BASIS,
-+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+ * See the License for the specific language governing permissions and
-+ * limitations under the License.
-+ *
-+ * Unit tests for the CephFileSystem API implementation.
-+ */
-+
-+package org.apache.hadoop.fs.ceph;
-+
-+
-+import java.io.IOException;
-+import java.net.URI;
-+import org.apache.hadoop.conf.Configuration;
-+import org.apache.hadoop.fs.FileSystemContractBaseTest;
-+import org.apache.hadoop.fs.FileSystem;
-+import org.apache.hadoop.fs.Path;
-+
-+
-+public class TestCeph extends FileSystemContractBaseTest {
-+
-+ @Override
-+ protected void setUp() throws IOException {
-+ Configuration conf = new Configuration();
-+ CephFaker cephfaker = new CephFaker(conf, FileSystem.LOG);
-+ CephFileSystem cephfs = new CephFileSystem(cephfaker);
-+
-+ cephfs.initialize(URI.create("ceph://null"), conf);
-+ fs = cephfs;
-+ }
-+}
diff --git a/src/client/hadoop/Readme b/src/client/hadoop/Readme
deleted file mode 100644
index 2967b96cf5a..00000000000
--- a/src/client/hadoop/Readme
+++ /dev/null
@@ -1,17 +0,0 @@
-This directory contains:
-CephFSInterface.cc/h: A C++ JNI library used by the Hadoop Java code.
-ceph: A directory containing all the Java source files for a
-Hadoop-compliant CephFileSystem.
-HADOOP-ceph.patch: A patch for Hadoop. It should apply fine to one of the
-.20 branches. (It was generated against .20.205.0) This
-patch adds in all the files contained in the ceph dir as well as making
-some changes so that Hadoop's configuration code will recognize the
-CephFileSystem properties and classes. It is possible that this will be
-out-of-date compared to the files contained in the ceph dir, so you
-should apply this patch and then copy ceph/* into the appropriate Hadoop
-dir.
-
-There are also a number of javah-generated C header files which are used
-in writing CephFSInterface but can be safely ignored otherwise.
-
-Configuration instructions are included in Javadoc format in the ceph dir.
diff --git a/src/client/hadoop/ceph/CephFS.java b/src/client/hadoop/ceph/CephFS.java
deleted file mode 100644
index 5d51eb21600..00000000000
--- a/src/client/hadoop/ceph/CephFS.java
+++ /dev/null
@@ -1,250 +0,0 @@
-// -*- mode:Java; tab-width:2; c-basic-offset:2; indent-tabs-mode:t -*-
-
-/**
- *
- * Licensed under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- * implied. See the License for the specific language governing
- * permissions and limitations under the License.
- *
- *
- * Abstract base class for communicating with a Ceph filesystem and its
- * C++ codebase from Java, or pretending to do so (for unit testing purposes).
- * As only the Ceph package should be using this directly, all methods
- * are protected.
- */
-package org.apache.hadoop.fs.ceph;
-
-import org.apache.hadoop.conf.Configuration;
-
-abstract class CephFS {
-
- protected static final int ENOTDIR = 20;
- protected static final int EEXIST = 17;
- protected static final int ENOENT = 2;
-
- /*
- * Performs any necessary setup to allow general use of the filesystem.
- * Inputs:
- * String argsuments -- a command-line style input of Ceph config params
- * int block_size -- the size in bytes to use for blocks
- * Returns: true on success, false otherwise
- */
- abstract protected boolean ceph_initializeClient(String arguments, int block_size);
-
- /*
- * Returns the current working directory (absolute) as a String
- */
- abstract protected String ceph_getcwd();
-
- /*
- * Changes the working directory.
- * Inputs:
- * String path: The path (relative or absolute) to switch to
- * Returns: true on success, false otherwise.
- */
- abstract protected boolean ceph_setcwd(String path);
-
- /*
- * Given a path to a directory, removes the directory if empty.
- * Inputs:
- * jstring j_path: The path (relative or absolute) to the directory
- * Returns: true on successful delete; false otherwise
- */
- abstract protected boolean ceph_rmdir(String path);
-
- /*
- * Given a path, unlinks it.
- * Inputs:
- * String path: The path (relative or absolute) to the file or empty dir
- * Returns: true if the unlink occurred, false otherwise.
- */
- abstract protected boolean ceph_unlink(String path);
-
- /*
- * Changes a given path name to a new name, assuming new_path doesn't exist.
- * Inputs:
- * jstring j_from: The path whose name you want to change.
- * jstring j_to: The new name for the path.
- * Returns: true if the rename occurred, false otherwise
- */
- abstract protected boolean ceph_rename(String old_path, String new_path);
-
- /*
- * Returns true if it the input path exists, false
- * if it does not or there is an unexpected failure.
- */
- abstract protected boolean ceph_exists(String path);
-
- /*
- * Get the block size for a given path.
- * Input:
- * String path: The path (relative or absolute) you want
- * the block size for.
- * Returns: block size if the path exists, otherwise a negative number
- * corresponding to the standard C++ error codes (which are positive).
- */
- abstract protected long ceph_getblocksize(String path);
-
- /*
- * Returns true if the given path is a directory, false otherwise.
- */
- abstract protected boolean ceph_isdirectory(String path);
-
- /*
- * Returns true if the given path is a file; false otherwise.
- */
- abstract protected boolean ceph_isfile(String path);
-
- /*
- * Get the contents of a given directory.
- * Inputs:
- * String path: The path (relative or absolute) to the directory.
- * Returns: A Java String[] of the contents of the directory, or
- * NULL if there is an error (ie, path is not a dir). This listing
- * will not contain . or .. entries.
- */
- abstract protected String[] ceph_getdir(String path);
-
- /*
- * Create the specified directory and any required intermediate ones with the
- * given mode.
- */
- abstract protected int ceph_mkdirs(String path, int mode);
-
- /*
- * Open a file to append. If the file does not exist, it will be created.
- * Opening a dir is possible but may have bad results.
- * Inputs:
- * String path: The path to open.
- * Returns: an int filehandle, or a number<0 if an error occurs.
- */
- abstract protected int ceph_open_for_append(String path);
-
- /*
- * Open a file for reading.
- * Opening a dir is possible but may have bad results.
- * Inputs:
- * String path: The path to open.
- * Returns: an int filehandle, or a number<0 if an error occurs.
- */
- abstract protected int ceph_open_for_read(String path);
-
- /*
- * Opens a file for overwriting; creates it if necessary.
- * Opening a dir is possible but may have bad results.
- * Inputs:
- * String path: The path to open.
- * int mode: The mode to open with.
- * Returns: an int filehandle, or a number<0 if an error occurs.
- */
- abstract protected int ceph_open_for_overwrite(String path, int mode);
-
- /*
- * Closes the given file. Returns 0 on success, or a negative
- * error code otherwise.
- */
- abstract protected int ceph_close(int filehandle);
-
- /*
- * Change the mode on a path.
- * Inputs:
- * String path: The path to change mode on.
- * int mode: The mode to apply.
- * Returns: true if the mode is properly applied, false if there
- * is any error.
- */
- abstract protected boolean ceph_setPermission(String path, int mode);
-
- /*
- * Closes the Ceph client. This should be called before shutting down
- * (multiple times is okay but redundant).
- */
- abstract protected boolean ceph_kill_client();
-
- /*
- * Get the statistics on a path returned in a custom format defined
- * in CephFileSystem.
- * Inputs:
- * String path: The path to stat.
- * Stat fill: The stat object to fill.
- * Returns: true if the stat is successful, false otherwise.
- */
- abstract protected boolean ceph_stat(String path, CephFileSystem.Stat fill);
-
- /*
- * Check how many times a file should be replicated. If it is,
- * degraded it may not actually be replicated this often.
- * Inputs:
- * int fh: a file descriptor
- * Returns: an int containing the number of times replicated.
- */
- abstract protected int ceph_replication(String path);
-
- /*
- * Find the IP address of the primary OSD for a given file and offset.
- * Inputs:
- * int fh: The filehandle for the file.
- * long offset: The offset to get the location of.
- * Returns: an array of String of the location as IP, or NULL if there is an error.
- */
- abstract protected String[] ceph_hosts(int fh, long offset);
-
- /*
- * Set the mtime and atime for a given path.
- * Inputs:
- * String path: The path to set the times for.
- * long mtime: The mtime to set, in millis since epoch (-1 to not set).
- * long atime: The atime to set, in millis since epoch (-1 to not set)
- * Returns: 0 if successful, an error code otherwise.
- */
- abstract protected int ceph_setTimes(String path, long mtime, long atime);
-
- /*
- * Get the current position in a file (as a long) of a given filehandle.
- * Returns: (long) current file position on success, or a
- * negative error code on failure.
- */
- abstract protected long ceph_getpos(int fh);
-
- /*
- * Write the given buffer contents to the given filehandle.
- * Inputs:
- * int fh: The filehandle to write to.
- * byte[] buffer: The buffer to write from
- * int buffer_offset: The position in the buffer to write from
- * int length: The number of (sequential) bytes to write.
- * Returns: int, on success the number of bytes written, on failure
- * a negative error code.
- */
- abstract protected int ceph_write(int fh, byte[] buffer, int buffer_offset, int length);
-
- /*
- * Reads into the given byte array from the current position.
- * Inputs:
- * int fh: the filehandle to read from
- * byte[] buffer: the byte array to read into
- * int buffer_offset: where in the buffer to start writing
- * int length: how much to read.
- * There'd better be enough space in the buffer to write all
- * the data from the given offset!
- * Returns: the number of bytes read on success (as an int),
- * or an error code otherwise. */
- abstract protected int ceph_read(int fh, byte[] buffer, int buffer_offset, int length);
-
- /*
- * Seeks to the given position in the given file.
- * Inputs:
- * int fh: The filehandle to seek in.
- * long pos: The position to seek to.
- * Returns: the new position (as a long) of the filehandle on success,
- * or a negative error code on failure. */
- abstract protected long ceph_seek_from_start(int fh, long pos);
-}
diff --git a/src/client/hadoop/ceph/CephFaker.java b/src/client/hadoop/ceph/CephFaker.java
deleted file mode 100644
index c598f536039..00000000000
--- a/src/client/hadoop/ceph/CephFaker.java
+++ /dev/null
@@ -1,483 +0,0 @@
-// -*- mode:Java; tab-width:2; c-basic-offset:2; indent-tabs-mode:t -*-
-
-/**
- *
- * Licensed under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- * implied. See the License for the specific language governing
- * permissions and limitations under the License.
- *
- *
- * This uses the local Filesystem but pretends to be communicating
- * with a Ceph deployment, for unit testing the CephFileSystem.
- */
-
-package org.apache.hadoop.fs.ceph;
-
-
-import java.net.URI;
-import java.util.Hashtable;
-import java.io.Closeable;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.BlockLocation;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.permission.FsPermission;
-
-
-class CephFaker extends CephFS {
- private static final Log LOG = LogFactory.getLog(CephFaker.class);
- FileSystem localFS;
- String localPrefix;
- int blockSize;
- Configuration conf;
- Hashtable<Integer, Object> files;
- Hashtable<Integer, String> filenames;
- int fileCount = 0;
- boolean initialized = false;
-
- public CephFaker(Configuration con, Log log) {
- conf = con;
- files = new Hashtable<Integer, Object>();
- filenames = new Hashtable<Integer, String>();
- }
-
- protected boolean ceph_initializeClient(String args, int block_size) {
- if (!initialized) {
- // let's remember the default block_size
- blockSize = block_size;
-
- /* for a real Ceph deployment, this starts up the client,
- * sets debugging levels, etc. We just need to get the
- * local FileSystem to use, and we'll ignore any
- * command-line arguments. */
- try {
- localFS = FileSystem.getLocal(conf);
- localFS.initialize(URI.create("file://localhost"), conf);
- localFS.setVerifyChecksum(false);
- String testDir = conf.get("hadoop.tmp.dir");
-
- localPrefix = localFS.getWorkingDirectory().toString();
- int testDirLoc = localPrefix.indexOf(testDir) - 1;
-
- if (-2 == testDirLoc) {
- testDirLoc = localPrefix.length();
- }
- localPrefix = localPrefix.substring(0, testDirLoc) + "/"
- + conf.get("hadoop.tmp.dir");
-
- localFS.setWorkingDirectory(
- new Path(localPrefix + "/user/" + System.getProperty("user.name")));
- // I don't know why, but the unit tests expect the default
- // working dir to be /user/username, so satisfy them!
- // debug("localPrefix is " + localPrefix, INFO);
- } catch (IOException e) {
- return false;
- }
- initialized = true;
- }
- return true;
- }
-
- protected String ceph_getcwd() {
- return sanitize_path(localFS.getWorkingDirectory().toString());
- }
-
- protected boolean ceph_setcwd(String path) {
- localFS.setWorkingDirectory(new Path(prepare_path(path)));
- return true;
- }
-
- // the caller is responsible for ensuring empty dirs
- protected boolean ceph_rmdir(String pth) {
- Path path = new Path(prepare_path(pth));
- boolean ret = false;
-
- try {
- if (localFS.listStatus(path).length <= 1) {
- ret = localFS.delete(path, true);
- }
- } catch (IOException e) {}
- return ret;
- }
-
- // this needs to work on (empty) directories too
- protected boolean ceph_unlink(String path) {
- path = prepare_path(path);
- boolean ret = false;
-
- if (ceph_isdirectory(path)) {
- ret = ceph_rmdir(path);
- } else {
- try {
- ret = localFS.delete(new Path(path), false);
- } catch (IOException e) {}
- }
- return ret;
- }
-
- protected boolean ceph_rename(String oldName, String newName) {
- oldName = prepare_path(oldName);
- newName = prepare_path(newName);
- try {
- Path parent = new Path(newName).getParent();
- Path newPath = new Path(newName);
-
- if (localFS.exists(parent) && !localFS.exists(newPath)) {
- return localFS.rename(new Path(oldName), newPath);
- }
- return false;
- } catch (IOException e) {
- return false;
- }
- }
-
- protected boolean ceph_exists(String path) {
- path = prepare_path(path);
- boolean ret = false;
-
- try {
- ret = localFS.exists(new Path(path));
- } catch (IOException e) {}
- return ret;
- }
-
- protected long ceph_getblocksize(String path) {
- path = prepare_path(path);
- try {
- FileStatus status = localFS.getFileStatus(new Path(path));
-
- return status.getBlockSize();
- } catch (FileNotFoundException e) {
- return -CephFS.ENOENT;
- } catch (IOException e) {
- return -1; // just fail generically
- }
- }
-
- protected boolean ceph_isdirectory(String path) {
- path = prepare_path(path);
- try {
- FileStatus status = localFS.getFileStatus(new Path(path));
-
- return status.isDir();
- } catch (IOException e) {
- return false;
- }
- }
-
- protected boolean ceph_isfile(String path) {
- path = prepare_path(path);
- boolean ret = false;
-
- try {
- FileStatus status = localFS.getFileStatus(new Path(path));
-
- ret = !status.isDir();
- } catch (Exception e) {}
- return ret;
- }
-
- protected String[] ceph_getdir(String path) {
- path = prepare_path(path);
- if (!ceph_isdirectory(path)) {
- return null;
- }
- try {
- FileStatus[] stats = localFS.listStatus(new Path(path));
- String[] names = new String[stats.length];
- String name;
-
- for (int i = 0; i < stats.length; ++i) {
- name = stats[i].getPath().toString();
- names[i] = name.substring(name.lastIndexOf(Path.SEPARATOR) + 1);
- }
- return names;
- } catch (IOException e) {}
- return null;
- }
-
- protected int ceph_mkdirs(String path, int mode) {
- path = prepare_path(path);
- // debug("ceph_mkdirs on " + path, INFO);
- try {
- if (localFS.mkdirs(new Path(path), new FsPermission((short) mode))) {
- return 0;
- }
- } catch (IOException e) {}
- if (ceph_isdirectory(path)) { // apparently it already existed
- return -EEXIST;
- } else if (ceph_isfile(path)) {
- return -ENOTDIR;
- }
- return -1;
- }
-
- /*
- * Unlike a real Ceph deployment, you can't do opens on a directory.
- * Since that has unpredictable behavior and you shouldn't do it anyway,
- * it's okay.
- */
- protected int ceph_open_for_append(String path) {
- path = prepare_path(path);
- FSDataOutputStream stream;
-
- try {
- stream = localFS.append(new Path(path));
- files.put(new Integer(fileCount), stream);
- filenames.put(new Integer(fileCount), path);
- return fileCount++;
- } catch (IOException e) {}
- return -1; // failure
- }
-
- protected int ceph_open_for_read(String path) {
- path = prepare_path(path);
- FSDataInputStream stream;
-
- try {
- stream = localFS.open(new Path(path));
- files.put(new Integer(fileCount), stream);
- filenames.put(new Integer(fileCount), path);
- LOG.info("ceph_open_for_read fh:" + fileCount + ", pathname:" + path);
- return fileCount++;
- } catch (IOException e) {}
- return -1; // failure
- }
-
- protected int ceph_open_for_overwrite(String path, int mode) {
- path = prepare_path(path);
- FSDataOutputStream stream;
-
- try {
- stream = localFS.create(new Path(path));
- files.put(new Integer(fileCount), stream);
- filenames.put(new Integer(fileCount), path);
- LOG.info("ceph_open_for_overwrite fh:" + fileCount + ", pathname:" + path);
- return fileCount++;
- } catch (IOException e) {}
- return -1; // failure
- }
-
- protected int ceph_close(int filehandle) {
- LOG.info("ceph_close(filehandle " + filehandle + ")");
- try {
- ((Closeable) files.get(new Integer(filehandle))).close();
- if (null == files.get(new Integer(filehandle))) {
- return -ENOENT; // this isn't quite the right error code,
- // but the important part is it's negative
- }
- return 0; // hurray, success
- } catch (NullPointerException ne) {
- LOG.warn("ceph_close caught NullPointerException!" + ne);
- } // err, how?
- catch (IOException ie) {
- LOG.warn("ceph_close caught IOException!" + ie);
- }
- return -1; // failure
- }
-
- protected boolean ceph_setPermission(String pth, int mode) {
- pth = prepare_path(pth);
- Path path = new Path(pth);
- boolean ret = false;
-
- try {
- localFS.setPermission(path, new FsPermission((short) mode));
- ret = true;
- } catch (IOException e) {}
- return ret;
- }
-
- // rather than try and match a Ceph deployment's behavior exactly,
- // just make bad things happen if they try and call methods after this
- protected boolean ceph_kill_client() {
- // debug("ceph_kill_client", INFO);
- localFS.setWorkingDirectory(new Path(localPrefix));
- // debug("working dir is now " + localFS.getWorkingDirectory(), INFO);
- try {
- localFS.close();
- } catch (Exception e) {}
- localFS = null;
- files = null;
- filenames = null;
- return true;
- }
-
- protected boolean ceph_stat(String pth, CephFileSystem.Stat fill) {
- pth = prepare_path(pth);
- Path path = new Path(pth);
- boolean ret = false;
-
- try {
- FileStatus status = localFS.getFileStatus(path);
-
- fill.size = status.getLen();
- fill.is_dir = status.isDir();
- fill.block_size = status.getBlockSize();
- fill.mod_time = status.getModificationTime();
- fill.access_time = status.getAccessTime();
- fill.mode = status.getPermission().toShort();
- ret = true;
- } catch (IOException e) {}
- return ret;
- }
-
- protected int ceph_replication(String path) {
- path = prepare_path(path);
- int ret = -1; // -1 for failure
-
- try {
- ret = localFS.getFileStatus(new Path(path)).getReplication();
- } catch (IOException e) {}
- return ret;
- }
-
- protected String[] ceph_hosts(int fh, long offset) {
- String[] ret = null;
-
- try {
- BlockLocation[] locs = localFS.getFileBlockLocations(
- localFS.getFileStatus(new Path(filenames.get(new Integer(fh)))),
- offset, 1);
-
- ret = locs[0].getNames();
- } catch (IOException e) {} catch (NullPointerException f) {}
- return ret;
- }
-
- protected int ceph_setTimes(String pth, long mtime, long atime) {
- pth = prepare_path(pth);
- Path path = new Path(pth);
- int ret = -1; // generic fail
-
- try {
- localFS.setTimes(path, mtime, atime);
- ret = 0;
- } catch (IOException e) {}
- return ret;
- }
-
- protected long ceph_getpos(int fh) {
- long ret = -1; // generic fail
-
- try {
- Object stream = files.get(new Integer(fh));
-
- if (stream instanceof FSDataInputStream) {
- ret = ((FSDataInputStream) stream).getPos();
- } else if (stream instanceof FSDataOutputStream) {
- ret = ((FSDataOutputStream) stream).getPos();
- }
- } catch (IOException e) {} catch (NullPointerException f) {}
- return ret;
- }
-
- protected int ceph_write(int fh, byte[] buffer,
- int buffer_offset, int length) {
- LOG.info(
- "ceph_write fh:" + fh + ", buffer_offset:" + buffer_offset + ", length:"
- + length);
- long ret = -1; // generic fail
-
- try {
- FSDataOutputStream os = (FSDataOutputStream) files.get(new Integer(fh));
-
- LOG.info("ceph_write got outputstream");
- long startPos = os.getPos();
-
- os.write(buffer, buffer_offset, length);
- ret = os.getPos() - startPos;
- } catch (IOException e) {
- LOG.warn("ceph_write caught IOException!");
- } catch (NullPointerException f) {
- LOG.warn("ceph_write caught NullPointerException!");
- }
- return (int) ret;
- }
-
- protected int ceph_read(int fh, byte[] buffer,
- int buffer_offset, int length) {
- long ret = -1; // generic fail
-
- try {
- FSDataInputStream is = (FSDataInputStream) files.get(new Integer(fh));
- long startPos = is.getPos();
-
- is.read(buffer, buffer_offset, length);
- ret = is.getPos() - startPos;
- } catch (IOException e) {} catch (NullPointerException f) {}
- return (int) ret;
- }
-
- protected long ceph_seek_from_start(int fh, long pos) {
- LOG.info("ceph_seek_from_start(fh " + fh + ", pos " + pos + ")");
- long ret = -1; // generic fail
-
- try {
- LOG.info("ceph_seek_from_start filename is " + filenames.get(new Integer(fh)));
- if (null == files.get(new Integer(fh))) {
- LOG.warn("ceph_seek_from_start: is is null!");
- }
- FSDataInputStream is = (FSDataInputStream) files.get(new Integer(fh));
-
- LOG.info("ceph_seek_from_start retrieved is!");
- is.seek(pos);
- ret = is.getPos();
- } catch (IOException e) {
- LOG.warn("ceph_seek_from_start caught IOException!");
- } catch (NullPointerException f) {
- LOG.warn("ceph_seek_from_start caught NullPointerException!");
- }
- return (int) ret;
- }
-
- /*
- * We need to remove the localFS file prefix before returning to Ceph
- */
- private String sanitize_path(String path) {
- // debug("sanitize_path(" + path + ")", INFO);
- /* if (path.startsWith("file:"))
- path = path.substring("file:".length()); */
- if (path.startsWith(localPrefix)) {
- path = path.substring(localPrefix.length());
- if (path.length() == 0) { // it was a root path
- path = "/";
- }
- }
- // debug("sanitize_path returning " + path, INFO);
- return path;
- }
-
- /*
- * If it's an absolute path we need to shove the
- * test dir onto the front as a prefix.
- */
- private String prepare_path(String path) {
- // debug("prepare_path(" + path + ")", INFO);
- if (path.startsWith("/")) {
- path = localPrefix + path;
- } else if (path.equals("..")) {
- if (ceph_getcwd().equals("/")) {
- path = ".";
- } // you can't go up past root!
- }
- // debug("prepare_path returning" + path, INFO);
- return path;
- }
-}
diff --git a/src/client/hadoop/ceph/CephFileSystem.java b/src/client/hadoop/ceph/CephFileSystem.java
deleted file mode 100644
index 95f22238b4d..00000000000
--- a/src/client/hadoop/ceph/CephFileSystem.java
+++ /dev/null
@@ -1,804 +0,0 @@
-// -*- mode:Java; tab-width:2; c-basic-offset:2; indent-tabs-mode:t -*-
-
-/**
- *
- * Licensed under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- * implied. See the License for the specific language governing
- * permissions and limitations under the License.
- *
- *
- * Implements the Hadoop FS interfaces to allow applications to store
- * files in Ceph.
- */
-package org.apache.hadoop.fs.ceph;
-
-
-import java.io.IOException;
-import java.io.FileNotFoundException;
-import java.io.OutputStream;
-import java.net.URI;
-import java.net.InetAddress;
-import java.util.EnumSet;
-import java.lang.Math;
-import java.util.ArrayList;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.BlockLocation;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FSInputStream;
-import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.FileUtil;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.permission.FsPermission;
-import org.apache.hadoop.util.Progressable;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.net.DNS;
-
-
-/**
- * <p>
- * A {@link FileSystem} backed by <a href="http://ceph.newdream.net">Ceph.</a>.
- * This will not start a Ceph instance; one must already be running.
- * </p>
- * Configuration of the CephFileSystem is handled via a few Hadoop
- * Configuration properties: <br>
- * fs.ceph.monAddr -- the ip address/port of the monitor to connect to. <br>
- * fs.ceph.libDir -- the directory that libcephfs and libhadoopceph are
- * located in. This assumes Hadoop is being run on a linux-style machine
- * with names like libcephfs.so.
- * fs.ceph.commandLine -- if you prefer you can fill in this property
- * just as you would when starting Ceph up from the command line. Specific
- * properties override any configuration specified here.
- * <p>
- * You can also enable debugging of the CephFileSystem and Ceph itself: <br>
- * fs.ceph.debug -- if 'true' will print out method enter/exit messages,
- * plus a little more.
- * fs.ceph.clientDebug/fs.ceph.messengerDebug -- will print out debugging
- * from the respective Ceph system of at least that importance.
- */
-public class CephFileSystem extends FileSystem {
- private static final Log LOG = LogFactory.getLog(CephFileSystem.class);
- private URI uri;
-
- private Path workingDir;
- private final Path root;
- private CephFS ceph = null;
-
- private static String CEPH_NAMESERVER;
- private static final String CEPH_NAMESERVER_KEY = "fs.ceph.nameserver";
- private static final String CEPH_NAMESERVER_DEFAULT = "localhost";
-
- /**
- * Create a new CephFileSystem.
- */
- public CephFileSystem() {
- root = new Path("/");
- }
-
- /**
- * Used for testing purposes, this constructor
- * sets the given CephFS instead of defaulting to a
- * CephTalker (with its assumed real Ceph instance to talk to).
- */
- public CephFileSystem(CephFS ceph_fs) {
- super();
- root = new Path("/");
- ceph = ceph_fs;
- }
-
- /**
- * Lets you get the URI of this CephFileSystem.
- * @return the URI.
- */
- public URI getUri() {
- LOG.debug("getUri:exit with return " + uri);
- return uri;
- }
-
- /**
- * Should be called after constructing a CephFileSystem but before calling
- * any other methods.
- * Starts up the connection to Ceph, reads in configuraton options, etc.
- * @param uri The URI for this filesystem.
- * @param conf The Hadoop Configuration to retrieve properties from.
- * @throws IOException if necessary properties are unset.
- */
- @Override
- public void initialize(URI uri, Configuration conf) throws IOException {
- super.initialize(uri, conf);
- setConf(conf);
- this.uri = URI.create(uri.getScheme() + "://" + uri.getAuthority());
- if (ceph == null) {
- ceph = new CephTalker(conf, LOG);
- }
-
- CEPH_NAMESERVER = conf.get(CEPH_NAMESERVER_KEY, CEPH_NAMESERVER_DEFAULT);
-
- // build up the arguments for Ceph
- String arguments = "CephFSInterface";
-
- arguments += conf.get("fs.ceph.commandLine", "");
- if (conf.get("fs.ceph.clientDebug") != null) {
- arguments += " --debug_client ";
- arguments += conf.get("fs.ceph.clientDebug");
- }
- if (conf.get("fs.ceph.messengerDebug") != null) {
- arguments += " --debug_ms ";
- arguments += conf.get("fs.ceph.messengerDebug");
- }
- if (conf.get("fs.ceph.monAddr") != null) {
- arguments += " -m ";
- arguments += conf.get("fs.ceph.monAddr");
- }
- arguments += " --client-readahead-max-periods="
- + conf.get("fs.ceph.readahead", "1");
- // make sure they gave us a ceph monitor address or conf file
- LOG.info("initialize:Ceph initialization arguments: " + arguments);
- if ((conf.get("fs.ceph.monAddr") == null) && (arguments.indexOf("-m") == -1)
- && (arguments.indexOf("-c") == -1)) {
- LOG.fatal("initialize:You need to specify a Ceph monitor address.");
- throw new IOException(
- "You must specify a Ceph monitor address or config file!");
- }
- // Initialize the client
- if (!ceph.ceph_initializeClient(arguments,
- conf.getInt("fs.ceph.blockSize", 1 << 26))) {
- LOG.fatal("initialize:Ceph initialization failed!");
- throw new IOException("Ceph initialization failed!");
- }
- LOG.info("initialize:Ceph initialized client. Setting cwd to /");
- ceph.ceph_setcwd("/");
- LOG.debug("initialize:exit");
-
- this.workingDir = getHomeDirectory();
- }
-
- /**
- * Close down the CephFileSystem. Runs the base-class close method
- * and then kills the Ceph client itself.
- */
- @Override
- public void close() throws IOException {
- LOG.debug("close:enter");
- super.close(); // this method does stuff, make sure it's run!
- LOG.trace("close: Calling ceph_kill_client from Java");
- ceph.ceph_kill_client();
- LOG.debug("close:exit");
- }
-
- /**
- * Get an FSDataOutputStream to append onto a file.
- * @param file The File you want to append onto
- * @param bufferSize Ceph does internal buffering but you can buffer in the Java code as well if you like.
- * @param progress The Progressable to report progress to.
- * Reporting is limited but exists.
- * @return An FSDataOutputStream that connects to the file on Ceph.
- * @throws IOException If the file cannot be found or appended to.
- */
- public FSDataOutputStream append(Path file, int bufferSize,
- Progressable progress) throws IOException {
- LOG.debug("append:enter with path " + file + " bufferSize " + bufferSize);
- Path abs_path = makeAbsolute(file);
-
- if (progress != null) {
- progress.progress();
- }
- LOG.trace("append: Entering ceph_open_for_append from Java");
- int fd = ceph.ceph_open_for_append(getCephPath(abs_path));
-
- LOG.trace("append: Returned to Java");
- if (progress != null) {
- progress.progress();
- }
- if (fd < 0) { // error in open
- throw new IOException(
- "append: Open for append failed on path \"" + abs_path.toString()
- + "\"");
- }
- CephOutputStream cephOStream = new CephOutputStream(getConf(), ceph, fd,
- bufferSize);
-
- LOG.debug("append:exit");
- return new FSDataOutputStream(cephOStream, statistics);
- }
-
- /**
- * Get the current working directory for the given file system
- * @return the directory Path
- */
- public Path getWorkingDirectory() {
- return workingDir;
- }
-
- /**
- * Set the current working directory for the given file system. All relative
- * paths will be resolved relative to it.
- *
- * @param dir The directory to change to.
- */
- @Override
- public void setWorkingDirectory(Path dir) {
- workingDir = makeAbsolute(dir);
- }
-
- /**
- * Return only the path component from a potentially fully qualified path.
- */
- private String getCephPath(Path path) {
- if (!path.isAbsolute()) {
- throw new IllegalArgumentException("Path must be absolute: " + path);
- }
- return path.toUri().getPath();
- }
-
- /**
- * Check if a path exists.
- * Overriden because it's moderately faster than the generic implementation.
- * @param path The file to check existence on.
- * @return true if the file exists, false otherwise.
- */
- @Override
- public boolean exists(Path path) throws IOException {
- LOG.debug("exists:enter with path " + path);
- boolean result;
- Path abs_path = makeAbsolute(path);
-
- if (abs_path.equals(root)) {
- result = true;
- } else {
- LOG.trace(
- "exists:Calling ceph_exists from Java on path " + abs_path.toString());
- result = ceph.ceph_exists(getCephPath(abs_path));
- LOG.trace("exists:Returned from ceph_exists to Java");
- }
- LOG.debug("exists:exit with value " + result);
- return result;
- }
-
- /**
- * Create a directory and any nonexistent parents. Any portion
- * of the directory tree can exist without error.
- * @param path The directory path to create
- * @param perms The permissions to apply to the created directories.
- * @return true if successful, false otherwise
- * @throws IOException if the path is a child of a file.
- */
- @Override
- public boolean mkdirs(Path path, FsPermission perms) throws IOException {
- LOG.debug("mkdirs:enter with path " + path);
- Path abs_path = makeAbsolute(path);
-
- LOG.trace("mkdirs:calling ceph_mkdirs from Java");
- int result = ceph.ceph_mkdirs(getCephPath(abs_path), (int) perms.toShort());
-
- if (result != 0) {
- LOG.warn(
- "mkdirs: make directory " + abs_path + "Failing with result " + result);
- if (-ceph.ENOTDIR == result) {
- throw new IOException("Parent path is not a directory");
- }
- return false;
- } else {
- LOG.debug("mkdirs:exiting succesfully");
- return true;
- }
- }
-
- /**
- * Check if a path is a file. This is moderately faster than the
- * generic implementation.
- * @param path The path to check.
- * @return true if the path is definitely a file, false otherwise.
- */
- @Override
- public boolean isFile(Path path) throws IOException {
- LOG.debug("isFile:enter with path " + path);
- Path abs_path = makeAbsolute(path);
- boolean result;
-
- if (abs_path.equals(root)) {
- result = false;
- } else {
- LOG.trace("isFile:entering ceph_isfile from Java");
- result = ceph.ceph_isfile(getCephPath(abs_path));
- }
- LOG.debug("isFile:exit with result " + result);
- return result;
- }
-
- /**
- * Get stat information on a file. This does not fill owner or group, as
- * Ceph's support for these is a bit different than HDFS'.
- * @param path The path to stat.
- * @return FileStatus object containing the stat information.
- * @throws FileNotFoundException if the path could not be resolved.
- */
- public FileStatus getFileStatus(Path path) throws IOException {
- LOG.debug("getFileStatus:enter with path " + path);
- Path abs_path = makeAbsolute(path);
- // sadly, Ceph doesn't really do uids/gids just yet, but
- // everything else is filled
- FileStatus status;
- Stat lstat = new Stat();
-
- LOG.trace("getFileStatus: calling ceph_stat from Java");
- if (ceph.ceph_stat(getCephPath(abs_path), lstat)) {
- status = new FileStatus(lstat.size, lstat.is_dir,
- ceph.ceph_replication(getCephPath(abs_path)), lstat.block_size,
- lstat.mod_time, lstat.access_time,
- new FsPermission((short) lstat.mode), System.getProperty("user.name"), null,
- path.makeQualified(this));
- } else { // fail out
- throw new FileNotFoundException(
- "org.apache.hadoop.fs.ceph.CephFileSystem: File " + path
- + " does not exist or could not be accessed");
- }
-
- LOG.debug("getFileStatus:exit");
- return status;
- }
-
- /**
- * Get the FileStatus for each listing in a directory.
- * @param path The directory to get listings from.
- * @return FileStatus[] containing one FileStatus for each directory listing;
- * null if path does not exist.
- */
- public FileStatus[] listStatus(Path path) throws IOException {
- LOG.debug("listStatus:enter with path " + path);
- Path abs_path = makeAbsolute(path);
- Path[] paths = listPaths(abs_path);
-
- if (paths != null) {
- FileStatus[] statuses = new FileStatus[paths.length];
-
- for (int i = 0; i < paths.length; ++i) {
- statuses[i] = getFileStatus(paths[i]);
- }
- LOG.debug("listStatus:exit");
- return statuses;
- }
-
- if (isFile(path)) {
- return new FileStatus[] { getFileStatus(path) };
- }
-
- return null;
- }
-
- @Override
- public void setPermission(Path p, FsPermission permission) throws IOException {
- LOG.debug(
- "setPermission:enter with path " + p + " and permissions " + permission);
- Path abs_path = makeAbsolute(p);
-
- LOG.trace("setPermission:calling ceph_setpermission from Java");
- ceph.ceph_setPermission(getCephPath(abs_path), permission.toShort());
- LOG.debug("setPermission:exit");
- }
-
- /**
- * Set access/modification times of a file.
- * @param p The path
- * @param mtime Set modification time in number of millis since Jan 1, 1970.
- * @param atime Set access time in number of millis since Jan 1, 1970.
- */
- @Override
- public void setTimes(Path p, long mtime, long atime) throws IOException {
- LOG.debug(
- "setTimes:enter with path " + p + " mtime:" + mtime + " atime:" + atime);
- Path abs_path = makeAbsolute(p);
-
- LOG.trace("setTimes:calling ceph_setTimes from Java");
- int r = ceph.ceph_setTimes(getCephPath(abs_path), mtime, atime);
-
- if (r < 0) {
- throw new IOException(
- "Failed to set times on path " + abs_path.toString() + " Error code: "
- + r);
- }
- LOG.debug("setTimes:exit");
- }
-
- /**
- * Create a new file and open an FSDataOutputStream that's connected to it.
- * @param path The file to create.
- * @param permission The permissions to apply to the file.
- * @param overwrite If true, overwrite any existing file with
- * this name; otherwise don't.
- * @param bufferSize Ceph does internal buffering, but you can buffer
- * in the Java code too if you like.
- * @param replication Ignored by Ceph. This can be
- * configured via Ceph configuration.
- * @param blockSize Ignored by Ceph. You can set client-wide block sizes
- * via the fs.ceph.blockSize param if you like.
- * @param progress A Progressable to report back to.
- * Reporting is limited but exists.
- * @return An FSDataOutputStream pointing to the created file.
- * @throws IOException if the path is an
- * existing directory, or the path exists but overwrite is false, or there is a
- * failure in attempting to open for append with Ceph.
- */
- public FSDataOutputStream create(Path path,
- FsPermission permission,
- boolean overwrite,
- int bufferSize,
- short replication,
- long blockSize,
- Progressable progress) throws IOException {
- LOG.debug("create:enter with path " + path);
- Path abs_path = makeAbsolute(path);
-
- if (progress != null) {
- progress.progress();
- }
- // We ignore replication since that's not configurable here, and
- // progress reporting is quite limited.
- // Required semantics: if the file exists, overwrite if 'overwrite' is set;
- // otherwise, throw an exception
-
- // Step 1: existence test
- boolean exists = exists(abs_path);
-
- if (exists) {
- if (getFileStatus(abs_path).isDir()) {
- throw new IOException(
- "create: Cannot overwrite existing directory \"" + path.toString()
- + "\" with a file");
- }
- if (!overwrite) {
- throw new IOException(
- "createRaw: Cannot open existing file \"" + abs_path.toString()
- + "\" for writing without overwrite flag");
- }
- }
-
- if (progress != null) {
- progress.progress();
- }
-
- // Step 2: create any nonexistent directories in the path
- if (!exists) {
- Path parent = abs_path.getParent();
-
- if (parent != null) { // if parent is root, we're done
- int r = ceph.ceph_mkdirs(getCephPath(parent), permission.toShort());
-
- if (!(r == 0 || r == -ceph.EEXIST)) {
- throw new IOException("Error creating parent directory; code: " + r);
- }
- }
- if (progress != null) {
- progress.progress();
- }
- }
- // Step 3: open the file
- LOG.trace("calling ceph_open_for_overwrite from Java");
- int fh = ceph.ceph_open_for_overwrite(getCephPath(abs_path),
- (int) permission.toShort());
-
- if (progress != null) {
- progress.progress();
- }
- LOG.trace("Returned from ceph_open_for_overwrite to Java with fh " + fh);
- if (fh < 0) {
- throw new IOException(
- "create: Open for overwrite failed on path \"" + path.toString()
- + "\"");
- }
-
- // Step 4: create the stream
- OutputStream cephOStream = new CephOutputStream(getConf(), ceph, fh,
- bufferSize);
-
- LOG.debug("create:exit");
- return new FSDataOutputStream(cephOStream, statistics);
- }
-
- /**
- * Open a Ceph file and attach the file handle to an FSDataInputStream.
- * @param path The file to open
- * @param bufferSize Ceph does internal buffering; but you can buffer in
- * the Java code too if you like.
- * @return FSDataInputStream reading from the given path.
- * @throws IOException if the path DNE or is a
- * directory, or there is an error getting data to set up the FSDataInputStream.
- */
- public FSDataInputStream open(Path path, int bufferSize) throws IOException {
- LOG.debug("open:enter with path " + path);
- Path abs_path = makeAbsolute(path);
-
- int fh = ceph.ceph_open_for_read(getCephPath(abs_path));
-
- if (fh < 0) { // uh-oh, something's bad!
- if (fh == -ceph.ENOENT) { // well that was a stupid open
- throw new IOException(
- "open: absolute path \"" + abs_path.toString()
- + "\" does not exist");
- } else { // hrm...the file exists but we can't open it :(
- throw new IOException("open: Failed to open file " + abs_path.toString());
- }
- }
-
- if (getFileStatus(abs_path).isDir()) { // yes, it is possible to open Ceph directories
- // but that doesn't mean you should in Hadoop!
- ceph.ceph_close(fh);
- throw new IOException(
- "open: absolute path \"" + abs_path.toString() + "\" is a directory!");
- }
- Stat lstat = new Stat();
-
- LOG.trace("open:calling ceph_stat from Java");
- ceph.ceph_stat(getCephPath(abs_path), lstat);
- LOG.trace("open:returned to Java");
- long size = lstat.size;
-
- if (size < 0) {
- throw new IOException(
- "Failed to get file size for file " + abs_path.toString()
- + " but succeeded in opening file. Something bizarre is going on.");
- }
- FSInputStream cephIStream = new CephInputStream(getConf(), ceph, fh, size,
- bufferSize);
-
- LOG.debug("open:exit");
- return new FSDataInputStream(cephIStream);
- }
-
- /**
- * Rename a file or directory.
- * @param src The current path of the file/directory
- * @param dst The new name for the path.
- * @return true if the rename succeeded, false otherwise.
- */
- @Override
- public boolean rename(Path src, Path dst) throws IOException {
- LOG.debug("rename:enter with src:" + src + " and dest:" + dst);
- Path abs_src = makeAbsolute(src);
- Path abs_dst = makeAbsolute(dst);
-
- LOG.trace("calling ceph_rename from Java");
- boolean result = ceph.ceph_rename(getCephPath(abs_src), getCephPath(abs_dst));
-
- if (!result) {
- boolean isDir = false;
- try {
- isDir = getFileStatus(abs_dst).isDir();
- } catch (FileNotFoundException e) {}
- if (isDir) { // move the srcdir into destdir
- LOG.debug("ceph_rename failed but dst is a directory!");
- Path new_dst = new Path(abs_dst, abs_src.getName());
-
- result = rename(abs_src, new_dst);
- LOG.debug(
- "attempt to move " + abs_src.toString() + " to "
- + new_dst.toString() + "has result:" + result);
- }
- }
- LOG.debug("rename:exit with result: " + result);
- return result;
- }
-
- /*
- * Attempt to convert an IP into its hostname
- */
- private String[] ips2Hosts(String[] ips) {
- ArrayList<String> hosts = new ArrayList<String>();
- for (String ip : ips) {
- try {
- String host = DNS.reverseDns(InetAddress.getByName(ip), CEPH_NAMESERVER);
- if (host.charAt(host.length()-1) == '.') {
- host = host.substring(0, host.length()-1);
- }
- hosts.add(host); /* append */
- } catch (Exception e) {
- LOG.error("reverseDns ["+ip+"] failed: "+ e);
- }
- }
- return hosts.toArray(new String[hosts.size()]);
- }
-
- /**
- * Get a BlockLocation object for each block in a file.
- *
- * Note that this doesn't include port numbers in the name field as
- * Ceph handles slow/down servers internally. This data should be used
- * only for selecting which servers to run which jobs on.
- *
- * @param file A FileStatus object corresponding to the file you want locations for.
- * @param start The offset of the first part of the file you are interested in.
- * @param len The amount of the file past the offset you are interested in.
- * @return A BlockLocation[] where each object corresponds to a block within
- * the given range.
- */
- @Override
- public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len) throws IOException {
- Path abs_path = makeAbsolute(file.getPath());
-
- int fh = ceph.ceph_open_for_read(getCephPath(abs_path));
- if (fh < 0) {
- LOG.error("getFileBlockLocations:got error " + fh + ", exiting and returning null!");
- return null;
- }
-
- long blockSize = ceph.ceph_getblocksize(getCephPath(abs_path));
- BlockLocation[] locations = new BlockLocation[(int) Math.ceil(len / (float) blockSize)];
-
- for (int i = 0; i < locations.length; ++i) {
- long offset = start + i * blockSize;
- long blockStart = start + i * blockSize - (start % blockSize);
- String ips[] = ceph.ceph_hosts(fh, offset);
- String hosts[] = ips2Hosts(ips);
- locations[i] = new BlockLocation(null, hosts, blockStart, blockSize);
- LOG.debug("getFileBlockLocations: location[" + i + "]: " + locations[i]);
- }
-
- ceph.ceph_close(fh);
- return locations;
- }
-
- @Deprecated
- public boolean delete(Path path) throws IOException {
- return delete(path, false);
- }
-
- /**
- * Delete the given path, and optionally its children.
- * @param path the path to delete.
- * @param recursive If the path is a non-empty directory and this is false,
- * delete will throw an IOException. If path is a file this is ignored.
- * @return true if the delete succeeded, false otherwise (including if
- * path doesn't exist).
- * @throws IOException if you attempt to non-recursively delete a directory,
- * or you attempt to delete the root directory.
- */
- public boolean delete(Path path, boolean recursive) throws IOException {
- LOG.debug("delete:enter with path " + path + " and recursive=" + recursive);
- Path abs_path = makeAbsolute(path);
-
- // sanity check
- if (abs_path.equals(root)) {
- throw new IOException("Error: deleting the root directory is a Bad Idea.");
- }
- if (!exists(abs_path)) {
- return false;
- }
-
- // if the path is a file, try to delete it.
- if (isFile(abs_path)) {
- LOG.trace("delete:calling ceph_unlink from Java with path " + abs_path);
- boolean result = ceph.ceph_unlink(getCephPath(abs_path));
-
- if (!result) {
- LOG.error(
- "delete: failed to delete file \"" + abs_path.toString() + "\".");
- }
- LOG.debug("delete:exit with success=" + result);
- return result;
- }
-
- /* The path is a directory, so recursively try to delete its contents,
- and then delete the directory. */
- // get the entries; listPaths will remove . and .. for us
- Path[] contents = listPaths(abs_path);
-
- if (contents == null) {
- LOG.error(
- "delete: Failed to read contents of directory \""
- + abs_path.toString() + "\" while trying to delete it, BAILING");
- return false;
- }
- if (!recursive && contents.length > 0) {
- throw new IOException("Directories must be deleted recursively!");
- }
- // delete the entries
- LOG.debug("delete: recursively calling delete on contents of " + abs_path);
- for (Path p : contents) {
- if (!delete(p, true)) {
- LOG.error(
- "delete: Failed to delete file \"" + p.toString()
- + "\" while recursively deleting \"" + abs_path.toString()
- + "\", BAILING");
- return false;
- }
- }
- // if we've come this far it's a now-empty directory, so delete it!
- boolean result = ceph.ceph_rmdir(getCephPath(abs_path));
-
- if (!result) {
- LOG.error(
- "delete: failed to delete \"" + abs_path.toString() + "\", BAILING");
- }
- LOG.debug("delete:exit");
- return result;
- }
-
- /**
- * Returns the default replication value of 1. This may
- * NOT be the actual value, as replication is controlled
- * by a separate Ceph configuration.
- */
- @Override
- public short getDefaultReplication() {
- return 1;
- }
-
- /**
- * Get the default block size.
- * @return the default block size, in bytes, as a long.
- */
- @Override
- public long getDefaultBlockSize() {
- return getConf().getInt("fs.ceph.blockSize", 1 << 26);
- }
-
- /**
- * Adds the working directory to path if path is not already
- * an absolute path. The URI scheme is not removed here. It
- * is removed only when users (e.g. ceph native calls) need
- * the path-only portion.
- */
- private Path makeAbsolute(Path path) {
- if (path.isAbsolute()) {
- return path;
- }
- return new Path(workingDir, path);
- }
-
- private Path[] listPaths(Path path) throws IOException {
- LOG.debug("listPaths:enter with path " + path);
- String dirlist[];
-
- Path abs_path = makeAbsolute(path);
-
- // If it's a directory, get the listing. Otherwise, complain and give up.
- LOG.debug("calling ceph_getdir from Java with path " + abs_path);
- dirlist = ceph.ceph_getdir(getCephPath(abs_path));
- LOG.debug("returning from ceph_getdir to Java");
-
- if (dirlist == null) {
- return null;
- }
-
- // convert the strings to Paths
- Path[] paths = new Path[dirlist.length];
-
- for (int i = 0; i < dirlist.length; ++i) {
- LOG.trace(
- "Raw enumeration of paths in \"" + abs_path.toString() + "\": \""
- + dirlist[i] + "\"");
- // convert each listing to an absolute path
- Path raw_path = new Path(dirlist[i]);
-
- if (raw_path.isAbsolute()) {
- paths[i] = raw_path;
- } else {
- paths[i] = new Path(abs_path, raw_path);
- }
- }
- LOG.debug("listPaths:exit");
- return paths;
- }
-
- static class Stat {
- public long size;
- public boolean is_dir;
- public long block_size;
- public long mod_time;
- public long access_time;
- public int mode;
-
- public Stat() {}
- }
-}
diff --git a/src/client/hadoop/ceph/CephInputStream.java b/src/client/hadoop/ceph/CephInputStream.java
deleted file mode 100644
index d9668d031ba..00000000000
--- a/src/client/hadoop/ceph/CephInputStream.java
+++ /dev/null
@@ -1,254 +0,0 @@
-// -*- mode:Java; tab-width:2; c-basic-offset:2; indent-tabs-mode:t -*-
-
-/**
- *
- * Licensed under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- * implied. See the License for the specific language governing
- * permissions and limitations under the License.
- *
- *
- * Implements the Hadoop FS interfaces to allow applications to store
- * files in Ceph.
- */
-package org.apache.hadoop.fs.ceph;
-
-
-import java.io.IOException;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSInputStream;
-
-
-/**
- * <p>
- * An {@link FSInputStream} for a CephFileSystem and corresponding
- * Ceph instance.
- */
-public class CephInputStream extends FSInputStream {
- private static final Log LOG = LogFactory.getLog(CephInputStream.class);
- private boolean closed;
-
- private int fileHandle;
-
- private long fileLength;
-
- private CephFS ceph;
-
- private byte[] buffer;
- private int bufPos = 0;
- private int bufValid = 0;
- private long cephPos = 0;
-
- /**
- * Create a new CephInputStream.
- * @param conf The system configuration. Unused.
- * @param fh The filehandle provided by Ceph to reference.
- * @param flength The current length of the file. If the length changes
- * you will need to close and re-open it to access the new data.
- */
- public CephInputStream(Configuration conf, CephFS cephfs,
- int fh, long flength, int bufferSize) {
- // Whoever's calling the constructor is responsible for doing the actual ceph_open
- // call and providing the file handle.
- fileLength = flength;
- fileHandle = fh;
- closed = false;
- ceph = cephfs;
- buffer = new byte[bufferSize];
- LOG.debug(
- "CephInputStream constructor: initializing stream with fh " + fh
- + " and file length " + flength);
-
- }
-
- /** Ceph likes things to be closed before it shuts down,
- * so closing the IOStream stuff voluntarily in a finalizer is good
- */
- protected void finalize() throws Throwable {
- try {
- if (!closed) {
- close();
- }
- } finally {
- super.finalize();
- }
- }
-
- private synchronized boolean fillBuffer() throws IOException {
- bufValid = ceph.ceph_read(fileHandle, buffer, 0, buffer.length);
- bufPos = 0;
- if (bufValid < 0) {
- int err = bufValid;
-
- bufValid = 0;
- // attempt to reset to old position. If it fails, too bad.
- ceph.ceph_seek_from_start(fileHandle, cephPos);
- throw new IOException("Failed to fill read buffer! Error code:" + err);
- }
- cephPos += bufValid;
- return (bufValid != 0);
- }
-
- /*
- * Get the current position of the stream.
- */
- public synchronized long getPos() throws IOException {
- return cephPos - bufValid + bufPos;
- }
-
- /**
- * Find the number of bytes remaining in the file.
- */
- @Override
- public synchronized int available() throws IOException {
- return (int) (fileLength - getPos());
- }
-
- public synchronized void seek(long targetPos) throws IOException {
- LOG.trace(
- "CephInputStream.seek: Seeking to position " + targetPos + " on fd "
- + fileHandle);
- if (targetPos > fileLength) {
- throw new IOException(
- "CephInputStream.seek: failed seek to position " + targetPos
- + " on fd " + fileHandle + ": Cannot seek after EOF " + fileLength);
- }
- long oldPos = cephPos;
-
- cephPos = ceph.ceph_seek_from_start(fileHandle, targetPos);
- bufValid = 0;
- bufPos = 0;
- if (cephPos < 0) {
- cephPos = oldPos;
- throw new IOException("Ceph failed to seek to new position!");
- }
- }
-
- /**
- * Failovers are handled by the Ceph code at a very low level;
- * if there are issues that can be solved by changing sources
- * they'll be dealt with before anybody even tries to call this method!
- * @return false.
- */
- public synchronized boolean seekToNewSource(long targetPos) {
- return false;
- }
-
- /**
- * Read a byte from the file.
- * @return the next byte.
- */
- @Override
- public synchronized int read() throws IOException {
- LOG.trace(
- "CephInputStream.read: Reading a single byte from fd " + fileHandle
- + " by calling general read function");
-
- byte result[] = new byte[1];
-
- if (getPos() >= fileLength) {
- return -1;
- }
- if (-1 == read(result, 0, 1)) {
- return -1;
- }
- if (result[0] < 0) {
- return 256 + (int) result[0];
- } else {
- return result[0];
- }
- }
-
- /**
- * Read a specified number of bytes from the file into a byte[].
- * @param buf the byte array to read into.
- * @param off the offset to start at in the file
- * @param len the number of bytes to read
- * @return 0 if successful, otherwise an error code.
- * @throws IOException on bad input.
- */
- @Override
- public synchronized int read(byte buf[], int off, int len)
- throws IOException {
- LOG.trace(
- "CephInputStream.read: Reading " + len + " bytes from fd " + fileHandle);
-
- if (closed) {
- throw new IOException(
- "CephInputStream.read: cannot read " + len + " bytes from fd "
- + fileHandle + ": stream closed");
- }
-
- // ensure we're not past the end of the file
- if (getPos() >= fileLength) {
- LOG.debug(
- "CephInputStream.read: cannot read " + len + " bytes from fd "
- + fileHandle + ": current position is " + getPos()
- + " and file length is " + fileLength);
-
- return -1;
- }
-
- int totalRead = 0;
- int initialLen = len;
- int read;
-
- do {
- read = Math.min(len, bufValid - bufPos);
- try {
- System.arraycopy(buffer, bufPos, buf, off, read);
- } catch (IndexOutOfBoundsException ie) {
- throw new IOException(
- "CephInputStream.read: Indices out of bounds:" + "read length is "
- + len + ", buffer offset is " + off + ", and buffer size is "
- + buf.length);
- } catch (ArrayStoreException ae) {
- throw new IOException(
- "Uh-oh, CephInputStream failed to do an array"
- + "copy due to type mismatch...");
- } catch (NullPointerException ne) {
- throw new IOException(
- "CephInputStream.read: cannot read " + len + "bytes from fd:"
- + fileHandle + ": buf is null");
- }
- bufPos += read;
- len -= read;
- off += read;
- totalRead += read;
- } while (len > 0 && fillBuffer());
-
- LOG.trace(
- "CephInputStream.read: Reading " + initialLen + " bytes from fd "
- + fileHandle + ": succeeded in reading " + totalRead + " bytes");
- return totalRead;
- }
-
- /**
- * Close the CephInputStream and release the associated filehandle.
- */
- @Override
- public void close() throws IOException {
- LOG.trace("CephOutputStream.close:enter");
- if (!closed) {
- int result = ceph.ceph_close(fileHandle);
-
- closed = true;
- if (result != 0) {
- throw new IOException(
- "Close somehow failed!"
- + "Don't try and use this stream again, though");
- }
- LOG.trace("CephOutputStream.close:exit");
- }
- }
-}
diff --git a/src/client/hadoop/ceph/CephOutputStream.java b/src/client/hadoop/ceph/CephOutputStream.java
deleted file mode 100644
index 4c50f88467d..00000000000
--- a/src/client/hadoop/ceph/CephOutputStream.java
+++ /dev/null
@@ -1,219 +0,0 @@
-// -*- mode:Java; tab-width:2; c-basic-offset:2; indent-tabs-mode:t -*-
-
-/**
- *
- * Licensed under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- * implied. See the License for the specific language governing
- * permissions and limitations under the License.
- *
- *
- * Implements the Hadoop FS interfaces to allow applications to store
- * files in Ceph.
- */
-
-package org.apache.hadoop.fs.ceph;
-
-
-import java.io.IOException;
-import java.io.OutputStream;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.util.Progressable;
-
-
-/**
- * <p>
- * An {@link OutputStream} for a CephFileSystem and corresponding
- * Ceph instance.
- */
-public class CephOutputStream extends OutputStream {
- private static final Log LOG = LogFactory.getLog(CephOutputStream.class);
- private boolean closed;
-
- private CephFS ceph;
-
- private int fileHandle;
-
- private byte[] buffer;
- private int bufUsed = 0;
-
- /**
- * Construct the CephOutputStream.
- * @param conf The FileSystem configuration.
- * @param fh The Ceph filehandle to connect to.
- */
- public CephOutputStream(Configuration conf, CephFS cephfs,
- int fh, int bufferSize) {
- ceph = cephfs;
- fileHandle = fh;
- closed = false;
- buffer = new byte[bufferSize];
- }
-
- /** Ceph likes things to be closed before it shuts down,
- *so closing the IOStream stuff voluntarily is good
- */
- protected void finalize() throws Throwable {
- try {
- if (!closed) {
- close();
- }
- } finally {
- super.finalize();
- }
- }
-
- /**
- * Get the current position in the file.
- * @return The file offset in bytes.
- */
- public long getPos() throws IOException {
- return ceph.ceph_getpos(fileHandle);
- }
-
- /**
- * Write a byte.
- * @param b The byte to write.
- * @throws IOException If you have closed the CephOutputStream or the
- * write fails.
- */
- @Override
- public synchronized void write(int b) throws IOException {
- LOG.trace(
- "CephOutputStream.write: writing a single byte to fd " + fileHandle);
-
- if (closed) {
- throw new IOException(
- "CephOutputStream.write: cannot write " + "a byte to fd " + fileHandle
- + ": stream closed");
- }
- // Stick the byte in a buffer and write it
- byte buf[] = new byte[1];
-
- buf[0] = (byte) b;
- write(buf, 0, 1);
- return;
- }
-
- /**
- * Write a byte buffer into the Ceph file.
- * @param buf the byte array to write from
- * @param off the position in the file to start writing at.
- * @param len The number of bytes to actually write.
- * @throws IOException if you have closed the CephOutputStream, or
- * if buf is null or off + len > buf.length, or
- * if the write fails due to a Ceph error.
- */
- @Override
- public synchronized void write(byte buf[], int off, int len) throws IOException {
- LOG.trace(
- "CephOutputStream.write: writing " + len + " bytes to fd " + fileHandle);
- // make sure stream is open
- if (closed) {
- throw new IOException(
- "CephOutputStream.write: cannot write " + len + "bytes to fd "
- + fileHandle + ": stream closed");
- }
-
- int result;
- int write;
-
- while (len > 0) {
- write = Math.min(len, buffer.length - bufUsed);
- try {
- System.arraycopy(buf, off, buffer, bufUsed, write);
- } catch (IndexOutOfBoundsException ie) {
- throw new IOException(
- "CephOutputStream.write: Indices out of bounds: "
- + "write length is " + len + ", buffer offset is " + off
- + ", and buffer size is " + buf.length);
- } catch (ArrayStoreException ae) {
- throw new IOException(
- "Uh-oh, CephOutputStream failed to do an array"
- + " copy due to type mismatch...");
- } catch (NullPointerException ne) {
- throw new IOException(
- "CephOutputStream.write: cannot write " + len + "bytes to fd "
- + fileHandle + ": buffer is null");
- }
- bufUsed += write;
- len -= write;
- off += write;
- if (bufUsed == buffer.length) {
- result = ceph.ceph_write(fileHandle, buffer, 0, bufUsed);
- if (result < 0) {
- throw new IOException(
- "CephOutputStream.write: Buffered write of " + bufUsed
- + " bytes failed!");
- }
- if (result != bufUsed) {
- throw new IOException(
- "CephOutputStream.write: Wrote only " + result + " bytes of "
- + bufUsed + " in buffer! Data may be lost or written"
- + " twice to Ceph!");
- }
- bufUsed = 0;
- }
-
- }
- return;
- }
-
- /**
- * Flush the buffered data.
- * @throws IOException if you've closed the stream or the write fails.
- */
- @Override
- public synchronized void flush() throws IOException {
- if (!closed) {
- if (bufUsed == 0) {
- return;
- }
- int result = ceph.ceph_write(fileHandle, buffer, 0, bufUsed);
-
- if (result < 0) {
- throw new IOException(
- "CephOutputStream.write: Write of " + bufUsed + "bytes to fd "
- + fileHandle + " failed");
- }
- if (result != bufUsed) {
- throw new IOException(
- "CephOutputStream.write: Write of " + bufUsed + "bytes to fd "
- + fileHandle + "was incomplete: only " + result + " of " + bufUsed
- + " bytes were written.");
- }
- bufUsed = 0;
- return;
- }
- }
-
- /**
- * Close the CephOutputStream.
- * @throws IOException if Ceph somehow returns an error. In current code it can't.
- */
- @Override
- public synchronized void close() throws IOException {
- LOG.trace("CephOutputStream.close:enter");
- if (!closed) {
- flush();
- int result = ceph.ceph_close(fileHandle);
-
- if (result != 0) {
- throw new IOException("Close failed!");
- }
-
- closed = true;
- LOG.trace("CephOutputStream.close:exit");
- }
- }
-}
diff --git a/src/client/hadoop/ceph/CephTalker.java b/src/client/hadoop/ceph/CephTalker.java
deleted file mode 100644
index 569652fdd0b..00000000000
--- a/src/client/hadoop/ceph/CephTalker.java
+++ /dev/null
@@ -1,91 +0,0 @@
-// -*- mode:Java; tab-width:2; c-basic-offset:2; indent-tabs-mode:t -*-
-
-/**
- *
- * Licensed under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- * implied. See the License for the specific language governing
- * permissions and limitations under the License.
- *
- *
- * Wraps a number of native function calls to communicate with the Ceph
- * filesystem.
- */
-package org.apache.hadoop.fs.ceph;
-
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.commons.logging.Log;
-
-
-class CephTalker extends CephFS {
- // JNI doesn't give us any way to store pointers, so use a long.
- // Here we're assuming pointers aren't longer than 8 bytes.
- long cluster;
-
- // we write a constructor so we can load the libraries
- public CephTalker(Configuration conf, Log log) {
- System.load(conf.get("fs.ceph.libDir") + "/libcephfs.so");
- System.load(conf.get("fs.ceph.libDir") + "/libhadoopcephfs.so");
- cluster = 0;
- }
-
- protected native boolean ceph_initializeClient(String arguments, int block_size);
-
- protected native String ceph_getcwd();
-
- protected native boolean ceph_setcwd(String path);
-
- protected native boolean ceph_rmdir(String path);
-
- protected native boolean ceph_unlink(String path);
-
- protected native boolean ceph_rename(String old_path, String new_path);
-
- protected native boolean ceph_exists(String path);
-
- protected native long ceph_getblocksize(String path);
-
- protected native boolean ceph_isdirectory(String path);
-
- protected native boolean ceph_isfile(String path);
-
- protected native String[] ceph_getdir(String path);
-
- protected native int ceph_mkdirs(String path, int mode);
-
- protected native int ceph_open_for_append(String path);
-
- protected native int ceph_open_for_read(String path);
-
- protected native int ceph_open_for_overwrite(String path, int mode);
-
- protected native int ceph_close(int filehandle);
-
- protected native boolean ceph_setPermission(String path, int mode);
-
- protected native boolean ceph_kill_client();
-
- protected native boolean ceph_stat(String path, CephFileSystem.Stat fill);
-
- protected native int ceph_replication(String Path);
-
- protected native String[] ceph_hosts(int fh, long offset);
-
- protected native int ceph_setTimes(String path, long mtime, long atime);
-
- protected native long ceph_getpos(int fh);
-
- protected native int ceph_write(int fh, byte[] buffer, int buffer_offset, int length);
-
- protected native int ceph_read(int fh, byte[] buffer, int buffer_offset, int length);
-
- protected native long ceph_seek_from_start(int fh, long pos);
-}
diff --git a/src/client/hadoop/ceph/LICENSE b/src/client/hadoop/ceph/LICENSE
deleted file mode 100644
index 7a0decda573..00000000000
--- a/src/client/hadoop/ceph/LICENSE
+++ /dev/null
@@ -1,4 +0,0 @@
-Unlike the rest of the code in this repository, this
-directory (src/client/hadoop) is licensed under the Apache License 2.0. This
-is for the obvious reason that we want to integrate it into the Apache Hadoop
-project. \ No newline at end of file
diff --git a/src/client/hadoop/ceph/TestCeph.java b/src/client/hadoop/ceph/TestCeph.java
deleted file mode 100644
index e46b0eed3a1..00000000000
--- a/src/client/hadoop/ceph/TestCeph.java
+++ /dev/null
@@ -1,45 +0,0 @@
-// -*- mode:Java; tab-width:2; c-basic-offset:2; indent-tabs-mode:t -*-
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * Unit tests for the CephFileSystem API implementation.
- */
-
-package org.apache.hadoop.fs.ceph;
-
-
-import java.io.IOException;
-import java.net.URI;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystemContractBaseTest;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-
-
-public class TestCeph extends FileSystemContractBaseTest {
-
- @Override
- protected void setUp() throws IOException {
- Configuration conf = new Configuration();
- CephFaker cephfaker = new CephFaker(conf, FileSystem.LOG);
- CephFileSystem cephfs = new CephFileSystem(cephfaker);
-
- cephfs.initialize(URI.create("ceph://null"), conf);
- fs = cephfs;
- }
-}
diff --git a/src/client/hadoop/ceph/package.html b/src/client/hadoop/ceph/package.html
deleted file mode 100644
index 8167b1dde92..00000000000
--- a/src/client/hadoop/ceph/package.html
+++ /dev/null
@@ -1,101 +0,0 @@
-<html>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<head></head>
-<body>
-<h1>A client for the Ceph filesystem</h1>
-
-<h3>Introduction</h3>
-
-This page describes how to use <a href="http://ceph.newdream.net">Ceph</a>
-as a backing store with Hadoop. This page assumes that you have downloaded
-the Ceph software and installed necessary binaries as outlined in the Ceph
-documentation.
-
-<h3>Steps</h3>
-<ul>
- <li>In the Hadoop conf directory edit core-site.xml,
- adding the following (with appropriate substitutions). Note that
- different nodes can connect to different monitors in the same cluster
- without issue (the Ceph client will automatically redirect as necessary).
-<pre>
-&lt;property&gt;
- &lt;name&gt;fs.default.name&lt;/name&gt;
- &lt;value&gt;ceph://null&lt;/value&gt;
-&lt;/property&gt;
-
-&lt;property&gt;
- &lt;name&gt;fs.ceph.monAddr&lt;/name&gt;
- &lt;value&gt;&lt;serverIP:port&gt;&lt;/value&gt;
- &lt;description&gt;The location of the Ceph monitor to connect to.
- This should be an IP address, not a domain-based web address.&lt;/description&gt;
-&lt;/property&gt;
-
-&lt;property&gt;
- &lt;name&gt;fs.ceph.libDir&lt;/name&gt;
- &lt;value&gt;/usr/local/lib&lt;/value&gt;
- &lt;description&gt;The folder holding libcephfs and libhadoopceph&lt;/description&gt;
- &lt;/property&gt;
-</pre>
- <li>There are also a number of optional Ceph configuration options.
-<pre>
-&lt;property&gt;
- &lt;name&gt;fs.ceph.blockSize&lt;/name&gt;
- &lt;value&gt;67108864&lt;/value&gt;
- &lt;description&gt;Defaulting to 64MB, this is the size (in bytes) you want Ceph to use in striping data internally and presenting it to Hadoop.&lt;/description&gt;
-&lt;/property&gt;
-
-&lt;property&gt;
- &lt;name&gt;fs.ceph.debug&lt;/name&gt;
- &lt;value&gt;true&lt;/value&gt;
- &lt;description&gt;If true, the Java-based code will print debugging information to standard error. This is useful if attempting to debug a Ceph issue as it puts both outputs in the same place.&lt;/description&gt;
-&lt;/property&gt;
-
-&lt;property&gt;
- &lt;name&gt;fs.ceph.clientDebug&lt;/name&gt;
- &lt;value&gt;1&lt;/value&gt;
- &lt;description&gt;If non-zero, the Ceph client will print debugging information to standard error (a higher number=more debugging).&lt;/description&gt;
-&lt;/property&gt;
-
-&lt;property&gt;
- &lt;name&gt;fs.ceph.messengerDebug&lt;/name&gt;
- &lt;value&gt;1&lt;/value&gt;
- &lt;description&gt;If non-zero, the Ceph messenger will print debugging information to standard error(a higher number=more debugging)&lt;/description&gt;
-&lt;/property&gt;
-
-&lt;property&gt;
- &lt;name&gt;fs.ceph.readahead&lt;/name&gt;
- &lt;value&gt;1&lt;/value&gt;
- &lt;description&gt;Sets the number of object periods to read ahead in prefetching. This should probably be left at the default of 1.&lt;/description&gt;
-&lt;/property&gt;
-
-&lt;property&gt;
- &lt;name&gt;fs.ceph.commandLine&lt;/name&gt;
- &lt;value&gt;a string&lt;/value&gt;
- &lt;description&gt;If you prefer, you may enter any of Ceph's command-line configuration here and it will get passed to the C client. Note that any filled-in configuration options will override what you put here. <br>
-By default, Ceph performs writes across the network rather than locally. To force local writes, add "set_local_pg" in this property.&lt;/description&gt;
-&lt;/property&gt;
-</pre>
-
- <li>Start up your Ceph instance according to the Ceph documentation.</li>
- <li>Do not use the bin/start-all.sh commands, as they will attempt to start
- up an hdfs instance. Just start whatever systems you need and they will
- automatically make use of the Ceph filesystem once configured as above.</li>
-</body>
-</html>
diff --git a/src/client/hadoop/org_apache_hadoop_fs_ceph_CephFS.h b/src/client/hadoop/org_apache_hadoop_fs_ceph_CephFS.h
deleted file mode 100644
index 0c07fd56e37..00000000000
--- a/src/client/hadoop/org_apache_hadoop_fs_ceph_CephFS.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* DO NOT EDIT THIS FILE - it is machine generated */
-#include <jni.h>
-/* Header for class org_apache_hadoop_fs_ceph_CephFS */
-
-#ifndef _Included_org_apache_hadoop_fs_ceph_CephFS
-#define _Included_org_apache_hadoop_fs_ceph_CephFS
-#ifdef __cplusplus
-extern "C" {
-#endif
-#ifdef __cplusplus
-}
-#endif
-#endif
diff --git a/src/client/hadoop/org_apache_hadoop_fs_ceph_CephFileSystem.h b/src/client/hadoop/org_apache_hadoop_fs_ceph_CephFileSystem.h
deleted file mode 100644
index 6f2bc93926c..00000000000
--- a/src/client/hadoop/org_apache_hadoop_fs_ceph_CephFileSystem.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* DO NOT EDIT THIS FILE - it is machine generated */
-#include <jni.h>
-/* Header for class org_apache_hadoop_fs_ceph_CephFileSystem */
-
-#ifndef _Included_org_apache_hadoop_fs_ceph_CephFileSystem
-#define _Included_org_apache_hadoop_fs_ceph_CephFileSystem
-#ifdef __cplusplus
-extern "C" {
-#endif
-#undef org_apache_hadoop_fs_ceph_CephFileSystem_EEXIST
-#define org_apache_hadoop_fs_ceph_CephFileSystem_EEXIST 17L
-#undef org_apache_hadoop_fs_ceph_CephFileSystem_ENOENT
-#define org_apache_hadoop_fs_ceph_CephFileSystem_ENOENT 2L
-#undef org_apache_hadoop_fs_ceph_CephFileSystem_FATAL
-#define org_apache_hadoop_fs_ceph_CephFileSystem_FATAL 0L
-#undef org_apache_hadoop_fs_ceph_CephFileSystem_ERROR
-#define org_apache_hadoop_fs_ceph_CephFileSystem_ERROR 1L
-#undef org_apache_hadoop_fs_ceph_CephFileSystem_WARN
-#define org_apache_hadoop_fs_ceph_CephFileSystem_WARN 2L
-#undef org_apache_hadoop_fs_ceph_CephFileSystem_INFO
-#define org_apache_hadoop_fs_ceph_CephFileSystem_INFO 3L
-#undef org_apache_hadoop_fs_ceph_CephFileSystem_DEBUG
-#define org_apache_hadoop_fs_ceph_CephFileSystem_DEBUG 4L
-#undef org_apache_hadoop_fs_ceph_CephFileSystem_TRACE
-#define org_apache_hadoop_fs_ceph_CephFileSystem_TRACE 5L
-#undef org_apache_hadoop_fs_ceph_CephFileSystem_NOLOG
-#define org_apache_hadoop_fs_ceph_CephFileSystem_NOLOG 6L
-#ifdef __cplusplus
-}
-#endif
-#endif
diff --git a/src/client/hadoop/org_apache_hadoop_fs_ceph_CephFileSystem_CephStat.h b/src/client/hadoop/org_apache_hadoop_fs_ceph_CephFileSystem_CephStat.h
deleted file mode 100644
index 5ab70f7dc66..00000000000
--- a/src/client/hadoop/org_apache_hadoop_fs_ceph_CephFileSystem_CephStat.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* DO NOT EDIT THIS FILE - it is machine generated */
-#include <jni.h>
-/* Header for class org_apache_hadoop_fs_ceph_CephFileSystem_CephStat */
-
-#ifndef _Included_org_apache_hadoop_fs_ceph_CephFileSystem_CephStat
-#define _Included_org_apache_hadoop_fs_ceph_CephFileSystem_CephStat
-#ifdef __cplusplus
-extern "C" {
-#endif
-#ifdef __cplusplus
-}
-#endif
-#endif
diff --git a/src/client/hadoop/org_apache_hadoop_fs_ceph_CephFileSystem_Stat.h b/src/client/hadoop/org_apache_hadoop_fs_ceph_CephFileSystem_Stat.h
deleted file mode 100644
index e9ade0e4504..00000000000
--- a/src/client/hadoop/org_apache_hadoop_fs_ceph_CephFileSystem_Stat.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* DO NOT EDIT THIS FILE - it is machine generated */
-#include <jni.h>
-/* Header for class org_apache_hadoop_fs_ceph_CephFileSystem_Stat */
-
-#ifndef _Included_org_apache_hadoop_fs_ceph_CephFileSystem_Stat
-#define _Included_org_apache_hadoop_fs_ceph_CephFileSystem_Stat
-#ifdef __cplusplus
-extern "C" {
-#endif
-#ifdef __cplusplus
-}
-#endif
-#endif
diff --git a/src/client/hadoop/org_apache_hadoop_fs_ceph_CephInputStream.h b/src/client/hadoop/org_apache_hadoop_fs_ceph_CephInputStream.h
deleted file mode 100644
index 4ec903294b7..00000000000
--- a/src/client/hadoop/org_apache_hadoop_fs_ceph_CephInputStream.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* DO NOT EDIT THIS FILE - it is machine generated */
-#include <jni.h>
-/* Header for class org_apache_hadoop_fs_ceph_CephInputStream */
-
-#ifndef _Included_org_apache_hadoop_fs_ceph_CephInputStream
-#define _Included_org_apache_hadoop_fs_ceph_CephInputStream
-#ifdef __cplusplus
-extern "C" {
-#endif
-#undef org_apache_hadoop_fs_ceph_CephInputStream_SKIP_BUFFER_SIZE
-#define org_apache_hadoop_fs_ceph_CephInputStream_SKIP_BUFFER_SIZE 2048L
-/*
- * Class: org_apache_hadoop_fs_ceph_CephInputStream
- * Method: ceph_read
- * Signature: (I[BII)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1read
- (JNIEnv *, jobject, jint, jbyteArray, jint, jint);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephInputStream
- * Method: ceph_seek_from_start
- * Signature: (IJ)J
- */
-JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1seek_1from_1start
- (JNIEnv *, jobject, jint, jlong);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephInputStream
- * Method: ceph_getpos
- * Signature: (I)J
- */
-JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1getpos
- (JNIEnv *, jobject, jint);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephInputStream
- * Method: ceph_close
- * Signature: (I)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1close
- (JNIEnv *, jobject, jint);
-
-#ifdef __cplusplus
-}
-#endif
-#endif
diff --git a/src/client/hadoop/org_apache_hadoop_fs_ceph_CephOutputStream.h b/src/client/hadoop/org_apache_hadoop_fs_ceph_CephOutputStream.h
deleted file mode 100644
index 676b137c9f9..00000000000
--- a/src/client/hadoop/org_apache_hadoop_fs_ceph_CephOutputStream.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* DO NOT EDIT THIS FILE - it is machine generated */
-#include <jni.h>
-/* Header for class org_apache_hadoop_fs_ceph_CephOutputStream */
-
-#ifndef _Included_org_apache_hadoop_fs_ceph_CephOutputStream
-#define _Included_org_apache_hadoop_fs_ceph_CephOutputStream
-#ifdef __cplusplus
-extern "C" {
-#endif
-/*
- * Class: org_apache_hadoop_fs_ceph_CephOutputStream
- * Method: ceph_getpos
- * Signature: (I)J
- */
-JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1getpos
- (JNIEnv *, jobject, jint);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephOutputStream
- * Method: ceph_close
- * Signature: (I)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1close
- (JNIEnv *, jobject, jint);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephOutputStream
- * Method: ceph_write
- * Signature: (I[BII)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1write
- (JNIEnv *, jobject, jint, jbyteArray, jint, jint);
-
-#ifdef __cplusplus
-}
-#endif
-#endif
diff --git a/src/client/hadoop/org_apache_hadoop_fs_ceph_CephTalker.h b/src/client/hadoop/org_apache_hadoop_fs_ceph_CephTalker.h
deleted file mode 100644
index 55854549b8c..00000000000
--- a/src/client/hadoop/org_apache_hadoop_fs_ceph_CephTalker.h
+++ /dev/null
@@ -1,197 +0,0 @@
-/* DO NOT EDIT THIS FILE - it is machine generated */
-#include <jni.h>
-/* Header for class org_apache_hadoop_fs_ceph_CephTalker */
-
-#ifndef _Included_org_apache_hadoop_fs_ceph_CephTalker
-#define _Included_org_apache_hadoop_fs_ceph_CephTalker
-#ifdef __cplusplus
-extern "C" {
-#endif
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_initializeClient
- * Signature: (Ljava/lang/String;I)Z
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1initializeClient
- (JNIEnv *, jobject, jstring, jint);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_getcwd
- * Signature: ()Ljava/lang/String;
- */
-JNIEXPORT jstring JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1getcwd
- (JNIEnv *, jobject);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_setcwd
- * Signature: (Ljava/lang/String;)Z
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1setcwd
- (JNIEnv *, jobject, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_rmdir
- * Signature: (Ljava/lang/String;)Z
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1rmdir
- (JNIEnv *, jobject, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_unlink
- * Signature: (Ljava/lang/String;)Z
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1unlink
- (JNIEnv *, jobject, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_rename
- * Signature: (Ljava/lang/String;Ljava/lang/String;)Z
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1rename
- (JNIEnv *, jobject, jstring, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_exists
- * Signature: (Ljava/lang/String;)Z
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1exists
- (JNIEnv *, jobject, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_getblocksize
- * Signature: (Ljava/lang/String;)J
- */
-JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1getblocksize
- (JNIEnv *, jobject, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_isdirectory
- * Signature: (Ljava/lang/String;)Z
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1isdirectory
- (JNIEnv *, jobject, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_isfile
- * Signature: (Ljava/lang/String;)Z
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1isfile
- (JNIEnv *, jobject, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_getdir
- * Signature: (Ljava/lang/String;)[Ljava/lang/String;
- */
-JNIEXPORT jobjectArray JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1getdir
- (JNIEnv *, jobject, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_mkdirs
- * Signature: (Ljava/lang/String;I)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1mkdirs
- (JNIEnv *, jobject, jstring, jint);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_open_for_append
- * Signature: (Ljava/lang/String;)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1open_1for_1append
- (JNIEnv *, jobject, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_open_for_read
- * Signature: (Ljava/lang/String;)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1open_1for_1read
- (JNIEnv *, jobject, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_open_for_overwrite
- * Signature: (Ljava/lang/String;I)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1open_1for_1overwrite
- (JNIEnv *, jobject, jstring, jint);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_close
- * Signature: (I)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1close
- (JNIEnv *, jobject, jint);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_setPermission
- * Signature: (Ljava/lang/String;I)Z
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1setPermission
- (JNIEnv *, jobject, jstring, jint);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_kill_client
- * Signature: ()Z
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1kill_1client
- (JNIEnv *, jobject);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_stat
- * Signature: (Ljava/lang/String;Lorg/apache/hadoop/fs/ceph/CephFileSystem/Stat;)Z
- */
-JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1stat
- (JNIEnv *, jobject, jstring, jobject);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_statfs
- * Signature: (Ljava/lang/String;Lorg/apache/hadoop/fs/ceph/CephFileSystem/CephStat;)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1statfs
- (JNIEnv *, jobject, jstring, jobject);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_replication
- * Signature: (Ljava/lang/String;)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1replication
- (JNIEnv *, jobject, jstring);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_hosts
- * Signature: (IJ)Ljava/lang/String;
- */
-JNIEXPORT jstring JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1hosts
- (JNIEnv *, jobject, jint, jlong);
-
-/*
- * Class: org_apache_hadoop_fs_ceph_CephTalker
- * Method: ceph_setTimes
- * Signature: (Ljava/lang/String;JJ)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephTalker_ceph_1setTimes
- (JNIEnv *, jobject, jstring, jlong, jlong);
-
-#ifdef __cplusplus
-}
-#endif
-#endif
diff --git a/src/client/ioctl.h b/src/client/ioctl.h
index c15d3340a76..35b4ce4ac0f 100644
--- a/src/client/ioctl.h
+++ b/src/client/ioctl.h
@@ -1,13 +1,14 @@
#ifndef FS_CEPH_IOCTL_H
#define FS_CEPH_IOCTL_H
+#include "include/int_types.h"
+
#if defined(__linux__)
#include <linux/ioctl.h>
#include <linux/types.h>
#elif defined(__FreeBSD__)
#include <sys/ioctl.h>
#include <sys/types.h>
-#include "include/inttypes.h"
#endif
#define CEPH_IOCTL_MAGIC 0x97
diff --git a/src/cls/Makefile.am b/src/cls/Makefile.am
new file mode 100644
index 00000000000..2d3d43cb1e3
--- /dev/null
+++ b/src/cls/Makefile.am
@@ -0,0 +1,122 @@
+## Rados object classes
+
+libcls_hello_la_SOURCES = cls/hello/cls_hello.cc
+libcls_hello_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libcls_hello_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*'
+radoslib_LTLIBRARIES += libcls_hello.la
+
+libcls_rbd_la_SOURCES = cls/rbd/cls_rbd.cc
+libcls_rbd_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libcls_rbd_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*'
+radoslib_LTLIBRARIES += libcls_rbd.la
+
+libcls_lock_la_SOURCES = cls/lock/cls_lock.cc
+libcls_lock_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libcls_lock_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*'
+radoslib_LTLIBRARIES += libcls_lock.la
+
+libcls_refcount_la_SOURCES = \
+ cls/refcount/cls_refcount.cc \
+ cls/refcount/cls_refcount_ops.cc \
+ common/ceph_json.cc
+libcls_refcount_la_LIBADD = libjson_spirit.la $(PTHREAD_LIBS) $(EXTRALIBS)
+libcls_refcount_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*'
+radoslib_LTLIBRARIES += libcls_refcount.la
+
+libcls_version_la_SOURCES = cls/version/cls_version.cc
+libcls_version_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libcls_version_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*'
+radoslib_LTLIBRARIES += libcls_version.la
+
+libcls_log_la_SOURCES = cls/log/cls_log.cc
+libcls_log_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libcls_log_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*'
+radoslib_LTLIBRARIES += libcls_log.la
+
+libcls_statelog_la_SOURCES = cls/statelog/cls_statelog.cc
+libcls_statelog_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libcls_statelog_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*'
+radoslib_LTLIBRARIES += libcls_statelog.la
+
+libcls_replica_log_la_SOURCES = cls/replica_log/cls_replica_log.cc
+libcls_replica_log_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libcls_replica_log_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*'
+radoslib_LTLIBRARIES += libcls_replica_log.la
+
+libcls_rgw_la_SOURCES = \
+ cls/rgw/cls_rgw.cc \
+ cls/rgw/cls_rgw_ops.cc \
+ cls/rgw/cls_rgw_types.cc \
+ common/ceph_json.cc
+libcls_rgw_la_LIBADD = libjson_spirit.la $(PTHREAD_LIBS) $(EXTRALIBS)
+libcls_rgw_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*'
+radoslib_LTLIBRARIES += libcls_rgw.la
+
+## Rados object client classes
+
+libcls_lock_client_la_SOURCES = \
+ cls/lock/cls_lock_client.cc \
+ cls/lock/cls_lock_types.cc \
+ cls/lock/cls_lock_ops.cc
+noinst_LTLIBRARIES += libcls_lock_client.la
+DENCODER_DEPS += libcls_lock_client.la
+
+libcls_refcount_client_la_SOURCES = \
+ cls/refcount/cls_refcount_client.cc \
+ cls/refcount/cls_refcount_ops.cc
+noinst_LTLIBRARIES += libcls_refcount_client.la
+DENCODER_DEPS += libcls_refcount_client.la
+
+libcls_version_client_a_SOURCES = \
+ cls/version/cls_version_client.cc \
+ cls/version/cls_version_types.cc
+noinst_LIBRARIES += libcls_version_client.a
+
+libcls_log_client_a_SOURCES = cls/log/cls_log_client.cc
+noinst_LIBRARIES += libcls_log_client.a
+
+libcls_statelog_client_a_SOURCES = cls/statelog/cls_statelog_client.cc
+noinst_LIBRARIES += libcls_statelog_client.a
+
+libcls_replica_log_client_a_SOURCES = \
+ cls/replica_log/cls_replica_log_types.cc \
+ cls/replica_log/cls_replica_log_ops.cc \
+ cls/replica_log/cls_replica_log_client.cc
+noinst_LIBRARIES += libcls_replica_log_client.a
+DENCODER_DEPS += libcls_replica_log_client.a
+
+libcls_rgw_client_la_SOURCES = \
+ cls/rgw/cls_rgw_client.cc \
+ cls/rgw/cls_rgw_types.cc \
+ cls/rgw/cls_rgw_ops.cc
+noinst_LTLIBRARIES += libcls_rgw_client.la
+DENCODER_DEPS += libcls_rgw_client.la
+
+libcls_rbd_client_la_SOURCES = cls/rbd/cls_rbd_client.cc
+noinst_LTLIBRARIES += libcls_rbd_client.la
+
+
+noinst_HEADERS += \
+ cls/lock/cls_lock_types.h \
+ cls/lock/cls_lock_ops.h \
+ cls/lock/cls_lock_client.h \
+ cls/rbd/cls_rbd.h \
+ cls/rbd/cls_rbd_client.h \
+ cls/refcount/cls_refcount_ops.h \
+ cls/refcount/cls_refcount_client.h \
+ cls/version/cls_version_types.h \
+ cls/version/cls_version_ops.h \
+ cls/version/cls_version_client.h \
+ cls/log/cls_log_types.h \
+ cls/log/cls_log_ops.h \
+ cls/log/cls_log_client.h \
+ cls/statelog/cls_statelog_types.h \
+ cls/statelog/cls_statelog_ops.h \
+ cls/statelog/cls_statelog_client.h \
+ cls/replica_log/cls_replica_log_types.h \
+ cls/replica_log/cls_replica_log_ops.h \
+ cls/replica_log/cls_replica_log_client.h \
+ cls/rgw/cls_rgw_client.h \
+ cls/rgw/cls_rgw_ops.h \
+ cls/rgw/cls_rgw_types.h
+
diff --git a/src/cls/rbd/cls_rbd.cc b/src/cls/rbd/cls_rbd.cc
index 420de514ee6..12947a08540 100644
--- a/src/cls/rbd/cls_rbd.cc
+++ b/src/cls/rbd/cls_rbd.cc
@@ -25,6 +25,8 @@
* parameters as the client sees them - it would be silly to mention
* in each one that they take an input and an output bufferlist.
*/
+#include "include/int_types.h"
+#include "include/types.h"
#include <algorithm>
#include <cstring>
@@ -35,10 +37,8 @@
#include <sstream>
#include <vector>
-#include "include/types.h"
#include "objclass/objclass.h"
#include "include/rbd_types.h"
-#include <inttypes.h>
#include "cls/rbd/cls_rbd.h"
diff --git a/src/cls/rgw/cls_rgw.cc b/src/cls/rgw/cls_rgw.cc
index 6cda4cba5c3..2f5711ecb32 100644
--- a/src/cls/rgw/cls_rgw.cc
+++ b/src/cls/rgw/cls_rgw.cc
@@ -1,14 +1,14 @@
// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
-#include <iostream>
+#include "include/int_types.h"
+#include "include/types.h"
+#include <iostream>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
-#include <inttypes.h>
-#include "include/types.h"
#include "include/utime.h"
#include "objclass/objclass.h"
#include "cls/rgw/cls_rgw_ops.h"
diff --git a/src/cls/rgw/cls_rgw_client.cc b/src/cls/rgw/cls_rgw_client.cc
index 165ca437987..2851f2bd702 100644
--- a/src/cls/rgw/cls_rgw_client.cc
+++ b/src/cls/rgw/cls_rgw_client.cc
@@ -2,6 +2,7 @@
#include "include/types.h"
#include "cls/rgw/cls_rgw_ops.h"
+#include "cls/rgw/cls_rgw_client.h"
#include "include/rados/librados.hpp"
#include "common/debug.h"
@@ -157,6 +158,44 @@ int cls_rgw_get_dir_header(IoCtx& io_ctx, string& oid, rgw_bucket_dir_header *he
return r;
}
+class GetDirHeaderCompletion : public ObjectOperationCompletion {
+ RGWGetDirHeader_CB *ret_ctx;
+public:
+ GetDirHeaderCompletion(RGWGetDirHeader_CB *_ctx) : ret_ctx(_ctx) {}
+ ~GetDirHeaderCompletion() {
+ ret_ctx->put();
+ }
+ void handle_completion(int r, bufferlist& outbl) {
+ struct rgw_cls_list_ret ret;
+ try {
+ bufferlist::iterator iter = outbl.begin();
+ ::decode(ret, iter);
+ } catch (buffer::error& err) {
+ r = -EIO;
+ }
+
+ ret_ctx->handle_response(r, ret.dir.header);
+ };
+};
+
+int cls_rgw_get_dir_header_async(IoCtx& io_ctx, string& oid, RGWGetDirHeader_CB *ctx)
+{
+ bufferlist in, out;
+ struct rgw_cls_list_op call;
+ call.num_entries = 0;
+ ::encode(call, in);
+ ObjectReadOperation op;
+ GetDirHeaderCompletion *cb = new GetDirHeaderCompletion(ctx);
+ op.exec("rgw", "bucket_list", in, cb);
+ AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
+ int r = io_ctx.aio_operate(oid, c, &op, NULL);
+ c->release();
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
int cls_rgw_bi_log_list(IoCtx& io_ctx, string& oid, string& marker, uint32_t max,
list<rgw_bi_log_entry>& entries, bool *truncated)
{
diff --git a/src/cls/rgw/cls_rgw_client.h b/src/cls/rgw/cls_rgw_client.h
index 2ea5d9ca771..39bb3c9fc4a 100644
--- a/src/cls/rgw/cls_rgw_client.h
+++ b/src/cls/rgw/cls_rgw_client.h
@@ -4,6 +4,13 @@
#include "include/types.h"
#include "include/rados/librados.hpp"
#include "cls_rgw_types.h"
+#include "common/RefCountedObj.h"
+
+class RGWGetDirHeader_CB : public RefCountedObject {
+public:
+ virtual ~RGWGetDirHeader_CB() {}
+ virtual void handle_response(int r, rgw_bucket_dir_header& header) = 0;
+};
/* bucket index */
void cls_rgw_bucket_init(librados::ObjectWriteOperation& o);
@@ -27,6 +34,7 @@ int cls_rgw_bucket_check_index_op(librados::IoCtx& io_ctx, string& oid,
int cls_rgw_bucket_rebuild_index_op(librados::IoCtx& io_ctx, string& oid);
int cls_rgw_get_dir_header(librados::IoCtx& io_ctx, string& oid, rgw_bucket_dir_header *header);
+int cls_rgw_get_dir_header_async(librados::IoCtx& io_ctx, string& oid, RGWGetDirHeader_CB *ctx);
void cls_rgw_encode_suggestion(char op, rgw_bucket_dir_entry& dirent, bufferlist& updates);
diff --git a/src/common/Cond.h b/src/common/Cond.h
index e6a13ae48bb..46fdf159112 100644
--- a/src/common/Cond.h
+++ b/src/common/Cond.h
@@ -32,8 +32,8 @@ class Cond {
Mutex *waiter_mutex;
// don't allow copying.
- void operator=(Cond &C) {}
- Cond( const Cond &C ) {}
+ void operator=(Cond &C);
+ Cond(const Cond &C);
public:
Cond() : waiter_mutex(NULL) {
diff --git a/src/common/Formatter.cc b/src/common/Formatter.cc
index c08ea5b9a20..4ef833a45f8 100644
--- a/src/common/Formatter.cc
+++ b/src/common/Formatter.cc
@@ -14,11 +14,12 @@
#define LARGE_SIZE 1024
+#include "include/int_types.h"
+
#include "assert.h"
#include "Formatter.h"
#include "common/escape.h"
-#include <inttypes.h>
#include <iostream>
#include <sstream>
#include <stdarg.h>
diff --git a/src/common/Formatter.h b/src/common/Formatter.h
index da730103f41..ac68b7f461d 100644
--- a/src/common/Formatter.h
+++ b/src/common/Formatter.h
@@ -1,8 +1,9 @@
#ifndef CEPH_FORMATTER_H
#define CEPH_FORMATTER_H
+#include "include/int_types.h"
+
#include <deque>
-#include <inttypes.h>
#include <iostream>
#include <list>
#include <ostream>
@@ -43,6 +44,9 @@ class Formatter {
virtual void dump_int(const char *name, int64_t s) = 0;
virtual void dump_float(const char *name, double d) = 0;
virtual void dump_string(const char *name, std::string s) = 0;
+ virtual void dump_bool(const char *name, bool b) {
+ dump_format_unquoted(name, "%s", (b ? "true" : "false"));
+ }
virtual std::ostream& dump_stream(const char *name) = 0;
virtual void dump_format(const char *name, const char *fmt, ...) = 0;
virtual void dump_format_unquoted(const char *name, const char *fmt, ...) = 0;
diff --git a/src/common/Makefile.am b/src/common/Makefile.am
new file mode 100644
index 00000000000..9ec6c3e895b
--- /dev/null
+++ b/src/common/Makefile.am
@@ -0,0 +1,202 @@
+libcommon_la_SOURCES = \
+ ceph_ver.c \
+ common/DecayCounter.cc \
+ common/LogClient.cc \
+ common/LogEntry.cc \
+ common/PrebufferedStreambuf.cc \
+ common/SloppyCRCMap.cc \
+ common/BackTrace.cc \
+ common/perf_counters.cc \
+ common/Mutex.cc \
+ common/OutputDataSocket.cc \
+ common/admin_socket.cc \
+ common/admin_socket_client.cc \
+ common/cmdparse.cc \
+ common/escape.c \
+ common/Clock.cc \
+ common/Throttle.cc \
+ common/Timer.cc \
+ common/Finisher.cc \
+ common/environment.cc\
+ common/assert.cc \
+ common/run_cmd.cc \
+ common/WorkQueue.cc \
+ common/ConfUtils.cc \
+ common/MemoryModel.cc \
+ common/armor.c \
+ common/fd.cc \
+ common/xattr.c \
+ common/safe_io.c \
+ common/snap_types.cc \
+ common/str_list.cc \
+ common/errno.cc \
+ common/RefCountedObj.cc \
+ common/blkdev.cc \
+ common/common_init.cc \
+ common/pipe.c \
+ common/ceph_argparse.cc \
+ common/ceph_context.cc \
+ common/buffer.cc \
+ common/code_environment.cc \
+ common/dout.cc \
+ common/signal.cc \
+ common/simple_spin.cc \
+ common/Thread.cc \
+ common/Formatter.cc \
+ common/HeartbeatMap.cc \
+ common/config.cc \
+ common/utf8.c \
+ common/mime.c \
+ common/strtol.cc \
+ common/page.cc \
+ common/lockdep.cc \
+ common/version.cc \
+ common/hex.cc \
+ common/entity_name.cc \
+ common/ceph_crypto.cc \
+ common/ceph_crypto_cms.cc \
+ common/ceph_json.cc \
+ common/ipaddr.cc \
+ common/pick_address.cc \
+ common/util.cc \
+ common/TextTable.cc \
+ common/ceph_fs.cc \
+ common/ceph_hash.cc \
+ common/ceph_strings.cc \
+ common/ceph_frag.cc \
+ common/addr_parsing.c \
+ common/hobject.cc \
+ common/bloom_filter.cc
+
+if LINUX
+libcommon_la_SOURCES += common/secret.c
+endif
+
+# these should go out of libcommon
+libcommon_la_SOURCES += \
+ mon/MonCap.cc \
+ mon/MonClient.cc \
+ mon/MonMap.cc \
+ osd/OSDMap.cc \
+ osd/osd_types.cc \
+ mds/MDSMap.cc \
+ mds/inode_backtrace.cc \
+ mds/mdstypes.cc
+
+# inject crc in common
+libcommon_crc_la_SOURCES = \
+ common/sctp_crc32.c \
+ common/crc32c.cc \
+ common/crc32c_intel_baseline.c \
+ common/crc32c_intel_fast.c
+
+if WITH_GOOD_YASM_ELF64
+libcommon_crc_la_SOURCES += common/crc32c_intel_fast_asm.S
+libcommon_crc_la_LIBTOOLFLAGS = --tag=CC
+endif
+LIBCOMMON_DEPS += libcommon_crc.la
+noinst_LTLIBRARIES += libcommon_crc.la
+
+noinst_HEADERS += \
+ common/bloom_filter.hpp \
+ common/sctp_crc32.h \
+ common/crc32c_intel_baseline.h \
+ common/crc32c_intel_fast.h
+
+
+# important; libmsg before libauth!
+LIBCOMMON_DEPS += \
+ $(LIBMSG) $(LIBAUTH) \
+ $(LIBCRUSH) $(LIBJSON_SPIRIT) $(LIBLOG) $(LIBARCH) \
+ -lkeyutils
+
+if LINUX
+LIBCOMMON_DEPS += -lrt
+endif # LINUX
+
+libcommon_la_LIBADD = $(LIBCOMMON_DEPS)
+
+noinst_HEADERS += \
+ common/BackTrace.h \
+ common/RefCountedObj.h \
+ common/HeartbeatMap.h \
+ common/LogClient.h \
+ common/LogEntry.h \
+ common/Preforker.h \
+ common/SloppyCRCMap.h \
+ common/WorkQueue.h \
+ common/PrioritizedQueue.h \
+ common/ceph_argparse.h \
+ common/ceph_context.h \
+ common/xattr.h \
+ common/blkdev.h \
+ common/compiler_extensions.h \
+ common/debug.h \
+ common/dout.h \
+ common/escape.h \
+ common/fd.h \
+ common/version.h \
+ common/hex.h \
+ common/entity_name.h \
+ common/errno.h \
+ common/environment.h \
+ common/likely.h \
+ common/lockdep.h \
+ common/obj_bencher.h \
+ common/snap_types.h \
+ common/Clock.h \
+ common/Cond.h \
+ common/ConfUtils.h \
+ common/DecayCounter.h \
+ common/Finisher.h \
+ common/Formatter.h \
+ common/perf_counters.h \
+ common/OutputDataSocket.h \
+ common/admin_socket.h \
+ common/admin_socket_client.h \
+ common/shared_cache.hpp \
+ common/tracked_int_ptr.hpp \
+ common/simple_cache.hpp \
+ common/sharedptr_registry.hpp \
+ common/map_cacher.hpp \
+ common/MemoryModel.h \
+ common/Mutex.h \
+ common/PrebufferedStreambuf.h \
+ common/RWLock.h \
+ common/Semaphore.h \
+ common/SimpleRNG.h \
+ common/TextTable.h \
+ common/Thread.h \
+ common/Throttle.h \
+ common/Timer.h \
+ common/TrackedOp.h \
+ common/arch.h \
+ common/armor.h \
+ common/common_init.h \
+ common/pipe.h \
+ common/code_environment.h \
+ common/signal.h \
+ common/simple_spin.h \
+ common/run_cmd.h \
+ common/safe_io.h \
+ common/config.h \
+ common/config_obs.h \
+ common/config_opts.h \
+ common/ceph_crypto.h \
+ common/ceph_crypto_cms.h \
+ common/ceph_json.h \
+ common/lru_map.h \
+ common/utf8.h \
+ common/mime.h \
+ common/pick_address.h \
+ common/secret.h \
+ common/strtol.h \
+ common/static_assert.h \
+ common/AsyncReserver.h \
+ common/sync_filesystem.h \
+ common/cmdparse.h \
+ common/hobject.h
+
+noinst_LTLIBRARIES += libcommon.la
+
+
diff --git a/src/common/Mutex.h b/src/common/Mutex.h
index 06e435d49df..e26a090703d 100644
--- a/src/common/Mutex.h
+++ b/src/common/Mutex.h
@@ -46,8 +46,8 @@ private:
PerfCounters *logger;
// don't allow copying.
- void operator=(Mutex &M) {}
- Mutex( const Mutex &M ) {}
+ void operator=(Mutex &M);
+ Mutex(const Mutex &M);
void _register() {
id = lockdep_register(name);
diff --git a/src/common/OutputDataSocket.cc b/src/common/OutputDataSocket.cc
index e4d21fe13ee..3051ca02dbe 100644
--- a/src/common/OutputDataSocket.cc
+++ b/src/common/OutputDataSocket.cc
@@ -12,6 +12,8 @@
*
*/
+#include "include/int_types.h"
+
#include "common/Thread.h"
#include "common/OutputDataSocket.h"
#include "common/config.h"
@@ -25,7 +27,6 @@
#include <errno.h>
#include <fcntl.h>
-#include <inttypes.h>
#include <map>
#include <poll.h>
#include <set>
diff --git a/src/common/SloppyCRCMap.cc b/src/common/SloppyCRCMap.cc
new file mode 100644
index 00000000000..7924ae6e8a7
--- /dev/null
+++ b/src/common/SloppyCRCMap.cc
@@ -0,0 +1,180 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/SloppyCRCMap.h"
+#include "common/Formatter.h"
+
+void SloppyCRCMap::write(uint64_t offset, uint64_t len, const bufferlist& bl,
+ std::ostream *out)
+{
+ int64_t left = len;
+ uint64_t pos = offset;
+ unsigned o = offset % block_size;
+ if (o) {
+ crc_map.erase(offset - o);
+ if (out)
+ *out << "write invalidate " << (offset - o) << "\n";
+ pos += (block_size - o);
+ left -= (block_size - o);
+ }
+ while (left >= block_size) {
+ bufferlist t;
+ t.substr_of(bl, pos - offset, block_size);
+ crc_map[pos] = t.crc32c(crc_iv);
+ if (out)
+ *out << "write set " << pos << " " << crc_map[pos] << "\n";
+ pos += block_size;
+ left -= block_size;
+ }
+ if (left > 0) {
+ crc_map.erase(pos);
+ if (out)
+ *out << "write invalidate " << pos << "\n";
+ }
+}
+
+int SloppyCRCMap::read(uint64_t offset, uint64_t len, const bufferlist& bl,
+ std::ostream *err)
+{
+ int errors = 0;
+ int64_t left = len;
+ uint64_t pos = offset;
+ unsigned o = offset % block_size;
+ if (o) {
+ pos += (block_size - o);
+ left -= (block_size - o);
+ }
+ while (left >= block_size) {
+ // FIXME: this could be more efficient if we avoid doing a find()
+ // on each iteration
+ std::map<uint64_t,uint32_t>::iterator p = crc_map.find(pos);
+ if (p != crc_map.end()) {
+ bufferlist t;
+ t.substr_of(bl, pos - offset, block_size);
+ uint32_t crc = t.crc32c(crc_iv);
+ if (p->second != crc) {
+ errors++;
+ if (err)
+ *err << "offset " << pos << " len " << block_size
+ << " has crc " << crc << " expected " << p->second << "\n";
+ }
+ }
+ pos += block_size;
+ left -= block_size;
+ }
+ return errors;
+}
+
+void SloppyCRCMap::truncate(uint64_t offset)
+{
+ offset -= offset % block_size;
+ std::map<uint64_t,uint32_t>::iterator p = crc_map.lower_bound(offset);
+ while (p != crc_map.end())
+ crc_map.erase(p++);
+}
+
+void SloppyCRCMap::zero(uint64_t offset, uint64_t len)
+{
+ int64_t left = len;
+ uint64_t pos = offset;
+ unsigned o = offset % block_size;
+ if (o) {
+ crc_map.erase(offset - o);
+ pos += (block_size - o);
+ left -= (block_size - o);
+ }
+ while (left >= block_size) {
+ crc_map[pos] = zero_crc;
+ pos += block_size;
+ left -= block_size;
+ }
+ if (left > 0)
+ crc_map.erase(pos);
+}
+
+void SloppyCRCMap::clone_range(uint64_t offset, uint64_t len,
+ uint64_t srcoff, const SloppyCRCMap& src,
+ std::ostream *out)
+{
+ int64_t left = len;
+ uint64_t pos = offset;
+ uint64_t srcpos = srcoff;
+ unsigned o = offset % block_size;
+ if (o) {
+ crc_map.erase(offset - o);
+ pos += (block_size - o);
+ srcpos += (block_size - o);
+ left -= (block_size - o);
+ if (out)
+ *out << "clone_range invalidate " << (offset - o) << "\n";
+ }
+ while (left >= block_size) {
+ // FIXME: this could be more efficient.
+ if (block_size == src.block_size) {
+ map<uint64_t,uint32_t>::const_iterator p = src.crc_map.find(srcpos);
+ if (p != src.crc_map.end()) {
+ crc_map[pos] = p->second;
+ if (out)
+ *out << "clone_range copy " << pos << " " << p->second << "\n";
+ } else {
+ crc_map.erase(pos);
+ if (out)
+ *out << "clone_range invalidate " << pos << "\n";
+ }
+ } else {
+ crc_map.erase(pos);
+ if (out)
+ *out << "clone_range invalidate " << pos << "\n";
+ }
+ pos += block_size;
+ srcpos += block_size;
+ left -= block_size;
+ }
+ if (left > 0) {
+ crc_map.erase(pos);
+ if (out)
+ *out << "clone_range invalidate " << pos << "\n";
+ }
+}
+
+void SloppyCRCMap::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ ::encode(block_size, bl);
+ ::encode(crc_map, bl);
+ ENCODE_FINISH(bl);
+}
+
+void SloppyCRCMap::decode(bufferlist::iterator& bl)
+{
+ DECODE_START(1, bl);
+ uint32_t bs;
+ ::decode(bs, bl);
+ set_block_size(bs);
+ ::decode(crc_map, bl);
+ DECODE_FINISH(bl);
+}
+
+void SloppyCRCMap::dump(Formatter *f) const
+{
+ f->dump_unsigned("block_size", block_size);
+ f->open_array_section("crc_map");
+ for (map<uint64_t,uint32_t>::const_iterator p = crc_map.begin(); p != crc_map.end(); ++p) {
+ f->open_object_section("crc");
+ f->dump_unsigned("offset", p->first);
+ f->dump_unsigned("crc", p->second);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void SloppyCRCMap::generate_test_instances(list<SloppyCRCMap*>& ls)
+{
+ ls.push_back(new SloppyCRCMap);
+ ls.push_back(new SloppyCRCMap(2));
+ bufferlist bl;
+ bl.append("some data");
+ ls.back()->write(1, bl.length(), bl);
+ ls.back()->write(10, bl.length(), bl);
+ ls.back()->zero(4, 2);
+}
diff --git a/src/common/SloppyCRCMap.h b/src/common/SloppyCRCMap.h
new file mode 100644
index 00000000000..c07b4d9bb9d
--- /dev/null
+++ b/src/common/SloppyCRCMap.h
@@ -0,0 +1,78 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_COMMON_SLOPPYCRCMAP_H
+#define CEPH_COMMON_SLOPPYCRCMAP_H
+
+#include "include/types.h"
+#include "include/encoding.h"
+
+#include <map>
+#include <ostream>
+
+/**
+ * SloppyCRCMap
+ *
+ * Opportunistically track CRCs on any reads or writes that cover full
+ * blocks. Verify read results when we have CRC data available for
+ * the given extent.
+ */
+class SloppyCRCMap {
+ static const int crc_iv = 0xffffffff;
+
+ std::map<uint64_t, uint32_t> crc_map; // offset -> crc(-1)
+ uint32_t block_size;
+ uint32_t zero_crc;
+
+public:
+ SloppyCRCMap(uint32_t b=0) {
+ set_block_size(b);
+ }
+
+ void set_block_size(uint32_t b) {
+ block_size = b;
+ //zero_crc = ceph_crc32c(0xffffffff, NULL, block_size);
+ if (b) {
+ bufferlist bl;
+ bufferptr bp(block_size);
+ bp.zero();
+ bl.append(bp);
+ zero_crc = bl.crc32c(crc_iv);
+ } else {
+ zero_crc = crc_iv;
+ }
+ }
+
+ /// update based on a write
+ void write(uint64_t offset, uint64_t len, const bufferlist& bl,
+ std::ostream *out = NULL);
+
+ /// update based on a truncate
+ void truncate(uint64_t offset);
+
+ /// update based on a zero/punch_hole
+ void zero(uint64_t offset, uint64_t len);
+
+ /// update based on a clone_range from another SloppyCRCMap
+ void clone_range(uint64_t offset, uint64_t len, uint64_t srcoff, const SloppyCRCMap& src,
+ std::ostream *out = NULL);
+
+ /**
+ * validate a read result
+ *
+ * @param offset offset
+ * @param len length
+ * @param bl data read
+ * @param err optional ostream to describe errors in detail
+ * @returns error count, 0 for success
+ */
+ int read(uint64_t offset, uint64_t len, const bufferlist& bl, std::ostream *err);
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::iterator& bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<SloppyCRCMap*>& ls);
+};
+WRITE_CLASS_ENCODER(SloppyCRCMap)
+
+#endif
diff --git a/src/common/TrackedOp.cc b/src/common/TrackedOp.cc
new file mode 100644
index 00000000000..d1dbc1e7135
--- /dev/null
+++ b/src/common/TrackedOp.cc
@@ -0,0 +1,265 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ * Copyright 2013 Inktank
+ */
+
+#include "TrackedOp.h"
+#include "common/Formatter.h"
+#include <iostream>
+#include <vector>
+#include "common/debug.h"
+#include "common/config.h"
+#include "msg/Message.h"
+#include "include/assert.h"
+
+#define dout_subsys ceph_subsys_optracker
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+
+// Writes the fixed tag that starts every debug line emitted from this file
+// and hands the stream back so the caller can keep appending.
+static ostream& _prefix(std::ostream* _dout)
+{
+  std::ostream& out = *_dout;
+  out << "-- op tracker -- ";
+  return out;
+}
+
+// Stop accepting new entries and drop every op reference still held.
+void OpHistory::on_shutdown()
+{
+  shutdown = true;
+  duration.clear();
+  arrived.clear();
+}
+
+// Record a finished op in both indexes, then trim the history.
+// Does nothing once on_shutdown() has run.
+void OpHistory::insert(utime_t now, TrackedOpRef op)
+{
+  if (!shutdown) {
+    duration.insert(make_pair(op->get_duration(), op));
+    arrived.insert(make_pair(op->get_arrived(), op));
+    cleanup(now);
+  }
+}
+
+// Trim the history in two passes:
+//  1. drop entries whose arrival is more than history_duration before `now`;
+//  2. while more than history_size entries remain, drop the one at the
+//     front of the duration index.
+// Each removal is applied to both indexes.
+void OpHistory::cleanup(utime_t now)
+{
+  while (!arrived.empty()) {
+    if (!(now - arrived.begin()->first > (double)(history_duration)))
+      break;
+    const TrackedOpRef &oldest = arrived.begin()->second;
+    duration.erase(make_pair(oldest->get_duration(), oldest));
+    arrived.erase(arrived.begin());
+  }
+
+  while (duration.size() > history_size) {
+    const TrackedOpRef &front = duration.begin()->second;
+    arrived.erase(make_pair(front->get_arrived(), front));
+    duration.erase(duration.begin());
+  }
+}
+
+// Trim the history, then emit the limits and every retained op
+// (in arrival order) through the formatter.
+void OpHistory::dump_ops(utime_t now, Formatter *f)
+{
+  typedef set<pair<utime_t, TrackedOpRef> >::const_iterator arrived_iter;
+
+  cleanup(now);
+
+  f->open_object_section("OpHistory");
+  f->dump_int("num to keep", history_size);
+  f->dump_int("duration to keep", history_duration);
+
+  f->open_array_section("Ops");
+  for (arrived_iter it = arrived.begin(); it != arrived.end(); ++it) {
+    f->open_object_section("Op");
+    it->second->dump(now, f);
+    f->close_section();
+  }
+  f->close_section();  // Ops
+
+  f->close_section();  // OpHistory
+}
+
+// Dump the completed-op history while holding the tracker lock.
+void OpTracker::dump_historic_ops(Formatter *f)
+{
+  Mutex::Locker locker(ops_in_flight_lock);
+  history.dump_ops(ceph_clock_now(cct), f);
+}
+
+// Emit the number of in-flight ops followed by one object per op.
+void OpTracker::dump_ops_in_flight(Formatter *f)
+{
+  Mutex::Locker locker(ops_in_flight_lock);
+
+  f->open_object_section("ops_in_flight");
+  f->dump_int("num_ops", ops_in_flight.size());
+
+  f->open_array_section("ops");
+  const utime_t now = ceph_clock_now(cct);
+  xlist<TrackedOp*>::iterator it = ops_in_flight.begin();
+  while (!it.end()) {
+    f->open_object_section("op");
+    (*it)->dump(now, f);
+    f->close_section();
+    ++it;
+  }
+  f->close_section();  // ops
+
+  f->close_section();  // ops_in_flight
+}
+
+// Append the op to the in-flight list and stamp it with the next sequence number.
+void OpTracker::register_inflight_op(xlist<TrackedOp*>::item *i)
+{
+  Mutex::Locker locker(ops_in_flight_lock);
+  ops_in_flight.push_back(i);
+  TrackedOp *added = ops_in_flight.back();
+  added->seq = seq;
+  ++seq;
+}
+
+// Move an op from the in-flight list into the history.
+// The history receives a fresh TrackedOpRef, so it owns the op from here on.
+void OpTracker::unregister_inflight_op(TrackedOp *i)
+{
+  Mutex::Locker locker(ops_in_flight_lock);
+  assert(i->xitem.get_list() == &ops_in_flight);
+
+  const utime_t now = ceph_clock_now(cct);
+  i->xitem.remove_myself();
+  i->request->clear_data();
+
+  TrackedOpRef owned(i);
+  history.insert(now, owned);
+}
+
+/**
+ * Collect warnings for in-flight ops older than complaint_time.
+ *
+ * On return warning_vector[0] holds a summary line and the following
+ * entries hold one line per op actually reported (at most log_threshold).
+ * An op is reported again only after its backoff interval has doubled.
+ *
+ * @param warning_vector filled with the summary and per-op warnings
+ * @return true if warning_vector is non-empty on return
+ */
+bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
+{
+  Mutex::Locker locker(ops_in_flight_lock);
+  if (!ops_in_flight.size())
+    return false;
+
+  utime_t now = ceph_clock_now(cct);
+  utime_t too_old = now;
+  too_old -= complaint_time;
+
+  utime_t oldest_secs = now - ops_in_flight.front()->get_arrived();
+
+  dout(10) << "ops_in_flight.size: " << ops_in_flight.size()
+           << "; oldest is " << oldest_secs
+           << " seconds old" << dendl;
+
+  if (oldest_secs < complaint_time)
+    return false;
+
+  xlist<TrackedOp*>::iterator i = ops_in_flight.begin();
+  warning_vector.reserve(log_threshold + 1);
+
+  int slow = 0;     // total slow
+  int warned = 0;   // total logged, i.e. lines appended after the summary
+  bool summarize = false;  // at least one op was due for a warning
+  while (!i.end() && (*i)->get_arrived() < too_old) {
+    slow++;
+
+    // exponential backoff of warning intervals
+    if (((*i)->get_arrived() +
+         (complaint_time * (*i)->warn_interval_multiplier)) < now) {
+      // will warn; slot 0 is reserved for the summary line
+      if (warning_vector.empty())
+        warning_vector.push_back("");
+      summarize = true;
+
+      // Check the limit before counting this op, so that `warned`
+      // always equals the number of lines really included below.
+      if (warned >= log_threshold)
+        break;
+      warned++;
+
+      utime_t age = now - (*i)->get_arrived();
+      stringstream ss;
+      ss << "slow request " << age << " seconds old, received at " << (*i)->get_arrived()
+         << ": " << *((*i)->request) << " currently "
+         << ((*i)->current.size() ? (*i)->current : (*i)->state_string());
+      warning_vector.push_back(ss.str());
+
+      // only those that have been shown will backoff
+      (*i)->warn_interval_multiplier *= 2;
+    }
+    ++i;
+  }
+
+  // only summarize if we warn about any. if everything has backed
+  // off, we will stay silent.
+  if (summarize) {
+    stringstream ss;
+    ss << slow << " slow requests, " << warned << " included below; oldest blocked for > "
+       << oldest_secs << " secs";
+    warning_vector[0] = ss.str();
+  }
+
+  return !warning_vector.empty();
+}
+
+/**
+ * Fill a power-of-two histogram with the ages (in ms) of in-flight ops.
+ *
+ * The list is walked front to back with a bin cursor that only moves
+ * down, so ages are expected not to increase along the list
+ * (register_inflight_op appends new ops at the back).  Ages at or above
+ * 2^29 ms all land in bin 30.
+ *
+ * @param h histogram to overwrite
+ */
+void OpTracker::get_age_ms_histogram(pow2_hist_t *h)
+{
+  Mutex::Locker locker(ops_in_flight_lock);
+
+  h->clear();
+
+  // Use this tracker's context for the clock, as the sibling methods do,
+  // so all ages are measured against the same clock.
+  utime_t now = ceph_clock_now(cct);
+  unsigned bin = 30;
+  uint32_t lb = 1u << (bin-1);  // lower bound for this bin
+  int count = 0;
+  for (xlist<TrackedOp*>::iterator i = ops_in_flight.begin(); !i.end(); ++i) {
+    utime_t age = now - (*i)->get_arrived();
+    uint32_t ms = (long)(age * 1000.0);
+    if (ms >= lb) {
+      count++;
+      continue;
+    }
+    // this op belongs to a lower bin: flush the current one first
+    if (count)
+      h->set(bin, count);
+    while (lb > ms) {
+      bin--;
+      lb >>= 1;
+    }
+    count = 1;
+  }
+  if (count)
+    h->set(bin, count);
+}
+
+// Log an event for `op`, stamped with the current time.
+void OpTracker::mark_event(TrackedOp *op, const string &dest)
+{
+  _mark_event(op, dest, ceph_clock_now(cct));
+}
+
+// Emit a level-5 debug line describing the event; tracker state is not changed.
+// The "reqid: " field is disabled, which is why the text starts with ", seq: ".
+void OpTracker::_mark_event(TrackedOp *op, const string &evt,
+                            utime_t time)
+{
+  Mutex::Locker locker(ops_in_flight_lock);
+  dout(5) << ", seq: " << op->seq
+          << ", time: " << time
+          << ", event: " << evt
+          << ", request: " << *op->request
+          << dendl;
+}
+
+// Deleter functor: marks the op "done" and hands it to the tracker.
+// The op is deliberately not deleted here; unregister_inflight_op()
+// takes ownership of it.
+void OpTracker::RemoveOnDelete::operator()(TrackedOp *op)
+{
+  op->mark_event("done");
+  tracker->unregister_inflight_op(op);
+}
+
+// Append a timestamped event to this op, report it to the tracker,
+// then run the subclass hook.  The event list lock is released before
+// calling out.
+void TrackedOp::mark_event(const string &event)
+{
+  const utime_t stamp = ceph_clock_now(g_ceph_context);
+  {
+    Mutex::Locker guard(lock);
+    events.push_back(make_pair(stamp, event));
+  }
+
+  tracker->mark_event(this, event);
+  _event_marked();
+}
+
+// Describe this op through the formatter: the printed request, arrival
+// time, age relative to `now`, duration, and the subclass data from _dump().
+void TrackedOp::dump(utime_t now, Formatter *f) const
+{
+  stringstream description;
+  request->print(description);
+
+  f->dump_string("description", description.str().c_str());
+  f->dump_stream("received_at") << get_arrived();
+  f->dump_float("age", now - get_arrived());
+  f->dump_float("duration", get_duration());
+
+  f->open_array_section("type_data");
+  _dump(now, f);
+  f->close_section();
+}
diff --git a/src/common/TrackedOp.h b/src/common/TrackedOp.h
index 753331df7f3..44e03905759 100644
--- a/src/common/TrackedOp.h
+++ b/src/common/TrackedOp.h
@@ -17,15 +17,163 @@
#include <stdint.h>
#include <include/utime.h>
#include "common/Mutex.h"
+#include "include/histogram.h"
#include "include/xlist.h"
#include "msg/Message.h"
#include <tr1/memory>
+class TrackedOp;
+typedef std::tr1::shared_ptr<TrackedOp> TrackedOpRef;
+
+class OpTracker;
+class OpHistory { // bounded record of ops handed over by OpTracker::unregister_inflight_op()
+ set<pair<utime_t, TrackedOpRef> > arrived; // retained ops keyed by arrival time
+ set<pair<double, TrackedOpRef> > duration; // the same ops keyed by duration
+ void cleanup(utime_t now); // trims both sets against history_duration and history_size
+ bool shutdown; // set by on_shutdown(); insert() is a no-op afterwards
+ OpTracker *tracker;
+ uint32_t history_size; // dumped as "num to keep"
+ uint32_t history_duration; // dumped as "duration to keep"
+
+public:
+ OpHistory(OpTracker *tracker_) : shutdown(false), tracker(tracker_),
+ history_size(0), history_duration(0) {}
+ ~OpHistory() {
+ assert(arrived.empty()); // on_shutdown() is expected to have emptied both sets
+ assert(duration.empty());
+ }
+ void insert(utime_t now, TrackedOpRef op);
+ void dump_ops(utime_t now, Formatter *f);
+ void on_shutdown();
+ void set_size_and_duration(uint32_t new_size, uint32_t new_duration) {
+ history_size = new_size;
+ history_duration = new_duration;
+ }
+};
+
+class OpTracker { // tracks in-flight TrackedOps and keeps a history of finished ones
+ class RemoveOnDelete { // deleter installed on refs made by create_request()
+ OpTracker *tracker;
+ public:
+ RemoveOnDelete(OpTracker *tracker) : tracker(tracker) {}
+ void operator()(TrackedOp *op);
+ };
+ friend class RemoveOnDelete;
+ friend class OpHistory;
+ uint64_t seq; // next sequence number handed out by register_inflight_op()
+ Mutex ops_in_flight_lock; // taken by the dump/register/unregister/check methods
+ xlist<TrackedOp *> ops_in_flight;
+ OpHistory history;
+ float complaint_time; // age threshold used by check_ops_in_flight()
+ int log_threshold; // cap on per-op warnings in check_ops_in_flight()
+
+public:
+ CephContext *cct;
+ OpTracker(CephContext *cct_) : seq(0), ops_in_flight_lock("OpTracker mutex"),
+ history(this), complaint_time(0), log_threshold(0), cct(cct_) {}
+ void set_complaint_and_threshold(float time, int threshold) {
+ complaint_time = time;
+ log_threshold = threshold;
+ }
+ void set_history_size_and_duration(uint32_t new_size, uint32_t new_duration) {
+ history.set_size_and_duration(new_size, new_duration);
+ }
+ void dump_ops_in_flight(Formatter *f);
+ void dump_historic_ops(Formatter *f);
+ void register_inflight_op(xlist<TrackedOp*>::item *i);
+ void unregister_inflight_op(TrackedOp *i);
+
+ void get_age_ms_histogram(pow2_hist_t *h);
+
+ /**
+ * Look for Ops which are too old, and insert warning
+ * strings for each Op that is too old.
+ *
+ * @param warning_strings A vector<string> reference which is filled
+ * with a warning string for each old Op.
+ * @return True if there are any Ops to warn on, false otherwise.
+ */
+ bool check_ops_in_flight(std::vector<string> &warning_strings);
+ void mark_event(TrackedOp *op, const string &evt);
+ void _mark_event(TrackedOp *op, const string &evt, utime_t now);
+
+ void on_shutdown() {
+ Mutex::Locker l(ops_in_flight_lock);
+ history.on_shutdown();
+ }
+ ~OpTracker() {
+ assert(ops_in_flight.empty());
+ }
+
+ template <typename T>
+ typename T::Ref create_request(Message *ref)
+ {
+ typename T::Ref retval(new T(ref, this),
+ RemoveOnDelete(this)); // deleter routes the op to unregister_inflight_op()
+
+ _mark_event(retval.get(), "header_read", ref->get_recv_stamp());
+ _mark_event(retval.get(), "throttled", ref->get_throttle_stamp());
+ _mark_event(retval.get(), "all_read", ref->get_recv_complete_stamp());
+ _mark_event(retval.get(), "dispatched", ref->get_dispatch_stamp());
+
+ retval->init_from_message();
+
+ return retval;
+ }
+};
+
class TrackedOp {
+private:
+ friend class OpHistory;
+ friend class OpTracker;
+ xlist<TrackedOp*>::item xitem;
+protected:
+ Message *request; /// the logical request we are tracking
+ OpTracker *tracker; /// the tracker we are associated with
+
+ list<pair<utime_t, string> > events; /// list of events and their times
+ Mutex lock; /// to protect the events list
+ string current; /// the current state the event is in
+ uint64_t seq; /// a unique value set by the OpTracker
+
+ uint32_t warn_interval_multiplier; // limits output of a given op warning
+
+ TrackedOp(Message *req, OpTracker *_tracker) :
+ xitem(this),
+ request(req),
+ tracker(_tracker),
+ lock("TrackedOp::lock"),
+ seq(0),
+ warn_interval_multiplier(1)
+ {
+ tracker->register_inflight_op(&xitem);
+ }
+
+ virtual void init_from_message() {}
+ /// output any type-specific data you want to get when dump() is called
+ virtual void _dump(utime_t now, Formatter *f) const {}
+ /// if you want something else to happen when events are marked, implement
+ virtual void _event_marked() {}
+
public:
- virtual void mark_event(const string &event) = 0;
- virtual ~TrackedOp() {}
+ virtual ~TrackedOp() { assert(request); request->put(); }
+
+ utime_t get_arrived() const {
+ return request->get_recv_stamp();
+ }
+ // This function maybe needs some work; assumes last event is completion time
+ double get_duration() const {
+ return events.size() ?
+ (events.rbegin()->first - get_arrived()) :
+ 0.0;
+ }
+ Message *get_req() const { return request; }
+
+ void mark_event(const string &event);
+ virtual const char *state_string() const {
+ return events.rbegin()->second.c_str();
+ }
+ void dump(utime_t now, Formatter *f) const;
};
-typedef std::tr1::shared_ptr<TrackedOp> TrackedOpRef;
#endif
diff --git a/src/common/WorkQueue.h b/src/common/WorkQueue.h
index b2742accdce..794b577a71d 100644
--- a/src/common/WorkQueue.h
+++ b/src/common/WorkQueue.h
@@ -390,6 +390,43 @@ public:
void drain(WorkQueue_* wq = 0);
};
+/// Work queue of GenContext callbacks; each dequeued item is completed
+/// with the thread pool handle of the worker that runs it.
+class GenContextWQ :
+  public ThreadPool::WorkQueueVal<GenContext<ThreadPool::TPHandle&>*> {
+  list<GenContext<ThreadPool::TPHandle&>*> _queue;
+public:
+  GenContextWQ(const string &name, time_t ti, ThreadPool *tp)
+    : ThreadPool::WorkQueueVal<
+        GenContext<ThreadPool::TPHandle&>*>(name, ti, ti*10, tp) {}
+
+  /// append to the back of the queue
+  void _enqueue(GenContext<ThreadPool::TPHandle&> *c) {
+    _queue.push_back(c);
+  }
+  /// put at the head of the queue
+  void _enqueue_front(GenContext<ThreadPool::TPHandle&> *c) {
+    _queue.push_front(c);
+  }
+  bool _empty() {
+    return _queue.empty();
+  }
+  /// remove and return the head; the queue must not be empty
+  GenContext<ThreadPool::TPHandle&> *_dequeue() {
+    assert(!_queue.empty());
+    GenContext<ThreadPool::TPHandle&> *head = _queue.front();
+    _queue.pop_front();
+    return head;
+  }
+  void _process(GenContext<ThreadPool::TPHandle&> *c, ThreadPool::TPHandle &tp) {
+    c->complete(tp);
+  }
+};
+/// Context that, when finished, queues a GenContext on a GenContextWQ.
+/// The result code passed to finish() is ignored.
+class C_QueueInWQ : public Context {
+  GenContextWQ *wq;
+  GenContext<ThreadPool::TPHandle&> *c;
+public:
+  C_QueueInWQ(GenContextWQ *wq, GenContext<ThreadPool::TPHandle &> *c)
+    : wq(wq),
+      c(c)
+  {}
+
+  void finish(int) {
+    wq->queue(c);
+  }
+};
#endif
diff --git a/src/include/addr_parsing.c b/src/common/addr_parsing.c
index c01f817772c..c01f817772c 100644
--- a/src/include/addr_parsing.c
+++ b/src/common/addr_parsing.c
diff --git a/src/common/admin_socket.cc b/src/common/admin_socket.cc
index 1a507e606bf..12e5868b409 100644
--- a/src/common/admin_socket.cc
+++ b/src/common/admin_socket.cc
@@ -12,6 +12,8 @@
*
*/
+#include "include/int_types.h"
+
#include "common/Thread.h"
#include "common/admin_socket.h"
#include "common/config.h"
@@ -26,7 +28,6 @@
#include <errno.h>
#include <fcntl.h>
-#include <inttypes.h>
#include <map>
#include <poll.h>
#include <set>
diff --git a/src/common/admin_socket_client.cc b/src/common/admin_socket_client.cc
index 782e808ad86..335695f9b4b 100644
--- a/src/common/admin_socket_client.cc
+++ b/src/common/admin_socket_client.cc
@@ -12,6 +12,8 @@
*
*/
+#include "include/int_types.h"
+
#include "common/admin_socket.h"
#include "common/ceph_context.h"
#include "common/errno.h"
@@ -21,7 +23,6 @@
#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
-#include <inttypes.h>
#include <map>
#include <poll.h>
#include <sstream>
diff --git a/src/common/blkdev.cc b/src/common/blkdev.cc
index 22b0aa67b3e..b0dc0a54e9e 100644
--- a/src/common/blkdev.cc
+++ b/src/common/blkdev.cc
@@ -1,8 +1,8 @@
-#include <inttypes.h>
+#include "include/int_types.h"
+
#include <fcntl.h>
#include <sys/ioctl.h>
#include <errno.h>
-
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mount.h>
diff --git a/src/common/bloom_filter.cc b/src/common/bloom_filter.cc
new file mode 100644
index 00000000000..68875e925bf
--- /dev/null
+++ b/src/common/bloom_filter.cc
@@ -0,0 +1,137 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/types.h"
+#include "common/bloom_filter.hpp"
+
+void bloom_filter::encode(bufferlist& bl) const
+{
+ ENCODE_START(2, 2, bl);
+ ::encode((uint64_t)salt_count_, bl); // size_t members are widened to uint64_t before encoding
+ ::encode((uint64_t)insert_count_, bl);
+ ::encode((uint64_t)target_element_count_, bl);
+ ::encode((uint64_t)random_seed_, bl);
+ bufferptr bp((const char*)bit_table_, table_size_); // table_size_ is not written separately; it is the length of this buffer
+ ::encode(bp, bl);
+ ENCODE_FINISH(bl);
+}
+
+/**
+ * Rebuild the filter from its encoded form.
+ *
+ * Reads the four counters written by encode(), regenerates the salt
+ * vector from salt_count_ and random_seed_, and replaces the bit table
+ * with the decoded bytes (table_size_ is taken from their length).
+ */
+void bloom_filter::decode(bufferlist::iterator& p)
+{
+  DECODE_START(2, p);
+  uint64_t v;
+  ::decode(v, p);
+  salt_count_ = v;
+  ::decode(v, p);
+  insert_count_ = v;
+  ::decode(v, p);
+  target_element_count_ = v;
+  ::decode(v, p);
+  random_seed_ = v;
+  bufferlist t;
+  ::decode(t, p);
+
+  // salts are not stored; they are derived from the count and the seed
+  salt_.clear();
+  generate_unique_salt();
+  table_size_ = t.length();
+  // bit_table_ is allocated with new[], so it must be released with delete[]
+  delete[] bit_table_;
+  bit_table_ = NULL;
+  if (table_size_) {
+    bit_table_ = new cell_type[table_size_];
+    t.copy(0, table_size_, (char *)bit_table_);
+  }
+
+  DECODE_FINISH(p);
+}
+
+// Emit the scalar parameters, every salt, and every byte of the bit table.
+void bloom_filter::dump(Formatter *f) const
+{
+  f->dump_unsigned("salt_count", salt_count_);
+  f->dump_unsigned("table_size", table_size_);
+  f->dump_unsigned("insert_count", insert_count_);
+  f->dump_unsigned("target_element_count", target_element_count_);
+  f->dump_unsigned("random_seed", random_seed_);
+
+  f->open_array_section("salt_table");
+  std::vector<bloom_type>::const_iterator s = salt_.begin();
+  while (s != salt_.end()) {
+    f->dump_unsigned("salt", *s);
+    ++s;
+  }
+  f->close_section();
+
+  f->open_array_section("bit_table");
+  for (unsigned idx = 0; idx < table_size_; ++idx) {
+    const unsigned byte_value = (unsigned)bit_table_[idx];
+    f->dump_unsigned("byte", byte_value);
+  }
+  f->close_section();
+}
+
+// Produce three sample filters: an empty one, a small one with two keys,
+// and a larger one with five keys.
+void bloom_filter::generate_test_instances(list<bloom_filter*>& ls)
+{
+  ls.push_back(new bloom_filter(10, .5, 1));
+
+  ls.push_back(new bloom_filter(10, .5, 1));
+  bloom_filter *small = ls.back();
+  small->insert("foo");
+  small->insert("bar");
+
+  ls.push_back(new bloom_filter(50, .5, 1));
+  bloom_filter *large = ls.back();
+  large->insert("foo");
+  large->insert("bar");
+  large->insert("baz");
+  large->insert("boof");
+  large->insert("boogggg");
+}
+
+
+void compressible_bloom_filter::encode(bufferlist& bl) const
+{
+ ENCODE_START(2, 2, bl);
+ bloom_filter::encode(bl); // base-class state goes first, inside this envelope
+
+ uint32_t s = size_list.size(); // element count precedes the elements
+ ::encode(s, bl);
+ for (vector<size_t>::const_iterator p = size_list.begin();
+ p != size_list.end(); ++p)
+ ::encode((uint64_t)*p, bl); // each size_t is widened to uint64_t
+
+ ENCODE_FINISH(bl);
+}
+
+// Decode the base filter, then the 32-bit element count and that many
+// 64-bit table sizes.
+void compressible_bloom_filter::decode(bufferlist::iterator& p)
+{
+  DECODE_START(2, p);
+  bloom_filter::decode(p);
+
+  uint32_t count;
+  ::decode(count, p);
+  size_list.resize(count);
+  for (vector<size_t>::iterator slot = size_list.begin();
+       slot != size_list.end(); ++slot) {
+    uint64_t v;
+    ::decode(v, p);
+    *slot = v;
+  }
+
+  DECODE_FINISH(p);
+}
+
+// Dump the base filter followed by the list of table sizes.
+void compressible_bloom_filter::dump(Formatter *f) const
+{
+  bloom_filter::dump(f);
+
+  f->open_array_section("table_sizes");
+  vector<size_t>::const_iterator it = size_list.begin();
+  while (it != size_list.end()) {
+    f->dump_unsigned("size", (uint64_t)*it);
+    ++it;
+  }
+  f->close_section();
+}
+
+// Produce three sample filters; the last one is compressed between inserts.
+void compressible_bloom_filter::generate_test_instances(list<compressible_bloom_filter*>& ls)
+{
+  ls.push_back(new compressible_bloom_filter(10, .5, 1));
+
+  ls.push_back(new compressible_bloom_filter(10, .5, 1));
+  compressible_bloom_filter *small = ls.back();
+  small->insert("foo");
+  small->insert("bar");
+
+  ls.push_back(new compressible_bloom_filter(50, .5, 1));
+  compressible_bloom_filter *large = ls.back();
+  large->insert("foo");
+  large->insert("bar");
+  large->insert("baz");
+  large->insert("boof");
+  large->compress(20);
+  large->insert("boogggg");
+}
diff --git a/src/common/bloom_filter.hpp b/src/common/bloom_filter.hpp
new file mode 100644
index 00000000000..93787a89a60
--- /dev/null
+++ b/src/common/bloom_filter.hpp
@@ -0,0 +1,700 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ *******************************************************************
+ * *
+ * Open Bloom Filter *
+ * *
+ * Author: Arash Partow - 2000 *
+ * URL: http://www.partow.net/programming/hashfunctions/index.html *
+ * *
+ * Copyright notice: *
+ * Free use of the Open Bloom Filter Library is permitted under *
+ * the guidelines and in accordance with the most current version *
+ * of the Boost Software License, Version 1.0 *
+ * http://www.opensource.org/licenses/bsl1.0.html *
+ * *
+ *******************************************************************
+*/
+
+
+#ifndef COMMON_BLOOM_FILTER_HPP
+#define COMMON_BLOOM_FILTER_HPP
+
+#include <cstddef>
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "include/encoding.h"
+#include "common/Formatter.h"
+
+static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned)
+static const unsigned char bit_mask[bits_per_char] = {
+ 0x01, //00000001
+ 0x02, //00000010
+ 0x04, //00000100
+ 0x08, //00001000
+ 0x10, //00010000
+ 0x20, //00100000
+ 0x40, //01000000
+ 0x80 //10000000
+};
+
+
+class bloom_filter
+{
+protected:
+
+ typedef unsigned int bloom_type;
+ typedef unsigned char cell_type;
+
+ unsigned char* bit_table_; ///< pointer to bit map
+ std::vector<bloom_type> salt_; ///< vector of salts
+ std::size_t salt_count_; ///< number of salts
+ std::size_t table_size_; ///< bit table size in bytes
+ std::size_t insert_count_; ///< insertion count
+ std::size_t target_element_count_; ///< target number of unique insertions
+ std::size_t random_seed_; ///< random seed
+
+public:
+
+ bloom_filter() // empty filter: null bit table, all counters zero, no salts
+ : bit_table_(0),
+ salt_count_(0),
+ table_size_(0),
+ insert_count_(0),
+ target_element_count_(0),
+ random_seed_(0)
+ {}
+
+ bloom_filter(const std::size_t& predicted_inserted_element_count, // sizes the filter from a target element count and false-positive rate
+ const double& false_positive_probability,
+ const std::size_t& random_seed)
+ : bit_table_(0),
+ insert_count_(0),
+ target_element_count_(predicted_inserted_element_count),
+ random_seed_((random_seed) ? random_seed : 0xA5A5A5A5) // a seed of 0 is replaced by a fixed default
+ {
+ find_optimal_parameters(predicted_inserted_element_count, false_positive_probability,
+ &salt_count_, &table_size_); // fills in the two members left out of the initializer list
+ init();
+ }
+
+ bloom_filter(const std::size_t& salt_count, // explicit geometry: caller supplies salt count and table size (bytes)
+ std::size_t table_size,
+ const std::size_t& random_seed,
+ std::size_t target_element_count)
+ : bit_table_(0),
+ salt_count_(salt_count),
+ table_size_(table_size),
+ insert_count_(0),
+ target_element_count_(target_element_count),
+ random_seed_((random_seed) ? random_seed : 0xA5A5A5A5) // a seed of 0 is replaced by a fixed default
+ {
+ init();
+ }
+
+  /// Generate the salts and allocate a zeroed bit table
+  /// (the table stays NULL when table_size_ is 0).
+  void init() {
+    generate_unique_salt();
+    if (!table_size_) {
+      bit_table_ = NULL;
+      return;
+    }
+    bit_table_ = new cell_type[table_size_];
+    std::fill_n(bit_table_, table_size_, 0x00);
+  }
+
+  /// Copy constructor; delegates the member copy to operator=.
+  bloom_filter(const bloom_filter& filter)
+    : bit_table_(0),  // operator= runs delete[] on bit_table_, so it must start out null
+      target_element_count_(filter.target_element_count_)
+  {
+    this->operator=(filter);
+  }
+
+  /**
+   * Copy assignment: deep-copies the bit table and every parameter.
+   *
+   * The replacement table is allocated before the old one is released,
+   * so a failed allocation leaves *this unchanged.  An empty source
+   * leaves bit_table_ NULL, matching init().
+   */
+  bloom_filter& operator = (const bloom_filter& filter)
+  {
+    if (this != &filter) {
+      cell_type* new_table = NULL;
+      if (filter.table_size_ && filter.bit_table_) {
+        new_table = new cell_type[filter.table_size_];
+        std::copy(filter.bit_table_,
+                  filter.bit_table_ + filter.table_size_,
+                  new_table);
+      }
+      delete[] bit_table_;
+      bit_table_ = new_table;
+
+      salt_count_ = filter.salt_count_;
+      table_size_ = filter.table_size_;
+      insert_count_ = filter.insert_count_;
+      // needed by approx_unique_element_count() on the copy
+      target_element_count_ = filter.target_element_count_;
+      random_seed_ = filter.random_seed_;
+      salt_ = filter.salt_;
+    }
+    return *this;
+  }
+
+ virtual ~bloom_filter()
+ {
+ delete[] bit_table_; // array form matches the new[] allocations; a null table is a no-op
+ }
+
+ inline bool operator!() const
+ {
+ return (0 == table_size_); // true when the filter has no bit table storage
+ }
+
+  /// Reset the insertion count and zero the bit table if one is allocated.
+  inline void clear()
+  {
+    insert_count_ = 0;
+    if (!bit_table_)
+      return;
+    std::fill_n(bit_table_, table_size_, 0x00);
+  }
+
+ /**
+ * insert a u32 into the set
+ *
+ * NOTE: the internal hash is weak enough that consecutive inputs do
+ * not achieve the desired fpp. Well-mixed values should be used
+ * here (e.g., put rjhash(x) into the filter instead of just x).
+ *
+ * @param val integer value to insert
+ */
+  inline void insert(uint32_t val) {
+    assert(bit_table_);
+    std::size_t pos = 0;      // absolute bit position within the table
+    std::size_t in_byte = 0;  // bit offset within the addressed byte
+    const std::size_t nsalts = salt_.size();
+    for (std::size_t s = 0; s < nsalts; ++s) {
+      // one hash, and therefore one bit, per salt
+      compute_indices(hash_ap(val, salt_[s]), pos, in_byte);
+      bit_table_[pos >> 3] |= bit_mask[in_byte];
+    }
+    ++insert_count_;
+  }
+
+  /// Insert a byte string: sets one bit per salt and bumps the insertion count.
+  inline void insert(const unsigned char* key_begin, const std::size_t& length)
+  {
+    assert(bit_table_);
+    std::size_t pos = 0;      // absolute bit position within the table
+    std::size_t in_byte = 0;  // bit offset within the addressed byte
+    const std::size_t nsalts = salt_.size();
+    for (std::size_t s = 0; s < nsalts; ++s) {
+      compute_indices(hash_ap(key_begin, length, salt_[s]), pos, in_byte);
+      bit_table_[pos >> 3] |= bit_mask[in_byte];
+    }
+    ++insert_count_;
+  }
+
+  /// Insert the raw object representation of t.
+  /// Note: T must be a C++ POD type.
+  template<typename T>
+  inline void insert(const T& t)
+  {
+    const unsigned char* raw = reinterpret_cast<const unsigned char*>(&t);
+    insert(raw, sizeof(T));
+  }
+
+  /// Insert the characters of key (key.size() bytes).
+  inline void insert(const std::string& key)
+  {
+    const unsigned char* raw = reinterpret_cast<const unsigned char*>(key.c_str());
+    insert(raw, key.size());
+  }
+
+  /// Insert `length` bytes starting at data.
+  inline void insert(const char* data, const std::size_t& length)
+  {
+    const unsigned char* raw = reinterpret_cast<const unsigned char*>(data);
+    insert(raw, length);
+  }
+
+  /// Insert every element of the range [begin, end).
+  template<typename InputIterator>
+  inline void insert(const InputIterator begin, const InputIterator end)
+  {
+    for (InputIterator cur = begin; end != cur; )
+      insert(*(cur++));
+  }
+
+ /**
+ * check if a u32 is contained by set
+ *
+ * NOTE: the internal hash is weak enough that consecutive inputs do
+ * not achieve the desired fpp. Well-mixed values should be used
+ * here (e.g., put rjhash(x) into the filter instead of just x).
+ *
+ * @param val integer value to query
+ * @returns true if value is (probably) in the set, false if it definitely is not
+ */
+  inline virtual bool contains(uint32_t val) const
+  {
+    if (!bit_table_)
+      return false;
+    std::size_t pos = 0;      // absolute bit position within the table
+    std::size_t in_byte = 0;  // bit offset within the addressed byte
+    const std::size_t nsalts = salt_.size();
+    for (std::size_t s = 0; s < nsalts; ++s) {
+      compute_indices(hash_ap(val, salt_[s]), pos, in_byte);
+      const unsigned char mask = bit_mask[in_byte];
+      // one clear bit is enough to say the value was never inserted
+      if ((bit_table_[pos >> 3] & mask) != mask)
+        return false;
+    }
+    return true;
+  }
+
+  /// True when every bit selected by the salts is set for this byte string;
+  /// always false when no bit table is allocated.
+  inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
+  {
+    if (!bit_table_)
+      return false;
+    std::size_t pos = 0;      // absolute bit position within the table
+    std::size_t in_byte = 0;  // bit offset within the addressed byte
+    const std::size_t nsalts = salt_.size();
+    for (std::size_t s = 0; s < nsalts; ++s) {
+      compute_indices(hash_ap(key_begin, length, salt_[s]), pos, in_byte);
+      const unsigned char mask = bit_mask[in_byte];
+      if ((bit_table_[pos >> 3] & mask) != mask)
+        return false;
+    }
+    return true;
+  }
+
+  /// Query using the raw object representation of t (sizeof(T) bytes).
+  template<typename T>
+  inline bool contains(const T& t) const
+  {
+    const unsigned char* raw = reinterpret_cast<const unsigned char*>(&t);
+    return contains(raw, static_cast<std::size_t>(sizeof(T)));
+  }
+
+  /// Query using the characters of key (key.size() bytes).
+  inline bool contains(const std::string& key) const
+  {
+    const unsigned char* raw = reinterpret_cast<const unsigned char*>(key.c_str());
+    return contains(raw, key.size());
+  }
+
+  /// Query using `length` bytes starting at data.
+  inline bool contains(const char* data, const std::size_t& length) const
+  {
+    const unsigned char* raw = reinterpret_cast<const unsigned char*>(data);
+    return contains(raw, length);
+  }
+
+  /// Returns an iterator to the first element of [begin, end) that is NOT
+  /// in the filter, or end when every element is (probably) present.
+  template<typename InputIterator>
+  inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
+  {
+    for (InputIterator cur = begin; end != cur; ++cur) {
+      if (!contains(*cur))
+        return cur;
+    }
+    return end;
+  }
+
+  /// Returns an iterator to the first element of [begin, end) that IS
+  /// (probably) in the filter, or end when none is.
+  template<typename InputIterator>
+  inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
+  {
+    for (InputIterator cur = begin; end != cur; ++cur) {
+      if (contains(*cur))
+        return cur;
+    }
+    return end;
+  }
+
+ inline virtual std::size_t size() const
+ {
+ return table_size_ * bits_per_char; // table size in bits (table_size_ is in bytes)
+ }
+
+ inline std::size_t element_count() const
+ {
+ return insert_count_; // number of insert() calls since construction or clear(); repeated keys count each time
+ }
+
+ /*
+ * density of bits set. inconvenient units, but:
+ * .3 = ~50% target insertions
+ * .5 = 100% target insertions, "perfectly full"
+ * .75 = 200% target insertions
+ * 1.0 = all bits set... infinite insertions
+ */
+  inline double density() const
+  {
+    if (!bit_table_)
+      return 0.0;
+    size_t ones = 0;
+    for (size_t i = 0; i < table_size_; ++i) {
+      // clear the lowest set bit until the byte is zero: one pass per set bit
+      for (uint8_t c = bit_table_[i]; c; c &= c - 1)
+        ++ones;
+    }
+    return (double)ones / (double)(table_size_ << 3);
+  }
+
+  /// Rough estimate of distinct insertions: target count scaled by twice
+  /// the bit density.
+  virtual inline double approx_unique_element_count() const {
+    // this is not a very good estimate; a better solution should have
+    // some asymptotic behavior as density() approaches 1.0.
+    const double doubled_target = (double)target_element_count_ * 2.0;
+    return doubled_target * density();
+  }
+
+ inline double effective_fpp() const
+ {
+ /*
+ Note:
+ The effective false positive probability is calculated using the
+ designated table size and hash function count in conjunction with
+ the current number of inserted elements - not the user defined
+ predicated/expected number of inserted elements.
+ */
+ return std::pow(1.0 - std::exp(-1.0 * salt_.size() * insert_count_ / size()), 1.0 * salt_.size()); // (1 - e^(-k*n/m))^k with k = salts, n = insertions, m = size() in bits; NOTE(review): divides by size(), which is 0 for an empty table
+ }
+
+  /// Intersection: AND the bit tables.  A no-op unless both filters share
+  /// salt count, table size and random seed.
+  inline bloom_filter& operator &= (const bloom_filter& filter)
+  {
+    const bool compatible =
+      (salt_count_ == filter.salt_count_) &&
+      (table_size_ == filter.table_size_) &&
+      (random_seed_ == filter.random_seed_);
+    if (!compatible)
+      return *this;
+
+    for (std::size_t i = 0; i < table_size_; ++i)
+      bit_table_[i] &= filter.bit_table_[i];
+    return *this;
+  }
+
+  /// Union: OR the bit tables.  A no-op unless both filters share
+  /// salt count, table size and random seed.
+  inline bloom_filter& operator |= (const bloom_filter& filter)
+  {
+    const bool compatible =
+      (salt_count_ == filter.salt_count_) &&
+      (table_size_ == filter.table_size_) &&
+      (random_seed_ == filter.random_seed_);
+    if (!compatible)
+      return *this;
+
+    for (std::size_t i = 0; i < table_size_; ++i)
+      bit_table_[i] |= filter.bit_table_[i];
+    return *this;
+  }
+
+  /// Difference: XOR the bit tables.  A no-op unless both filters share
+  /// salt count, table size and random seed.
+  inline bloom_filter& operator ^= (const bloom_filter& filter)
+  {
+    const bool compatible =
+      (salt_count_ == filter.salt_count_) &&
+      (table_size_ == filter.table_size_) &&
+      (random_seed_ == filter.random_seed_);
+    if (!compatible)
+      return *this;
+
+    for (std::size_t i = 0; i < table_size_; ++i)
+      bit_table_[i] ^= filter.bit_table_[i];
+    return *this;
+  }
+
+ inline const cell_type* table() const
+ {
+ return bit_table_; // read-only view of the raw table; NULL when no table is allocated
+ }
+
+protected:
+
+  /// Map a hash to a bit position in the table (bit_index) and to the bit
+  /// offset inside the addressed byte (bit).
+  inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
+  {
+    const std::size_t total_bits = table_size_ << 3;
+    bit_index = hash % total_bits;
+    bit = bit_index & 7;
+  }
+
+ void generate_unique_salt() // appends salt_count_ salts to salt_; callers clear salt_ first when regenerating
+ {
+ /*
+ Note:
+ A distinct hash function need not be implementation-wise
+ distinct. In the current implementation "seeding" a common
+ hash function with different values seems to be adequate.
+ */
+ const unsigned int predef_salt_count = 128;
+ static const bloom_type predef_salt[predef_salt_count] = {
+ 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
+ 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
+ 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
+ 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
+ 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
+ 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
+ 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
+ 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
+ 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
+ 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
+ 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
+ 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
+ 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
+ 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
+ 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
+ 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
+ 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
+ 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
+ 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
+ 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
+ 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
+ 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
+ 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
+ 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
+ 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
+ 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
+ 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
+ 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
+ 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
+ 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
+ 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
+ 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
+ };
+
+ if (salt_count_ <= predef_salt_count) // enough predefined salts: take the first salt_count_ and mix in the seed
+ {
+ std::copy(predef_salt,
+ predef_salt + salt_count_,
+ std::back_inserter(salt_));
+ for (unsigned int i = 0; i < salt_.size(); ++i)
+ {
+ /*
+ Note:
+ This is done to integrate the user defined random seed,
+ so as to allow for the generation of unique bloom filter
+ instances.
+ */
+ salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_; // in-place: entry (i + 3) % size may already have been rewritten when it is read
+ }
+ }
+ else // more salts requested than predefined: keep all 128 unmodified, then add rand()-derived ones
+ {
+ std::copy(predef_salt,predef_salt + predef_salt_count,
+ std::back_inserter(salt_));
+ srand(static_cast<unsigned int>(random_seed_)); // NOTE(review): reseeds the process-wide rand() state
+ while (salt_.size() < salt_count_)
+ {
+ bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
+ if (0 == current_salt)
+ continue;
+ if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt)) // skip values already present
+ {
+ salt_.push_back(current_salt);
+ }
+ }
+ }
+ }
+
+  /**
+   * Pick the hash count and table size for a target insert count and
+   * false positive probability.
+   *
+   * Tries k = 1..999 hash functions, computes the number of bits m each
+   * would need, and keeps the k with the smallest m.  The bit count is
+   * rounded up to a multiple of 8 and returned in bytes.
+   *
+   * @param target_insert_count expected number of insertions
+   * @param target_fpp desired false positive probability
+   * @param salt_count [out] chosen number of hash functions
+   * @param table_size [out] table size in bytes
+   */
+  static void find_optimal_parameters(std::size_t target_insert_count,
+                                      double target_fpp,
+                                      std::size_t *salt_count,
+                                      std::size_t *table_size)
+  {
+    double best_m = std::numeric_limits<double>::infinity();
+    double best_k = 0.0;
+
+    for (double k = 1.0; k < 1000.0; k += 1.0) {
+      const double numerator = (- k * target_insert_count);
+      const double denominator = std::log(1.0 - std::pow(target_fpp, 1.0 / k));
+      const double m = numerator / denominator;
+      if (m < best_m) {
+        best_m = m;
+        best_k = k;
+      }
+    }
+
+    *salt_count = static_cast<std::size_t>(best_k);
+
+    // round the bit count up to a whole number of bytes
+    size_t bits = static_cast<std::size_t>(best_m);
+    if ((bits & 7) != 0)
+      bits += bits_per_char - (bits & 7);
+    *table_size = bits >> 3;
+  }
+
+ // Mix the four bytes of a 32-bit value into a running hash.
+ //
+ // The bytes of val are consumed from the most significant (bits 24-31)
+ // down to the least significant (bits 0-7). Two mixing steps alternate:
+ // a multiply/xor step and a shifted-add/complement step.
+ //
+ // val  : value to fold in
+ // hash : running hash (passed by value; the updated value is returned)
+ inline bloom_type hash_ap(uint32_t val, bloom_type hash) const
+ {
+ // '*' binds tighter than '^': the byte is multiplied by (hash >> 3) first.
+ hash ^= (hash << 7) ^ ((val & 0xff000000) >> 24) * (hash >> 3);
+ hash ^= (~((hash << 11) + (((val & 0xff0000) >> 16) ^ (hash >> 5))));
+ hash ^= (hash << 7) ^ ((val & 0xff00) >> 8) * (hash >> 3);
+ hash ^= (~((hash << 11) + (((val & 0xff)) ^ (hash >> 5))));
+ return hash;
+ }
+
+ // Mix a byte range into a running hash.
+ //
+ // Input is consumed in groups of four bytes, then groups of two, and
+ // finally a single trailing byte if one remains. The same two mixing
+ // steps alternate as in the uint32_t overload.
+ //
+ // begin            : first byte of the input
+ // remaining_length : number of bytes to read starting at begin
+ // hash             : running hash (passed by value; the result is returned)
+ inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
+ {
+ const unsigned char* itr = begin;
+
+ // Unrolled main loop: four bytes per iteration.
+ while (remaining_length >= 4)
+ {
+ // '*' binds tighter than '^': the byte is multiplied by (hash >> 3) first.
+ hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
+ hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
+ hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
+ hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
+ remaining_length -= 4;
+ }
+
+ // At most three bytes are left here, so this runs at most once.
+ while (remaining_length >= 2)
+ {
+ hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
+ hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
+ remaining_length -= 2;
+ }
+
+ // Odd trailing byte: only the multiply/xor step is applied.
+ if (remaining_length)
+ {
+ hash ^= (hash << 7) ^ (*itr) * (hash >> 3);
+ }
+
+ return hash;
+ }
+
+public:
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::iterator& bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(std::list<bloom_filter*>& ls);
+};
+WRITE_CLASS_ENCODER(bloom_filter)
+
+// Binary '&': returns a copy of a after b has been applied with operator&=.
+// Neither operand is modified.
+inline bloom_filter operator & (const bloom_filter& a, const bloom_filter& b)
+{
+  bloom_filter combined(a);
+  combined &= b;
+  return combined;
+}
+
+// Binary '|': returns a copy of a after b has been applied with operator|=.
+// Neither operand is modified.
+inline bloom_filter operator | (const bloom_filter& a, const bloom_filter& b)
+{
+  bloom_filter combined(a);
+  combined |= b;
+  return combined;
+}
+
+// Binary '^': returns a copy of a after b has been applied with operator^=.
+// Neither operand is modified.
+inline bloom_filter operator ^ (const bloom_filter& a, const bloom_filter& b)
+{
+  bloom_filter combined(a);
+  combined ^= b;
+  return combined;
+}
+
+
+// A bloom_filter whose bit table can be shrunk in place with compress().
+//
+// size_list records every table size (in cells) the filter has had: the
+// parameterised constructors push the initial table_size_ and each
+// successful compress() appends the new, smaller size. compute_indices()
+// replays that history so that a hash maps to the same bit it would have
+// been folded onto by the compressions.
+class compressible_bloom_filter : public bloom_filter
+{
+public:
+
+ // NOTE(review): this leaves size_list empty, while size() and
+ // approx_unique_element_count() call size_list.back()/front().
+ // Presumably decode() populates size_list before use -- confirm.
+ compressible_bloom_filter() : bloom_filter() {}
+
+ // Forwards to the matching bloom_filter constructor and records the
+ // resulting table size as the first entry of size_list.
+ compressible_bloom_filter(const std::size_t& predicted_element_count,
+ const double& false_positive_probability,
+ const std::size_t& random_seed)
+ : bloom_filter(predicted_element_count, false_positive_probability, random_seed)
+ {
+ size_list.push_back(table_size_);
+ }
+
+ // Forwards to the matching bloom_filter constructor and records the
+ // resulting table size as the first entry of size_list.
+ compressible_bloom_filter(const std::size_t& salt_count,
+ std::size_t table_size,
+ const std::size_t& random_seed,
+ std::size_t target_count)
+ : bloom_filter(salt_count, table_size, random_seed, target_count)
+ {
+ size_list.push_back(table_size_);
+ }
+
+ // Most recent table size multiplied by bits_per_char.
+ inline virtual std::size_t size() const
+ {
+ return size_list.back() * bits_per_char;
+ }
+
+ // Shrink the bit table to roughly target_ratio of its current size.
+ //
+ // Returns false, leaving the filter untouched, when there is no bit
+ // table, when target_ratio is not strictly between 0 and 1, or when the
+ // computed size is zero or not smaller than the current size.
+ // Returns true after the table has been replaced.
+ inline bool compress(const double& target_ratio)
+ {
+ if (!bit_table_)
+ return false;
+
+ if ((0.0 >= target_ratio) || (target_ratio >= 1.0))
+ {
+ return false;
+ }
+
+ std::size_t original_table_size = size_list.back();
+ std::size_t new_table_size = static_cast<std::size_t>(size_list.back() * target_ratio);
+
+ if ((!new_table_size) || (new_table_size >= original_table_size))
+ {
+ return false;
+ }
+
+ // The first new_table_size cells are copied verbatim ...
+ cell_type* tmp = new cell_type[new_table_size];
+ std::copy(bit_table_, bit_table_ + (new_table_size), tmp);
+ cell_type* itr = bit_table_ + (new_table_size);
+ cell_type* end = bit_table_ + (original_table_size);
+ cell_type* itr_tmp = tmp;
+ cell_type* itr_end = tmp + (new_table_size);
+ // ... and every remaining cell is OR-ed onto the new table, wrapping
+ // back to its start whenever the end of the new table is reached.
+ while (end != itr)
+ {
+ *(itr_tmp++) |= (*itr++);
+ if (itr_tmp == itr_end)
+ itr_tmp = tmp;
+ }
+
+ delete[] bit_table_;
+ bit_table_ = tmp;
+ size_list.push_back(new_table_size);
+ table_size_ = new_table_size;
+
+ return true;
+ }
+
+ virtual inline double approx_unique_element_count() const {
+ // this is not a very good estimate; a better solution should have
+ // some asymptotic behavior as density() approaches 1.0.
+ //
+ // the compress() correction is also bad; it tends to under-estimate.
+ return (double)target_element_count_ * 2.0 * density() * (double)size_list.back() / (double)size_list.front();
+ }
+
+private:
+
+ // Map a hash to a bit position. The hash is reduced modulo each
+ // recorded table size in turn (size in cells << 3, i.e. times 8),
+ // oldest first; bit receives the low three bits of the result.
+ inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
+ {
+ bit_index = hash;
+ for (std::size_t i = 0; i < size_list.size(); ++i)
+ {
+ bit_index %= size_list[i] << 3;
+ }
+ bit = bit_index & 7;
+ }
+
+ // Table sizes in cells: initial size first, then one entry per compress().
+ std::vector<std::size_t> size_list;
+public:
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::iterator& bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(std::list<compressible_bloom_filter*>& ls);
+};
+WRITE_CLASS_ENCODER(compressible_bloom_filter)
+
+#endif
+
+
+/*
+ Note 1:
+ If it can be guaranteed that bits_per_char will be of the form 2^n then
+ the following optimization can be used:
+
+ hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)];
+
+ Note 2:
+ For performance reasons where possible when allocating memory it should
+ be aligned (aligned_alloc) according to the architecture being used.
+*/
diff --git a/src/common/buffer.cc b/src/common/buffer.cc
index 0424887139e..8da4c106d1b 100644
--- a/src/common/buffer.cc
+++ b/src/common/buffer.cc
@@ -990,11 +990,14 @@ void buffer::list::rebuild_page_aligned()
*/
char *buffer::list::c_str()
{
- if (_buffers.size() == 0)
+ if (_buffers.empty())
return 0; // no buffers
- if (_buffers.size() > 1)
+
+ std::list<ptr>::const_iterator iter = _buffers.begin();
+ iter++;
+
+ if (iter != _buffers.end())
rebuild();
- assert(_buffers.size() == 1);
return _buffers.front().c_str(); // good, we're already contiguous.
}
@@ -1267,6 +1270,15 @@ int buffer::list::write_fd(int fd) const
return 0;
}
+/*
+ * Compute a running CRC32C over the contents of this list.
+ *
+ * Each segment in _buffers is fed to ceph_crc32c() in order, with
+ * zero-length segments skipped.
+ *
+ * crc: initial value
+ * returns: crc after every non-empty segment has been folded in
+ */
+__u32 buffer::list::crc32c(__u32 crc) const
+{
+  std::list<ptr>::const_iterator seg = _buffers.begin();
+  while (seg != _buffers.end()) {
+    if (seg->length()) {
+      crc = ceph_crc32c(crc, (unsigned char*)seg->c_str(), seg->length());
+    }
+    ++seg;
+  }
+  return crc;
+}
void buffer::list::hexdump(std::ostream &out) const
{
diff --git a/src/common/ceph_argparse.cc b/src/common/ceph_argparse.cc
index 2950a81f89d..6c8053897f3 100644
--- a/src/common/ceph_argparse.cc
+++ b/src/common/ceph_argparse.cc
@@ -464,18 +464,19 @@ CephInitParameters ceph_argparse_early_args
static void generic_usage(bool is_server)
{
cout << "\
- --conf/-c Read configuration from the given configuration file\n\
- --id/-i set ID portion of my name\n\
- --name/-n set name (TYPE.ID)\n\
- --version show version and quit\n\
+ --conf/-c FILE read configuration from the given configuration file\n\
+ --id/-i ID set ID portion of my name\n\
+ --name/-n TYPE.ID set name\n\
+ --cluster NAME set cluster name (default: ceph)\n\
+ --version show version and quit\n\
" << std::endl;
if (is_server) {
cout << "\
- -d Run in foreground, log to stderr.\n\
- -f Run in foreground, log to usual location.\n";
- cout << " --debug_ms N\n";
- cout << " set message debug level (e.g. 1)\n";
+ -d run in foreground, log to stderr.\n\
+ -f run in foreground, log to usual location.\n";
+ cout << "\
+ --debug_ms N set message debug level (e.g. 1)\n";
}
}
diff --git a/src/include/ceph_frag.cc b/src/common/ceph_frag.cc
index ab6cf35c409..444b910c24e 100644
--- a/src/include/ceph_frag.cc
+++ b/src/common/ceph_frag.cc
@@ -1,7 +1,7 @@
/*
* Ceph 'frag' type
*/
-#include "types.h"
+#include "include/types.h"
int ceph_frag_compare(__u32 a, __u32 b)
{
diff --git a/src/include/ceph_fs.cc b/src/common/ceph_fs.cc
index 6e3c143361e..3172c577162 100644
--- a/src/include/ceph_fs.cc
+++ b/src/common/ceph_fs.cc
@@ -9,7 +9,7 @@
/*
* Some non-inline ceph helpers
*/
-#include "types.h"
+#include "include/types.h"
/*
* return true if @layout appears to be valid
diff --git a/src/include/ceph_hash.cc b/src/common/ceph_hash.cc
index d6081dfa97d..c581806d83d 100644
--- a/src/include/ceph_hash.cc
+++ b/src/common/ceph_hash.cc
@@ -1,5 +1,5 @@
-#include "types.h"
+#include "include/types.h"
/*
* Robert Jenkin's hash function.
diff --git a/src/common/ceph_json.cc b/src/common/ceph_json.cc
index 84355575c6c..a48e0636fcf 100644
--- a/src/common/ceph_json.cc
+++ b/src/common/ceph_json.cc
@@ -222,9 +222,7 @@ bool JSONParser::parse(const char *buf_, int len)
return false;
}
- string json_string = buf_;
- // make a substring to len
- json_string = json_string.substr(0, len);
+ string json_string(buf_, len);
success = read(json_string, data);
if (success)
handle_value(data);
diff --git a/src/include/ceph_strings.cc b/src/common/ceph_strings.cc
index d46eca6aaf8..47648ce19b3 100644
--- a/src/include/ceph_strings.cc
+++ b/src/common/ceph_strings.cc
@@ -1,7 +1,7 @@
/*
* Ceph string constants
*/
-#include "types.h"
+#include "include/types.h"
const char *ceph_entity_type_name(int type)
{
@@ -48,6 +48,11 @@ const char *ceph_osd_op_name(int op)
case CEPH_OSD_OP_TMAPPUT: return "tmapput";
case CEPH_OSD_OP_WATCH: return "watch";
+ case CEPH_OSD_OP_COPY_GET: return "copy-get";
+ case CEPH_OSD_OP_COPY_FROM: return "copy-from";
+ case CEPH_OSD_OP_UNDIRTY: return "undirty";
+ case CEPH_OSD_OP_ISDIRTY: return "isdirty";
+
case CEPH_OSD_OP_CLONERANGE: return "clonerange";
case CEPH_OSD_OP_ASSERT_SRC_VERSION: return "assert-src-version";
case CEPH_OSD_OP_SRC_CMPXATTR: return "src-cmpxattr";
diff --git a/src/common/code_environment.cc b/src/common/code_environment.cc
index 2cf19f48bc5..662fa36c9bd 100644
--- a/src/common/code_environment.cc
+++ b/src/common/code_environment.cc
@@ -11,6 +11,7 @@
* Foundation. See file COPYING.
*
*/
+#include "acconfig.h"
#include "common/code_environment.h"
@@ -19,7 +20,8 @@
#include <stdlib.h>
#include <string.h>
#include <string>
-#if defined(__linux__)
+
+#ifdef HAVE_SYS_PRCTL_H
#include <sys/prctl.h>
#endif
@@ -45,6 +47,8 @@ std::ostream &operator<<(std::ostream &oss, enum code_environment_t e)
return oss;
}
+#if defined(HAVE_SYS_PRCTL_H) && defined(PR_GET_NAME) /* Since 2.6.11 */
+
int get_process_name(char *buf, int len)
{
if (len <= 16) {
@@ -53,17 +57,19 @@ int get_process_name(char *buf, int len)
* null-terminated. */
return -ENAMETOOLONG;
}
-#if defined(__FreeBSD__)
-#warning XXX
- return -ENAMETOOLONG;
-#else
memset(buf, 0, len);
- int ret;
- ret = prctl(PR_GET_NAME, buf);
- return ret;
-#endif
+ return prctl(PR_GET_NAME, buf);
}
+#else
+
+/*
+ * Fallback used when <sys/prctl.h> or PR_GET_NAME is not available:
+ * the process name cannot be queried, so buf and len are ignored and
+ * -ENOSYS is returned.
+ */
+int get_process_name(char *buf, int len)
+{
+ return -ENOSYS;
+}
+
+#endif
+
std::string get_process_name_cpp()
{
char buf[32];
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 89405121698..700a210b412 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -91,7 +91,6 @@ SUBSYS(finisher, 1, 1)
SUBSYS(heartbeatmap, 1, 5)
SUBSYS(perfcounter, 1, 5)
SUBSYS(rgw, 1, 5) // log level for the Rados gateway
-SUBSYS(hadoop, 1, 5)
SUBSYS(javaclient, 1, 5)
SUBSYS(asok, 1, 5)
SUBSYS(throttle, 1, 1)
@@ -159,6 +158,8 @@ OPTION(mon_timecheck_interval, OPT_FLOAT, 300.0) // on leader, timecheck (clock
OPTION(mon_accept_timeout, OPT_FLOAT, 10.0) // on leader, if paxos update isn't accepted
OPTION(mon_pg_create_interval, OPT_FLOAT, 30.0) // no more than every 30s
OPTION(mon_pg_stuck_threshold, OPT_INT, 300) // number of seconds after which pgs can be considered inactive, unclean, or stale (see doc/control.rst under dump_stuck for more info)
+OPTION(mon_pg_warn_min_per_osd, OPT_INT, 20) // min # pgs per (in) osd before we warn the admin
+OPTION(mon_pg_warn_max_object_skew, OPT_FLOAT, 10.0) // max skew from average in objects per pg
OPTION(mon_osd_full_ratio, OPT_FLOAT, .95) // what % full makes an OSD "full"
OPTION(mon_osd_nearfull_ratio, OPT_FLOAT, .85) // what % full makes an OSD near full
OPTION(mon_globalid_prealloc, OPT_INT, 100) // how many globalids to prealloc
@@ -203,6 +204,7 @@ OPTION(mon_leveldb_max_open_files, OPT_INT, 0) // monitor's leveldb max open fil
OPTION(mon_leveldb_compression, OPT_BOOL, false) // monitor's leveldb uses compression
OPTION(mon_leveldb_paranoid, OPT_BOOL, false) // monitor's leveldb paranoid flag
OPTION(mon_leveldb_log, OPT_STR, "")
+OPTION(mon_leveldb_size_warn, OPT_U64, 40ULL*1024*1024*1024) // issue a warning when the monitor's leveldb goes over 40GB (in bytes); ULL keeps the product out of int arithmetic, where 40*2^30 overflows
OPTION(paxos_stash_full_interval, OPT_INT, 25) // how often (in commits) to stash a full copy of the PaxosService state
OPTION(paxos_max_join_drift, OPT_INT, 10) // max paxos iterations before we must first sync the monitor stores
OPTION(paxos_propose_interval, OPT_DOUBLE, 1.0) // gather updates for this long before proposing a map update
@@ -360,12 +362,6 @@ OPTION(mds_standby_replay, OPT_BOOL, false)
// If true, compact leveldb store on mount
OPTION(osd_compact_leveldb_on_mount, OPT_BOOL, false)
-// If true, uses tmap as initial value for omap on old objects
-OPTION(osd_auto_upgrade_tmap, OPT_BOOL, true)
-
-// If true, TMAPPUT sets uses_tmap DEBUGGING ONLY
-OPTION(osd_tmapput_sets_uses_tmap, OPT_BOOL, false)
-
// Maximum number of backfills to or from a single osd
OPTION(osd_max_backfills, OPT_U64, 10)
@@ -443,6 +439,7 @@ OPTION(osd_recovery_delay_start, OPT_FLOAT, 0)
OPTION(osd_recovery_max_active, OPT_INT, 15)
OPTION(osd_recovery_max_single_start, OPT_INT, 5)
OPTION(osd_recovery_max_chunk, OPT_U64, 8<<20) // max size of push chunk
+OPTION(osd_copyfrom_max_chunk, OPT_U64, 8<<20) // max size of a COPYFROM chunk
OPTION(osd_push_per_object_cost, OPT_U64, 1000) // push cost per object
OPTION(osd_max_push_cost, OPT_U64, 8<<20) // max size of push message
OPTION(osd_max_push_objects, OPT_U64, 10) // max objects in single push op
@@ -519,11 +516,12 @@ OPTION(osd_recovery_op_warn_multiple, OPT_U32, 16)
OPTION(osd_mon_shutdown_timeout, OPT_DOUBLE, 5)
OPTION(osd_max_object_size, OPT_U64, 100*1024L*1024L*1024L) // OSD's maximum object size
-OPTION(osd_max_attr_size, OPT_U64, 65536)
+OPTION(osd_max_attr_size, OPT_U64, 0)
OPTION(filestore, OPT_BOOL, false)
/// filestore wb throttle limits
+OPTION(filestore_wbthrottle_enable, OPT_BOOL, true)
OPTION(filestore_wbthrottle_btrfs_bytes_start_flusher, OPT_U64, 41943040)
OPTION(filestore_wbthrottle_btrfs_bytes_hard_limit, OPT_U64, 419430400)
OPTION(filestore_wbthrottle_btrfs_ios_start_flusher, OPT_U64, 500)
@@ -546,12 +544,22 @@ OPTION(filestore_index_retry_probability, OPT_DOUBLE, 0)
OPTION(filestore_debug_inject_read_err, OPT_BOOL, false)
OPTION(filestore_debug_omap_check, OPT_BOOL, 0) // Expensive debugging check on sync
+
// Use omap for xattrs for attrs over
-OPTION(filestore_xattr_use_omap, OPT_BOOL, false)
// filestore_max_inline_xattr_size or
-OPTION(filestore_max_inline_xattr_size, OPT_U32, 512)
+OPTION(filestore_max_inline_xattr_size, OPT_U32, 0) //Override
+OPTION(filestore_max_inline_xattr_size_xfs, OPT_U32, 65536)
+OPTION(filestore_max_inline_xattr_size_btrfs, OPT_U32, 2048)
+OPTION(filestore_max_inline_xattr_size_other, OPT_U32, 512)
+
// for more than filestore_max_inline_xattrs attrs
-OPTION(filestore_max_inline_xattrs, OPT_U32, 2)
+OPTION(filestore_max_inline_xattrs, OPT_U32, 0) //Override
+OPTION(filestore_max_inline_xattrs_xfs, OPT_U32, 10)
+OPTION(filestore_max_inline_xattrs_btrfs, OPT_U32, 10)
+OPTION(filestore_max_inline_xattrs_other, OPT_U32, 2)
+
+OPTION(filestore_sloppy_crc, OPT_BOOL, false) // track sloppy crcs
+OPTION(filestore_sloppy_crc_block_size, OPT_INT, 65536)
OPTION(filestore_max_sync_interval, OPT_DOUBLE, 5) // seconds
OPTION(filestore_min_sync_interval, OPT_DOUBLE, .01) // seconds
@@ -654,6 +662,8 @@ OPTION(rgw_keystone_admin_token, OPT_STR, "") // keystone admin token (shared s
OPTION(rgw_keystone_accepted_roles, OPT_STR, "Member, admin") // roles required to serve requests
OPTION(rgw_keystone_token_cache_size, OPT_INT, 10000) // max number of entries in keystone token cache
OPTION(rgw_keystone_revocation_interval, OPT_INT, 15 * 60) // seconds between tokens revocation check
+OPTION(rgw_s3_auth_use_rados, OPT_BOOL, true) // should we try to use the internal credentials for s3?
+OPTION(rgw_s3_auth_use_keystone, OPT_BOOL, false) // should we try to use keystone for s3?
OPTION(rgw_admin_entry, OPT_STR, "admin") // entry point for which a url is considered an admin request
OPTION(rgw_enforce_swift_acls, OPT_BOOL, true)
OPTION(rgw_swift_token_expiration, OPT_INT, 24 * 3600) // time in seconds for swift token expiration
@@ -711,6 +721,10 @@ OPTION(rgw_data_log_num_shards, OPT_INT, 128) // number of objects to keep data
OPTION(rgw_data_log_obj_prefix, OPT_STR, "data_log") //
OPTION(rgw_replica_log_obj_prefix, OPT_STR, "replica_log") //
+OPTION(rgw_bucket_quota_ttl, OPT_INT, 600) // time for cached bucket stats to be cached within rgw instance
+OPTION(rgw_bucket_quota_soft_threshold, OPT_DOUBLE, 0.95) // threshold from which we don't rely on cached info for quota decisions
+OPTION(rgw_bucket_quota_cache_size, OPT_INT, 10000) // number of entries in bucket quota cache
+
OPTION(mutex_perf_counter, OPT_BOOL, false) // enable/disable mutex perf counter
// This will be set to true when it is safe to start threads.
diff --git a/src/common/crc32c_intel_baseline.c b/src/common/crc32c_intel_baseline.c
index cfcfec624ae..3a92c77b63c 100644
--- a/src/common/crc32c_intel_baseline.c
+++ b/src/common/crc32c_intel_baseline.c
@@ -34,7 +34,8 @@
* OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#include <inttypes.h>
+#include "include/int_types.h"
+
#include <stdlib.h>
#define MAX_ITER 8
diff --git a/src/common/crc32c_intel_baseline.h b/src/common/crc32c_intel_baseline.h
index 5b14ddfc07e..e463575e28c 100644
--- a/src/common/crc32c_intel_baseline.h
+++ b/src/common/crc32c_intel_baseline.h
@@ -1,6 +1,8 @@
#ifndef CEPH_COMMON_CRC32C_INTEL_BASELINE_H
#define CEPH_COMMON_CRC32C_INTEL_BASELINE_H
+#include "include/int_types.h"
+
#ifdef __cplusplus
extern "C" {
#endif
diff --git a/src/common/crc32c_intel_fast.c b/src/common/crc32c_intel_fast.c
index 10b3c1c5c27..42338a7bcd4 100644
--- a/src/common/crc32c_intel_fast.c
+++ b/src/common/crc32c_intel_fast.c
@@ -1,13 +1,29 @@
-#include <inttypes.h>
#include "acconfig.h"
+#include "include/int_types.h"
+#include "common/crc32c_intel_baseline.h"
extern unsigned int crc32_iscsi_00(unsigned char const *buffer, int len, unsigned int crc);
-#ifdef WITH_GOOD_YASM_ELF64
+#ifdef HAVE_GOOD_YASM_ELF64
uint32_t ceph_crc32c_intel_fast(uint32_t crc, unsigned char const *buffer, unsigned len)
{
- return crc32_iscsi_00(buffer, len, crc);
+ uint32_t v;
+ unsigned left;
+
+ /*
+ * the crc32_iscsi_00 method reads past buffer+len (because it
+ * reads full words) which makes valgrind unhappy. don't do
+ * that.
+ */
+ if (len < 16)
+ return ceph_crc32c_intel_baseline(crc, buffer, len);
+ left = ((unsigned long)buffer + len) & 7;
+ len -= left;
+ v = crc32_iscsi_00(buffer, len, crc);
+ if (left)
+ v = ceph_crc32c_intel_baseline(v, buffer + len, left);
+ return v;
}
int ceph_crc32c_intel_fast_exists(void)
diff --git a/src/common/crc32c_intel_fast.h b/src/common/crc32c_intel_fast.h
index 7a394a0b82c..26a444f6061 100644
--- a/src/common/crc32c_intel_fast.h
+++ b/src/common/crc32c_intel_fast.h
@@ -8,7 +8,7 @@ extern "C" {
/* is the fast version compiled in */
extern int ceph_crc32c_intel_fast_exists(void);
-#ifdef __LP64__
+#ifdef __x86_64__
extern uint32_t ceph_crc32c_intel_fast(uint32_t crc, unsigned char const *buffer, unsigned len);
diff --git a/src/os/hobject.cc b/src/common/hobject.cc
index d6273693c62..b68baedd524 100644
--- a/src/os/hobject.cc
+++ b/src/common/hobject.cc
@@ -191,3 +191,90 @@ ostream& operator<<(ostream& out, const hobject_t& o)
out << "/" << o.nspace << "/" << o.pool;
return out;
}
+
+// This is compatible with decode for hobject_t prior to
+// version 5.
+//
+// The hobject_t fields are written first, in the order decode() reads
+// them; generation and shard_id are appended last.
+// The arguments to ENCODE_START are presumably (version, compat version,
+// buffer) -- see the macro definition.
+void ghobject_t::encode(bufferlist& bl) const
+{
+ ENCODE_START(5, 3, bl);
+ ::encode(hobj.key, bl);
+ ::encode(hobj.oid, bl);
+ ::encode(hobj.snap, bl);
+ ::encode(hobj.hash, bl);
+ ::encode(hobj.max, bl);
+ ::encode(hobj.nspace, bl);
+ ::encode(hobj.pool, bl);
+ // ghobject_t-specific fields; decode() reads them only for struct_v >= 5.
+ ::encode(generation, bl);
+ ::encode(shard_id, bl);
+ ENCODE_FINISH(bl);
+}
+
+// Decode a ghobject_t, accepting older encodings.
+//
+// Fields are read in the order encode() writes them. Fields guarded by a
+// struct_v check are read only when the encoded version is at least that
+// value; struct_v is presumably declared by the DECODE_START macro.
+void ghobject_t::decode(bufferlist::iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl);
+ if (struct_v >= 1)
+ ::decode(hobj.key, bl);
+ ::decode(hobj.oid, bl);
+ ::decode(hobj.snap, bl);
+ ::decode(hobj.hash, bl);
+ if (struct_v >= 2)
+ ::decode(hobj.max, bl);
+ else
+ hobj.max = false;
+ // nspace and pool are left untouched for struct_v < 4.
+ if (struct_v >= 4) {
+ ::decode(hobj.nspace, bl);
+ ::decode(hobj.pool, bl);
+ }
+ // Encodings older than version 5 carry no generation or shard.
+ if (struct_v >= 5) {
+ ::decode(generation, bl);
+ ::decode(shard_id, bl);
+ } else {
+ generation = ghobject_t::NO_GEN;
+ shard_id = ghobject_t::NO_SHARD;
+ }
+ DECODE_FINISH(bl);
+}
+
+// Dump the wrapped hobject_t, followed by "generation" and "shard_id"
+// when a generation is set. Both fields are omitted when generation is
+// NO_GEN, whatever shard_id holds.
+void ghobject_t::dump(Formatter *f) const
+{
+  hobj.dump(f);
+
+  if (generation == NO_GEN)
+    return;
+
+  f->dump_int("generation", generation);
+  f->dump_int("shard_id", shard_id);
+}
+
+// Append heap-allocated sample objects to o.
+//
+// The samples are: a default object; a default object with hobj.max set;
+// one built from an hobject_t alone (so NO_GEN / NO_SHARD); and several
+// built from the same hobject_t with different (generation, shard) pairs.
+void ghobject_t::generate_test_instances(list<ghobject_t*>& o)
+{
+ o.push_back(new ghobject_t);
+ o.push_back(new ghobject_t);
+ o.back()->hobj.max = true;
+ o.push_back(new ghobject_t(hobject_t(object_t("oname"), string(), 1, 234, -1, "")));
+
+ // Same hobject_t, generation 1, shards 0..2.
+ o.push_back(new ghobject_t(hobject_t(object_t("oname2"), string("okey"), CEPH_NOSNAP,
+ 67, 0, "n1"), 1, 0));
+ o.push_back(new ghobject_t(hobject_t(object_t("oname2"), string("okey"), CEPH_NOSNAP,
+ 67, 0, "n1"), 1, 1));
+ o.push_back(new ghobject_t(hobject_t(object_t("oname2"), string("okey"), CEPH_NOSNAP,
+ 67, 0, "n1"), 1, 2));
+ // Same hobject_t, generations 1..3 on shard 0, then generation 3 on shards 1 and 2.
+ o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
+ CEPH_SNAPDIR, 910, 1, "n2"), 1, 0));
+ o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
+ CEPH_SNAPDIR, 910, 1, "n2"), 2, 0));
+ o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
+ CEPH_SNAPDIR, 910, 1, "n2"), 3, 0));
+ o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
+ CEPH_SNAPDIR, 910, 1, "n2"), 3, 1));
+ o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
+ CEPH_SNAPDIR, 910, 1, "n2"), 3, 2));
+}
+
+// Print a ghobject_t as its hobject_t, followed by "/<generation>/<shard>"
+// when a generation is set.
+//
+// shard_id is a uint8_t; streaming it directly would select the character
+// overload of operator<< and emit a raw byte (shard 0 would print NUL), so
+// it is widened to unsigned to print as a number.
+ostream& operator<<(ostream& out, const ghobject_t& o)
+{
+  out << o.hobj;
+  if (o.generation != ghobject_t::NO_GEN) {
+    // An object with a generation is expected to carry a shard as well.
+    assert(o.shard_id != ghobject_t::NO_SHARD);
+    out << "/" << o.generation << "/" << static_cast<unsigned>(o.shard_id);
+  }
+  return out;
+}
diff --git a/src/os/hobject.h b/src/common/hobject.h
index 633e471dffc..a769ad060d9 100644
--- a/src/os/hobject.h
+++ b/src/common/hobject.h
@@ -79,6 +79,30 @@ public:
return ret;
}
+ /// @return a copy of this hobject_t with snap set to CEPH_NOSNAP (the head)
+ hobject_t get_head() const {
+   hobject_t head(*this);
+   head.snap = CEPH_NOSNAP;
+   return head;
+ }
+
+ /// @return a copy of this hobject_t with snap set to CEPH_SNAPDIR (the snapdir)
+ hobject_t get_snapdir() const {
+   hobject_t snapdir(*this);
+   snapdir.snap = CEPH_SNAPDIR;
+   return snapdir;
+ }
+
+ /// @return true when snap is neither CEPH_NOSNAP (head) nor CEPH_SNAPDIR
+ bool is_snap() const {
+   if (snap != CEPH_NOSNAP)
+     return snap != CEPH_SNAPDIR;
+   return false;
+ }
+
+ /// @return true iff the object should have a snapset in its attrs,
+ /// i.e. whenever is_snap() is false
+ bool has_snapset() const {
+   const bool snap_object = is_snap();
+   return !snap_object;
+ }
+
/* Do not use when a particular hash function is needed */
explicit hobject_t(const sobject_t &o) :
oid(o.oid), snap(o.snap), max(false), pool(-1) {
@@ -138,7 +162,7 @@ public:
(*this) = temp;
}
- string get_namespace() const {
+ const string &get_namespace() const {
return nspace;
}
@@ -153,6 +177,7 @@ public:
friend bool operator>=(const hobject_t&, const hobject_t&);
friend bool operator==(const hobject_t&, const hobject_t&);
friend bool operator!=(const hobject_t&, const hobject_t&);
+ friend class ghobject_t;
};
WRITE_CLASS_ENCODER(hobject_t)
@@ -179,4 +204,102 @@ WRITE_CMP_OPERATORS_7(hobject_t,
oid,
snap)
+typedef uint64_t gen_t;
+typedef uint8_t shard_t;
+
+#ifndef UINT8_MAX
+#define UINT8_MAX (255)
+#endif
+#ifndef UINT64_MAX
+#define UINT64_MAX (18446744073709551615ULL)
+#endif
+
+// An hobject_t extended with a generation and a shard id.
+//
+// NO_GEN and NO_SHARD are the "unset" sentinels (the maximum values of
+// their types); an object carrying both is "degenerate", see
+// is_degenerate().
+struct ghobject_t {
+ hobject_t hobj;
+ gen_t generation;
+ shard_t shard_id;
+
+public:
+ static const shard_t NO_SHARD = UINT8_MAX;
+ static const gen_t NO_GEN = UINT64_MAX;
+
+ // Default hobject_t with no generation and no shard.
+ ghobject_t() : generation(NO_GEN), shard_id(NO_SHARD) {}
+
+ // Wrap an hobject_t with no generation and no shard. Not explicit, so
+ // an hobject_t converts implicitly.
+ ghobject_t(const hobject_t &obj) : hobj(obj), generation(NO_GEN), shard_id(NO_SHARD) {}
+
+ ghobject_t(const hobject_t &obj, gen_t gen, shard_t shard) : hobj(obj), generation(gen), shard_id(shard) {}
+
+ // Forwards to hobj.match_hash() using this object's own hash.
+ bool match(uint32_t bits, uint32_t match) const {
+ return hobj.match_hash(hobj.hash, bits, match);
+ }
+ /// @return min ghobject_t ret s.t. ret.hash == this->hash
+ ghobject_t get_boundary() const {
+ if (hobj.is_max())
+ return *this;
+ // Default-constructed object with only the hash copied over.
+ ghobject_t ret;
+ ret.hobj.hash = hobj.hash;
+ return ret;
+ }
+ filestore_hobject_key_t get_filestore_key_u32() const {
+ return hobj.get_filestore_key_u32();
+ }
+ filestore_hobject_key_t get_filestore_key() const {
+ return hobj.get_filestore_key();
+ }
+
+ // True when neither a generation nor a shard is set.
+ bool is_degenerate() const {
+ return generation == NO_GEN && shard_id == NO_SHARD;
+ }
+
+ // maximum sorted value.
+ static ghobject_t get_max() {
+ ghobject_t h(hobject_t::get_max());
+ return h;
+ }
+ bool is_max() const {
+ return hobj.is_max();
+ }
+
+ // Exchange contents with o via a temporary copy.
+ void swap(ghobject_t &o) {
+ ghobject_t temp(o);
+ o = (*this);
+ (*this) = temp;
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::iterator& bl);
+ void decode(json_spirit::Value& v);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<ghobject_t*>& o);
+ friend bool operator<(const ghobject_t&, const ghobject_t&);
+ friend bool operator>(const ghobject_t&, const ghobject_t&);
+ friend bool operator<=(const ghobject_t&, const ghobject_t&);
+ friend bool operator>=(const ghobject_t&, const ghobject_t&);
+ friend bool operator==(const ghobject_t&, const ghobject_t&);
+ friend bool operator!=(const ghobject_t&, const ghobject_t&);
+};
+WRITE_CLASS_ENCODER(ghobject_t)
+
+// Hash functor specialization so ghobject_t can be used as a key in the
+// __gnu_cxx hash containers.
+namespace __gnu_cxx {
+ template<> struct hash<ghobject_t> {
+ // Combines the hashes of hobj.oid and hobj.snap only; generation and
+ // shard_id do not contribute, so objects differing only in those
+ // fields hash to the same value.
+ size_t operator()(const ghobject_t &r) const {
+ static hash<object_t> H;
+ static rjhash<uint64_t> I;
+ return H(r.hobj.oid) ^ I(r.hobj.snap);
+ }
+ };
+}
+
+ostream& operator<<(ostream& out, const ghobject_t& o);
+
+WRITE_EQ_OPERATORS_3(ghobject_t, hobj, shard_id, generation)
+// sort ghobject_t's by <hobj, shard_id, generation>
+//
+// Two objects which differ by generation are more related than
+// two objects of the same generation which differ by shard.
+//
+WRITE_CMP_OPERATORS_3(ghobject_t,
+ hobj,
+ shard_id,
+ generation)
#endif
diff --git a/src/common/lru_map.h b/src/common/lru_map.h
index fb637478884..1e1acc95f76 100644
--- a/src/common/lru_map.h
+++ b/src/common/lru_map.h
@@ -13,82 +13,123 @@ class lru_map {
typename std::list<K>::iterator lru_iter;
};
- std::map<K, entry> tokens;
- std::list<K> tokens_lru;
+ std::map<K, entry> entries;
+ std::list<K> entries_lru;
Mutex lock;
size_t max;
public:
+ class UpdateContext {
+ public:
+ virtual ~UpdateContext() {}
+
+ /* update should return true if object is updated */
+ virtual bool update(V *v) = 0;
+ };
+
+ bool _find(const K& key, V *value, UpdateContext *ctx);
+ void _add(const K& key, V& value);
+
+public:
lru_map(int _max) : lock("lru_map"), max(_max) {}
virtual ~lru_map() {}
bool find(const K& key, V& value);
+
+ /*
+ * find_and_update()
+ *
+ * - will return true if object is found
+ * - if ctx is set will return true if object is found and updated
+ */
+ bool find_and_update(const K& key, V *value, UpdateContext *ctx);
void add(const K& key, V& value);
void erase(const K& key);
};
template <class K, class V>
-bool lru_map<K, V>::find(const K& key, V& value)
+bool lru_map<K, V>::_find(const K& key, V *value, UpdateContext *ctx)
{
- lock.Lock();
- typename std::map<K, entry>::iterator iter = tokens.find(key);
- if (iter == tokens.end()) {
- lock.Unlock();
+ typename std::map<K, entry>::iterator iter = entries.find(key);
+ if (iter == entries.end()) {
return false;
}
entry& e = iter->second;
- tokens_lru.erase(e.lru_iter);
+ entries_lru.erase(e.lru_iter);
- value = e.value;
+ bool r = true;
- tokens_lru.push_front(key);
- e.lru_iter = tokens_lru.begin();
+ if (ctx)
+ r = ctx->update(&e.value);
- lock.Unlock();
+ if (value)
+ *value = e.value;
- return true;
+ entries_lru.push_front(key);
+ e.lru_iter = entries_lru.begin();
+
+ return r;
}
template <class K, class V>
-void lru_map<K, V>::add(const K& key, V& value)
+bool lru_map<K, V>::find(const K& key, V& value)
{
- lock.Lock();
- typename std::map<K, entry>::iterator iter = tokens.find(key);
- if (iter != tokens.end()) {
+ Mutex::Locker l(lock);
+ return _find(key, &value, NULL);
+}
+
+template <class K, class V>
+bool lru_map<K, V>::find_and_update(const K& key, V *value, UpdateContext *ctx)
+{
+ Mutex::Locker l(lock);
+ return _find(key, value, ctx);
+}
+
+template <class K, class V>
+void lru_map<K, V>::_add(const K& key, V& value)
+{
+ typename std::map<K, entry>::iterator iter = entries.find(key);
+ if (iter != entries.end()) {
entry& e = iter->second;
- tokens_lru.erase(e.lru_iter);
+ entries_lru.erase(e.lru_iter);
}
- tokens_lru.push_front(key);
- entry& e = tokens[key];
+ entries_lru.push_front(key);
+ entry& e = entries[key];
e.value = value;
- e.lru_iter = tokens_lru.begin();
-
- while (tokens_lru.size() > max) {
- typename std::list<K>::reverse_iterator riter = tokens_lru.rbegin();
- iter = tokens.find(*riter);
- // assert(iter != tokens.end());
- tokens.erase(iter);
- tokens_lru.pop_back();
+ e.lru_iter = entries_lru.begin();
+
+ while (entries.size() > max) {
+ typename std::list<K>::reverse_iterator riter = entries_lru.rbegin();
+ iter = entries.find(*riter);
+ // assert(iter != entries.end());
+ entries.erase(iter);
+ entries_lru.pop_back();
}
-
- lock.Unlock();
+}
+
+
+template <class K, class V>
+void lru_map<K, V>::add(const K& key, V& value)
+{
+ Mutex::Locker l(lock);
+ _add(key, value);
}
template <class K, class V>
void lru_map<K, V>::erase(const K& key)
{
Mutex::Locker l(lock);
- typename std::map<K, entry>::iterator iter = tokens.find(key);
- if (iter == tokens.end())
+ typename std::map<K, entry>::iterator iter = entries.find(key);
+ if (iter == entries.end())
return;
entry& e = iter->second;
- tokens_lru.erase(e.lru_iter);
- tokens.erase(iter);
+ entries_lru.erase(e.lru_iter);
+ entries.erase(iter);
}
#endif
diff --git a/src/common/obj_bencher.cc b/src/common/obj_bencher.cc
index 6490b4f5932..599cbbc62fb 100644
--- a/src/common/obj_bencher.cc
+++ b/src/common/obj_bencher.cc
@@ -79,7 +79,7 @@ ostream& ObjBencher::out(ostream& os, utime_t& t)
ostream& ObjBencher::out(ostream& os)
{
- utime_t cur_time = ceph_clock_now(g_ceph_context);
+ utime_t cur_time = ceph_clock_now(cct);
return out(os, cur_time);
}
@@ -95,7 +95,7 @@ void *ObjBencher::status_printer(void *_bencher) {
ONE_SECOND.set_from_double(1.0);
bencher->lock.Lock();
while(!data.done) {
- utime_t cur_time = ceph_clock_now(g_ceph_context);
+ utime_t cur_time = ceph_clock_now(bencher->cct);
if (i % 20 == 0) {
if (i > 0)
@@ -158,7 +158,7 @@ void *ObjBencher::status_printer(void *_bencher) {
}
++i;
++cycleSinceChange;
- cond.WaitInterval(g_ceph_context, bencher->lock, ONE_SECOND);
+ cond.WaitInterval(bencher->cct, bencher->lock, ONE_SECOND);
}
bencher->lock.Unlock();
return NULL;
@@ -339,10 +339,10 @@ int ObjBencher::write_bench(int secondsToRun, int maxObjectsToCreate,
pthread_create(&print_thread, NULL, ObjBencher::status_printer, (void *)this);
lock.Lock();
- data.start_time = ceph_clock_now(g_ceph_context);
+ data.start_time = ceph_clock_now(cct);
lock.Unlock();
for (int i = 0; i<concurrentios; ++i) {
- start_times[i] = ceph_clock_now(g_ceph_context);
+ start_times[i] = ceph_clock_now(cct);
r = create_completion(i, _aio_cb, (void *)&lc);
if (r < 0)
goto ERR;
@@ -365,7 +365,7 @@ int ObjBencher::write_bench(int secondsToRun, int maxObjectsToCreate,
stopTime = data.start_time + runtime;
slot = 0;
lock.Lock();
- while( ceph_clock_now(g_ceph_context) < stopTime &&
+ while( ceph_clock_now(cct) < stopTime &&
(!maxObjectsToCreate || data.started < maxObjectsToCreate)) {
bool found = false;
while (1) {
@@ -397,7 +397,7 @@ int ObjBencher::write_bench(int secondsToRun, int maxObjectsToCreate,
lock.Unlock();
goto ERR;
}
- data.cur_latency = ceph_clock_now(g_ceph_context) - start_times[slot];
+ data.cur_latency = ceph_clock_now(cct) - start_times[slot];
data.history.latency.push_back(data.cur_latency);
total_latency += data.cur_latency;
if( data.cur_latency > data.max_latency) data.max_latency = data.cur_latency;
@@ -407,11 +407,11 @@ int ObjBencher::write_bench(int secondsToRun, int maxObjectsToCreate,
--data.in_flight;
lock.Unlock();
release_completion(slot);
- timePassed = ceph_clock_now(g_ceph_context) - data.start_time;
+ timePassed = ceph_clock_now(cct) - data.start_time;
//write new stuff to backend, then delete old stuff
//and save locations of new stuff for later deletion
- start_times[slot] = ceph_clock_now(g_ceph_context);
+ start_times[slot] = ceph_clock_now(cct);
r = create_completion(slot, _aio_cb, &lc);
if (r < 0)
goto ERR;
@@ -438,7 +438,7 @@ int ObjBencher::write_bench(int secondsToRun, int maxObjectsToCreate,
lock.Unlock();
goto ERR;
}
- data.cur_latency = ceph_clock_now(g_ceph_context) - start_times[slot];
+ data.cur_latency = ceph_clock_now(cct) - start_times[slot];
data.history.latency.push_back(data.cur_latency);
total_latency += data.cur_latency;
if (data.cur_latency > data.max_latency) data.max_latency = data.cur_latency;
@@ -451,7 +451,7 @@ int ObjBencher::write_bench(int secondsToRun, int maxObjectsToCreate,
delete contents[slot];
}
- timePassed = ceph_clock_now(g_ceph_context) - data.start_time;
+ timePassed = ceph_clock_now(cct) - data.start_time;
lock.Lock();
data.done = true;
lock.Unlock();
@@ -529,7 +529,7 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre
lock.Lock();
data.finished = 0;
- data.start_time = ceph_clock_now(g_ceph_context);
+ data.start_time = ceph_clock_now(cct);
lock.Unlock();
pthread_t print_thread;
@@ -539,7 +539,7 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre
//start initial reads
for (int i = 0; i < concurrentios; ++i) {
index[i] = i;
- start_times[i] = ceph_clock_now(g_ceph_context);
+ start_times[i] = ceph_clock_now(cct);
create_completion(i, _aio_cb, (void *)&lc);
r = aio_read(name[i], i, contents[i], data.object_size);
if (r < 0) { //naughty, doesn't clean up heap -- oh, or handle the print thread!
@@ -557,7 +557,7 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre
bufferlist *cur_contents;
slot = 0;
- while (seconds_to_run && (ceph_clock_now(g_ceph_context) < finish_time) &&
+ while (seconds_to_run && (ceph_clock_now(cct) < finish_time) &&
num_objects > data.started) {
lock.Lock();
int old_slot = slot;
@@ -590,7 +590,7 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre
lock.Unlock();
goto ERR;
}
- data.cur_latency = ceph_clock_now(g_ceph_context) - start_times[slot];
+ data.cur_latency = ceph_clock_now(cct) - start_times[slot];
total_latency += data.cur_latency;
if( data.cur_latency > data.max_latency) data.max_latency = data.cur_latency;
if (data.cur_latency < data.min_latency) data.min_latency = data.cur_latency;
@@ -602,7 +602,7 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre
cur_contents = contents[slot];
//start new read and check data if requested
- start_times[slot] = ceph_clock_now(g_ceph_context);
+ start_times[slot] = ceph_clock_now(cct);
contents[slot] = new bufferlist();
create_completion(slot, _aio_cb, (void *)&lc);
r = aio_read(newName, slot, contents[slot], data.object_size);
@@ -633,7 +633,7 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre
lock.Unlock();
goto ERR;
}
- data.cur_latency = ceph_clock_now(g_ceph_context) - start_times[slot];
+ data.cur_latency = ceph_clock_now(cct) - start_times[slot];
total_latency += data.cur_latency;
if (data.cur_latency > data.max_latency) data.max_latency = data.cur_latency;
if (data.cur_latency < data.min_latency) data.min_latency = data.cur_latency;
@@ -650,7 +650,7 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre
delete contents[slot];
}
- runtime = ceph_clock_now(g_ceph_context) - data.start_time;
+ runtime = ceph_clock_now(cct) - data.start_time;
lock.Lock();
data.done = true;
lock.Unlock();
diff --git a/src/common/obj_bencher.h b/src/common/obj_bencher.h
index d626eda376d..c8f671f8c90 100644
--- a/src/common/obj_bencher.h
+++ b/src/common/obj_bencher.h
@@ -16,6 +16,7 @@
#include "common/config.h"
#include "common/Cond.h"
+#include "common/ceph_context.h"
struct bench_interval_data {
double min_bandwidth;
@@ -51,6 +52,8 @@ const int OP_RAND_READ = 3;
class ObjBencher {
bool show_time;
+public:
+ CephContext *cct;
protected:
Mutex lock;
@@ -89,7 +92,7 @@ protected:
ostream& out(ostream& os);
ostream& out(ostream& os, utime_t& t);
public:
- ObjBencher() : show_time(false), lock("ObjBencher::lock") {}
+ ObjBencher(CephContext *cct_) : show_time(false), cct(cct_), lock("ObjBencher::lock") {}
virtual ~ObjBencher() {}
int aio_bench(
int operation, int secondsToRun, int maxObjectsToCreate,
diff --git a/src/common/perf_counters.cc b/src/common/perf_counters.cc
index 339ff6a372b..4fe1354fa63 100644
--- a/src/common/perf_counters.cc
+++ b/src/common/perf_counters.cc
@@ -12,13 +12,14 @@
*
*/
+#include "include/int_types.h"
+
#include "common/perf_counters.h"
#include "common/dout.h"
#include "common/errno.h"
#include "common/Formatter.h"
#include <errno.h>
-#include <inttypes.h>
#include <map>
#include <sstream>
#include <stdint.h>
diff --git a/src/common/safe_io.c b/src/common/safe_io.c
index ac99db04ad3..afee82edf07 100644
--- a/src/common/safe_io.c
+++ b/src/common/safe_io.c
@@ -14,8 +14,12 @@
#define _XOPEN_SOURCE 500
+#include <stdio.h>
+#include <string.h>
#include <unistd.h>
#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
#include "common/safe_io.h"
@@ -112,3 +116,79 @@ ssize_t safe_pwrite(int fd, const void *buf, size_t count, off_t offset)
}
return 0;
}
+
+/*
+ * Atomically (re)write "<base>/<file>" so that it holds exactly the
+ * vallen bytes at val.
+ *
+ * The data is written to "<base>/<file>.tmp", fsync'ed, renamed over
+ * the final name, and then the directory base itself is fsync'ed.  If
+ * the file can be shown to already hold the requested content, nothing
+ * is written at all.
+ *
+ * Returns 0 on success, or a negative error code on failure.  The
+ * temporary file is removed on every failure path after it was created.
+ */
+int safe_write_file(const char *base, const char *file,
+		    const char *val, size_t vallen)
+{
+  int ret;
+  char fn[PATH_MAX];
+  char tmp[PATH_MAX];
+  int fd;
+
+  // refuse to operate on a silently truncated path
+  ret = snprintf(fn, sizeof(fn), "%s/%s", base, file);
+  if (ret < 0 || (size_t)ret >= sizeof(fn))
+    return -ENAMETOOLONG;
+  ret = snprintf(tmp, sizeof(tmp), "%s/%s.tmp", base, file);
+  if (ret < 0 || (size_t)ret >= sizeof(tmp))
+    return -ENAMETOOLONG;
+
+  // does the file already have correct content?
+  // safe_read_file() stores at most sizeof(oldval) - 1 bytes, so a read
+  // that fills the buffer may have stopped at the buffer limit instead
+  // of at end of file.  Only a strictly shorter value can be proven
+  // identical; anything longer is simply rewritten.
+  char oldval[80];
+  ret = safe_read_file(base, file, oldval, sizeof(oldval));
+  if (vallen < sizeof(oldval) - 1 &&
+      ret == (int)vallen && memcmp(oldval, val, vallen) == 0)
+    return 0;  // yes.
+
+  fd = open(tmp, O_WRONLY|O_CREAT|O_TRUNC, 0644);
+  if (fd < 0)
+    return -errno;
+  ret = safe_write(fd, val, vallen);
+  if (ret) {
+    TEMP_FAILURE_RETRY(close(fd));
+    unlink(tmp);  // do not leave a partial temp file behind
+    return ret;
+  }
+
+  // capture the fsync result before close() can disturb errno
+  ret = fsync(fd);
+  if (ret < 0)
+    ret = -errno;
+  TEMP_FAILURE_RETRY(close(fd));
+  if (ret < 0) {
+    unlink(tmp);
+    return ret;
+  }
+  ret = rename(tmp, fn);
+  if (ret < 0) {
+    ret = -errno;
+    unlink(tmp);
+    return ret;
+  }
+
+  // sync the containing directory so the rename itself is persisted
+  fd = open(base, O_RDONLY);
+  if (fd < 0)
+    return -errno;
+  ret = fsync(fd);
+  if (ret < 0)
+    ret = -errno;
+  TEMP_FAILURE_RETRY(close(fd));
+
+  return ret;
+}
+
+/*
+ * Read the beginning of "<base>/<file>" into val as a NUL-terminated
+ * string.
+ *
+ * At most vallen - 1 bytes are read so that there is always room for
+ * the terminating NUL; vallen must therefore be at least 1.
+ *
+ * Returns the number of bytes stored (not counting the NUL), -EINVAL
+ * for a zero-sized buffer, -ENAMETOOLONG if the path does not fit in
+ * PATH_MAX, the negated errno if open() fails, or the negative value
+ * returned by safe_read().
+ */
+int safe_read_file(const char *base, const char *file,
+		   char *val, size_t vallen)
+{
+  char fn[PATH_MAX];
+  int fd, len;
+
+  // vallen - 1 below would wrap around to a huge size_t
+  if (vallen == 0)
+    return -EINVAL;
+
+  // refuse to open a silently truncated path
+  len = snprintf(fn, sizeof(fn), "%s/%s", base, file);
+  if (len < 0 || (size_t)len >= sizeof(fn))
+    return -ENAMETOOLONG;
+
+  fd = open(fn, O_RDONLY);
+  if (fd < 0) {
+    return -errno;
+  }
+  len = safe_read(fd, val, vallen - 1);
+  if (len < 0) {
+    TEMP_FAILURE_RETRY(close(fd));
+    return len;
+  }
+  // close sometimes returns errors, but only after write()
+  TEMP_FAILURE_RETRY(close(fd));
+
+  val[len] = 0;
+  return len;
+}
diff --git a/src/common/safe_io.h b/src/common/safe_io.h
index 4c2991fe6e8..a4c9bc7a72f 100644
--- a/src/common/safe_io.h
+++ b/src/common/safe_io.h
@@ -45,6 +45,15 @@ extern "C" {
ssize_t safe_pread_exact(int fd, void *buf, size_t count, off_t offset)
WARN_UNUSED_RESULT;
+
+ /*
+ * Safe functions to read and write a small file at "<base>/<file>".
+ *
+ * safe_write_file() replaces the file with the vallen bytes at val by
+ * writing "<base>/<file>.tmp", fsync'ing it, renaming it into place and
+ * fsync'ing the directory base.  It returns 0 on success (also when the
+ * file already held the requested content) or a negative error code.
+ *
+ * safe_read_file() reads at most vallen - 1 bytes into val and
+ * NUL-terminates the result, so vallen must be at least 1 and longer
+ * files are truncated to the buffer.  It returns the number of bytes
+ * read or a negative error code.
+ */
+ int safe_write_file(const char *base, const char *file,
+ const char *val, size_t vallen);
+ int safe_read_file(const char *base, const char *file,
+ char *val, size_t vallen);
+
#ifdef __cplusplus
}
#endif
diff --git a/src/common/sharedptr_registry.hpp b/src/common/sharedptr_registry.hpp
index 6579bd4ba71..90043001ee7 100644
--- a/src/common/sharedptr_registry.hpp
+++ b/src/common/sharedptr_registry.hpp
@@ -64,16 +64,21 @@ public:
}
bool get_next(const K &key, pair<K, VPtr> *next) {
- VPtr next_val;
- Mutex::Locker l(lock);
- typename map<K, WeakVPtr>::iterator i = contents.upper_bound(key);
- while (i != contents.end() &&
- !(next_val = i->second.lock()))
- ++i;
- if (i == contents.end())
- return false;
+ // Find the first still-referenced entry whose key is greater than key.
+ // The result is staged in the local r while the lock is held and is
+ // copied into *next only after the lock has been released.
+ // NOTE(review): presumably this is because overwriting *next can drop
+ // the caller's previous VPtr, whose cleanup may need this lock -- confirm.
+ pair<K, VPtr> r;
+ {
+ Mutex::Locker l(lock);
+ VPtr next_val;
+ typename map<K, WeakVPtr>::iterator i = contents.upper_bound(key);
+ // skip entries whose weak pointer can no longer be locked
+ while (i != contents.end() &&
+ !(next_val = i->second.lock()))
+ ++i;
+ if (i == contents.end())
+ return false;
+ if (next)
+ r = make_pair(i->first, next_val);
+ }
+ // the Locker is out of scope here; only now touch the caller's pair
if (next)
- *next = make_pair(i->first, next_val);
+ *next = r;
return true;
}
diff --git a/src/common/util.cc b/src/common/util.cc
index 6da37e88833..ab417befef6 100644
--- a/src/common/util.cc
+++ b/src/common/util.cc
@@ -58,6 +58,7 @@ int64_t unit_to_bytesize(string val, ostream *pss)
switch (c) {
case 'B':
break;
+ case 'k':
case 'K':
modifier = 10;
break;
diff --git a/src/crush/CrushCompiler.cc b/src/crush/CrushCompiler.cc
index aee621d8e32..5f92bf7e4ec 100644
--- a/src/crush/CrushCompiler.cc
+++ b/src/crush/CrushCompiler.cc
@@ -527,9 +527,17 @@ int CrushCompiler::parse_bucket(iter_t const& i)
item_id[name] = id;
item_weight[id] = bucketweight;
- crush.add_bucket(id, alg, hash, type, size, &items[0], &weights[0]);
- crush.set_item_name(id, name.c_str());
- return 0;
+ assert(id != 0);
+ int r = crush.add_bucket(id, alg, hash, type, size, &items[0], &weights[0], NULL);
+ if (r < 0) {
+ if (r == -EEXIST)
+ err << "Duplicate bucket id " << id << std::endl;
+ else
+ err << "add_bucket failed " << strerror(-r) << std::endl;
+ return r;
+ }
+ r = crush.set_item_name(id, name.c_str());
+ return r;
}
int CrushCompiler::parse_rule(iter_t const& i)
diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
index e96e6123aab..d17166bc4a9 100644
--- a/src/crush/CrushWrapper.cc
+++ b/src/crush/CrushWrapper.cc
@@ -363,9 +363,15 @@ int CrushWrapper::insert_item(CephContext *cct, int item, float weight, string n
if (!name_exists(q->second)) {
ldout(cct, 5) << "insert_item creating bucket " << q->second << dendl;
- int empty = 0;
- cur = add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_DEFAULT, p->first, 1, &cur, &empty);
- set_item_name(cur, q->second);
+ int empty = 0, newid;
+ int r = add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_DEFAULT, p->first, 1, &cur, &empty, &newid);
+ if (r < 0) {
+ char buf[128];
+ ldout(cct, 1) << "add_bucket failure error: " << strerror_r(-r, buf, sizeof(buf)) << dendl;
+ return r;
+ }
+ set_item_name(newid, q->second);
+ cur = newid;
continue;
}
@@ -855,7 +861,6 @@ void CrushWrapper::decode(bufferlist::iterator& blp)
decode_32_or_64_string_map(type_map, blp);
decode_32_or_64_string_map(name_map, blp);
decode_32_or_64_string_map(rule_name_map, blp);
- build_rmaps();
// tunables
if (!blp.end()) {
diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h
index 3d07a281956..b4bb67bb742 100644
--- a/src/crush/CrushWrapper.h
+++ b/src/crush/CrushWrapper.h
@@ -94,6 +94,7 @@ public:
crush_destroy(crush);
crush = crush_create();
assert(crush);
+ have_rmaps = false;
}
// tunables
@@ -720,10 +721,10 @@ public:
/* modifiers */
+ /**
+  * Create a bucket from the given items/weights and add it to the map.
+  *
+  * bucketno is passed through to crush_add_bucket() as the requested id.
+  * If idout is non-NULL it receives the id the bucket was stored under.
+  *
+  * Returns 0 on success or a negative error code (e.g. -EEXIST when the
+  * requested id is already in use).  On failure the newly made bucket is
+  * destroyed here, because the map did not take ownership of it.
+  */
int add_bucket(int bucketno, int alg, int hash, int type, int size,
- int *items, int *weights) {
+ int *items, int *weights, int *idout) {
crush_bucket *b = crush_make_bucket(alg, hash, type, size, items, weights);
assert(b);
- return crush_add_bucket(crush, bucketno, b);
+ int r = crush_add_bucket(crush, bucketno, b, idout);
+ if (r < 0)
+ crush_destroy_bucket(b); // not stored in the map, so free it here
+ return r;
}
void finalize() {
diff --git a/src/crush/Makefile.am b/src/crush/Makefile.am
new file mode 100644
index 00000000000..3d2f45b80e6
--- /dev/null
+++ b/src/crush/Makefile.am
@@ -0,0 +1,30 @@
+# Automake rules for the CRUSH code: the sources of the libcrush.la
+# libtool library (listed in noinst_LTLIBRARIES, so it is built but not
+# installed) and the crush/ headers that are distributed but not installed.
+# NOTE(review): paths are written relative to the parent directory and the
+# noinst_* variables are only appended to with +=, so this fragment is
+# presumably included from src/Makefile.am rather than used standalone --
+# confirm.
+libcrush_la_SOURCES = \
+ crush/builder.c \
+ crush/mapper.c \
+ crush/crush.c \
+ crush/hash.c \
+ crush/CrushWrapper.cc \
+ crush/CrushCompiler.cc \
+ crush/CrushTester.cc
+noinst_LTLIBRARIES += libcrush.la
+
+noinst_HEADERS += \
+ crush/CrushCompiler.h \
+ crush/CrushTester.h \
+ crush/CrushWrapper.h \
+ crush/CrushWrapper.i \
+ crush/builder.h \
+ crush/crush.h \
+ crush/grammar.h \
+ crush/hash.h \
+ crush/mapper.h \
+ crush/sample.txt \
+ crush/types.h
+
+# Disabled: rules that would install a subset of the crush headers under
+# $(includedir)/crush.
+#crush_includedir = $(includedir)/crush
+#crush_include_DATA = \
+# $(srcdir)/crush/hash.h \
+# $(srcdir)/crush/crush.h \
+# $(srcdir)/crush/mapper.h \
+# $(srcdir)/crush/types.h
+
diff --git a/src/crush/builder.c b/src/crush/builder.c
index 2eb6ff5fc1e..9bfde0bd8e2 100644
--- a/src/crush/builder.c
+++ b/src/crush/builder.c
@@ -123,7 +123,8 @@ int crush_get_next_bucket_id(struct crush_map *map)
int crush_add_bucket(struct crush_map *map,
int id,
- struct crush_bucket *bucket)
+ struct crush_bucket *bucket,
+ int *idout)
{
int pos;
@@ -148,13 +149,16 @@ int crush_add_bucket(struct crush_map *map,
memset(map->buckets + oldsize, 0, (map->max_buckets-oldsize) * sizeof(map->buckets[0]));
}
- assert(map->buckets[pos] == 0);
+ if (map->buckets[pos] != 0) {
+ return -EEXIST;
+ }
/* add it */
bucket->id = id;
map->buckets[pos] = bucket;
- return id;
+ if (idout) *idout = id;
+ return 0;
}
int crush_remove_bucket(struct crush_map *map, struct crush_bucket *bucket)
diff --git a/src/crush/builder.h b/src/crush/builder.h
index 7d30c882343..1003c353e60 100644
--- a/src/crush/builder.h
+++ b/src/crush/builder.h
@@ -15,7 +15,7 @@ extern int crush_add_rule(struct crush_map *map, struct crush_rule *rule, int ru
extern int crush_get_next_bucket_id(struct crush_map *map);
extern int crush_add_bucket(struct crush_map *map,
int bucketno,
- struct crush_bucket *bucket);
+ struct crush_bucket *bucket, int *idout);
struct crush_bucket *crush_make_bucket(int alg, int hash, int type, int size, int *items, int *weights);
extern int crush_bucket_add_item(struct crush_bucket *bucket, int item, int weight);
extern int crush_bucket_adjust_item_weight(struct crush_bucket *bucket, int item, int weight);
diff --git a/src/crush/crush.h b/src/crush/crush.h
index 82d032879d9..4adabcbf331 100644
--- a/src/crush/crush.h
+++ b/src/crush/crush.h
@@ -1,11 +1,12 @@
#ifndef CEPH_CRUSH_CRUSH_H
#define CEPH_CRUSH_CRUSH_H
+#include "include/int_types.h"
+
#if defined(__linux__)
#include <linux/types.h>
#elif defined(__FreeBSD__)
#include <sys/types.h>
-#include "include/inttypes.h"
#endif
/*
diff --git a/src/crush/hash.c b/src/crush/hash.c
index 9f7f3257ebd..9b15321d783 100644
--- a/src/crush/hash.c
+++ b/src/crush/hash.c
@@ -1,10 +1,11 @@
+#include "include/int_types.h"
#if defined(__linux__)
#include <linux/types.h>
#elif defined(__FreeBSD__)
#include <sys/types.h>
-#include "include/inttypes.h"
#endif
+
#include "hash.h"
/*
diff --git a/src/crush/mapper.c b/src/crush/mapper.c
index 3215564172a..ce23ef7c711 100644
--- a/src/crush/mapper.c
+++ b/src/crush/mapper.c
@@ -562,7 +562,7 @@ int crush_do_rule(const struct crush_map *map,
/* copy final _leaf_ values to output set */
memcpy(o, c, osize*sizeof(*o));
- /* swap t and w arrays */
+ /* swap o and w arrays */
tmp = o;
o = w;
w = tmp;
diff --git a/src/global/Makefile.am b/src/global/Makefile.am
new file mode 100644
index 00000000000..79a7ffff689
--- /dev/null
+++ b/src/global/Makefile.am
@@ -0,0 +1,14 @@
+# Automake rules for libglobal.la: a non-installed libtool library built
+# from the global/ sources and linked against $(LIBCOMMON), plus the
+# global/ headers that are distributed but not installed.
+# NOTE(review): $(LIBCOMMON) and the noinst_* variables are defined
+# elsewhere; this fragment is presumably included from src/Makefile.am --
+# confirm.
+libglobal_la_SOURCES = \
+ global/global_context.cc \
+ global/global_init.cc \
+ global/pidfile.cc \
+ global/signal_handler.cc
+libglobal_la_LIBADD = $(LIBCOMMON)
+noinst_LTLIBRARIES += libglobal.la
+
+noinst_HEADERS += \
+ global/pidfile.h \
+ global/global_init.h \
+ global/global_context.h \
+ global/signal_handler.h
+
diff --git a/src/global/signal_handler.cc b/src/global/signal_handler.cc
index ce604fe1e5d..ffdc5402caf 100644
--- a/src/global/signal_handler.cc
+++ b/src/global/signal_handler.cc
@@ -196,13 +196,13 @@ struct SignalHandler : public Thread {
lock.Lock();
int num_fds = 0;
fds[num_fds].fd = pipefd[0];
- fds[num_fds].events = POLLIN | POLLOUT | POLLERR;
+ fds[num_fds].events = POLLIN | POLLERR;
fds[num_fds].revents = 0;
++num_fds;
for (unsigned i=0; i<32; i++) {
if (handlers[i]) {
fds[num_fds].fd = handlers[i]->pipefd[0];
- fds[num_fds].events = POLLIN | POLLOUT | POLLERR;
+ fds[num_fds].events = POLLIN | POLLERR;
fds[num_fds].revents = 0;
++num_fds;
}
diff --git a/src/include/CompatSet.h b/src/include/CompatSet.h
index 26c438c05f2..b23883093ac 100644
--- a/src/include/CompatSet.h
+++ b/src/include/CompatSet.h
@@ -36,8 +36,8 @@ struct CompatSet {
FeatureSet() : mask(1), names() {}
void insert(Feature f) {
assert(f.id > 0);
- assert(f.id < 63);
- mask |= (1<<f.id);
+ assert(f.id < 64);
+ mask |= ((uint64_t)1<<f.id);
names[f.id] = f.name;
}
@@ -50,7 +50,7 @@ struct CompatSet {
void remove(uint64_t f) {
if (names.count(f)) {
names.erase(f);
- mask &= ~(1<<f);
+ mask &= ~((uint64_t)1<<f);
}
}
void remove(Feature f) {
@@ -156,24 +156,48 @@ struct CompatSet {
((other.ro_compat.mask ^ ro_compat.mask) & other.ro_compat.mask);
uint64_t other_incompat =
((other.incompat.mask ^ incompat.mask) & other.incompat.mask);
- for (int i = 0; i < 64; ++i) {
- int mask = 1 << i;
+ for (int id = 1; id < 64; ++id) {
+ uint64_t mask = (uint64_t)1 << id;
if (mask & other_compat) {
- diff.compat.insert( Feature(mask & other_compat,
- other.compat.names[mask&other_compat]));
+ diff.compat.insert( Feature(id, other.compat.names[id]));
}
if (mask & other_ro_compat) {
- diff.ro_compat.insert(Feature(mask & other_ro_compat,
- other.compat.names[mask&other_ro_compat]));
+ diff.ro_compat.insert(Feature(id, other.ro_compat.names[id]));
}
if (mask & other_incompat) {
- diff.incompat.insert( Feature(mask & other_incompat,
- other.incompat.names[mask&other_incompat]));
+ diff.incompat.insert( Feature(id, other.incompat.names[id]));
}
}
return diff;
}
+ /* Merge features supported by other CompatSet into this one.
+ *
+ * For each of the compat, ro_compat and incompat sets, every feature
+ * whose bit is set in other but not in this object is inserted here
+ * together with the name other has recorded for it.
+ *
+ * Return: true if some features were merged, false if other had no
+ * feature bits that this object lacked.
+ */
+ bool merge(CompatSet& other) {
+ // (other ^ mine) & other == bits set in other that are not set here
+ uint64_t other_compat =
+ ((other.compat.mask ^ compat.mask) & other.compat.mask);
+ uint64_t other_ro_compat =
+ ((other.ro_compat.mask ^ ro_compat.mask) & other.ro_compat.mask);
+ uint64_t other_incompat =
+ ((other.incompat.mask ^ incompat.mask) & other.incompat.mask);
+ if (!other_compat && !other_ro_compat && !other_incompat)
+ return false;
+ // feature ids start at 1; bit 0 is never treated as a feature here
+ for (int id = 1; id < 64; ++id) {
+ uint64_t mask = (uint64_t)1 << id;
+ if (mask & other_compat) {
+ compat.insert( Feature(id, other.compat.names[id]));
+ }
+ if (mask & other_ro_compat) {
+ ro_compat.insert(Feature(id, other.ro_compat.names[id]));
+ }
+ if (mask & other_incompat) {
+ incompat.insert( Feature(id, other.incompat.names[id]));
+ }
+ }
+ return true;
+ }
+
void encode(bufferlist& bl) const {
compat.encode(bl);
ro_compat.encode(bl);
diff --git a/src/include/Context.h b/src/include/Context.h
index 9ec4414a047..663313ceec1 100644
--- a/src/include/Context.h
+++ b/src/include/Context.h
@@ -28,6 +28,26 @@
#define mydout(cct, v) lgeneric_subdout(cct, context, v)
/*
+ * GenContext - abstract callback class
+ */
+// One-shot callback that receives a single argument of type T.
+// Subclasses implement finish(); callers invoke complete(), which runs
+// finish() and then deletes the object.
+template <typename T>
+class GenContext {
+ // copy construction and assignment are declared private (and not
+ // defined here), so instances cannot be copied
+ GenContext(const GenContext& other);
+ const GenContext& operator=(const GenContext& other);
+
+ protected:
+ // the actual callback body, supplied by the subclass
+ virtual void finish(T t) = 0;
+
+ public:
+ GenContext() {}
+ virtual ~GenContext() {} // we want a virtual destructor!!!
+ // Runs finish(t) and then destroys this object with `delete this`, so
+ // the object must have been allocated with new and must not be used
+ // after complete() returns.
+ virtual void complete(T t) {
+ finish(t);
+ delete this;
+ }
+};
+
+/*
* Context - abstract callback class
*/
class Context {
diff --git a/src/include/Makefile.am b/src/include/Makefile.am
new file mode 100644
index 00000000000..c8823ce523d
--- /dev/null
+++ b/src/include/Makefile.am
@@ -0,0 +1,80 @@
+# Public headers that get installed, grouped by destination directory.
+
+# installed into $(includedir)/cephfs
+libcephfs_includedir = $(includedir)/cephfs
+libcephfs_include_DATA = $(srcdir)/include/cephfs/libcephfs.h
+
+# installed into $(includedir)/rbd
+librbd_includedir = $(includedir)/rbd
+librbd_include_DATA = \
+ $(srcdir)/include/rbd/features.h \
+ $(srcdir)/include/rbd/librbd.h \
+ $(srcdir)/include/rbd/librbd.hpp
+
+# installed into $(includedir)/rados; note that buffer.h, page.h and
+# crc32c.h come from include/ itself, not from include/rados/
+rados_includedir = $(includedir)/rados
+rados_include_DATA = \
+ $(srcdir)/include/rados/librados.h \
+ $(srcdir)/include/rados/rados_types.h \
+ $(srcdir)/include/rados/rados_types.hpp \
+ $(srcdir)/include/rados/librados.hpp \
+ $(srcdir)/include/buffer.h \
+ $(srcdir)/include/page.h \
+ $(srcdir)/include/crc32c.h
+
+# Headers under include/ that are part of the source distribution but are
+# not installed by this variable (the installed public headers are listed
+# separately in the *_include_DATA variables).
+noinst_HEADERS += \
+ include/Context.h \
+ include/CompatSet.h \
+ include/Distribution.h \
+ include/addr_parsing.h \
+ include/assert.h \
+ include/atomic.h \
+ include/bitmapper.h \
+ include/blobhash.h \
+ include/buffer.h \
+ include/byteorder.h \
+ include/cephfs/libcephfs.h \
+ include/ceph_features.h \
+ include/ceph_frag.h \
+ include/ceph_fs.h \
+ include/ceph_hash.h \
+ include/cmp.h \
+ include/color.h \
+ include/compat.h \
+ include/crc32c.h \
+ include/encoding.h \
+ include/err.h \
+ include/error.h \
+ include/filepath.h \
+ include/frag.h \
+ include/hash.h \
+ include/histogram.h \
+ include/intarith.h \
+ include/interval_set.h \
+ include/int_types.h \
+ include/ipaddr.h \
+ include/linux_fiemap.h \
+ include/lru.h \
+ include/msgr.h \
+ include/object.h \
+ include/page.h \
+ include/rangeset.h \
+ include/rados.h \
+ include/rbd_types.h \
+ include/statlite.h \
+ include/str_list.h \
+ include/stringify.h \
+ include/triple.h \
+ include/types.h \
+ include/utime.h \
+ include/dlist.h \
+ include/elist.h \
+ include/uuid.h \
+ include/xlist.h \
+ include/rados/librados.h \
+ include/rados/rados_types.h \
+ include/rados/rados_types.hpp \
+ include/rados/librados.hpp \
+ include/rados/librgw.h \
+ include/rados/page.h \
+ include/rados/crc32c.h \
+ include/rados/buffer.h \
+ include/rbd/features.h \
+ include/rbd/librbd.h \
+ include/rbd/librbd.hpp\
+ include/util.h
diff --git a/src/include/bloom_filter.hpp b/src/include/bloom_filter.hpp
deleted file mode 100644
index 41aba4bad47..00000000000
--- a/src/include/bloom_filter.hpp
+++ /dev/null
@@ -1,544 +0,0 @@
-/*
- *******************************************************************
- * *
- * Open Bloom Filter *
- * *
- * Author: Arash Partow - 2000 *
- * URL: http://www.partow.net/programming/hashfunctions/index.html *
- * *
- * Copyright notice: *
- * Free use of the Open Bloom Filter Library is permitted under *
- * the guidelines and in accordance with the most current version *
- * of the Boost Software License, Version 1.0 *
- * http://www.opensource.org/licenses/bsl1.0.html *
- * *
- *******************************************************************
-*/
-
-
-#ifndef INCLUDE_BLOOM_FILTER_HPP
-#define INCLUDE_BLOOM_FILTER_HPP
-
-#include <cstddef>
-#include <algorithm>
-#include <cmath>
-#include <limits>
-#include <string>
-#include <vector>
-
-
-static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned)
-static const unsigned char bit_mask[bits_per_char] = {
- 0x01, //00000001
- 0x02, //00000010
- 0x04, //00000100
- 0x08, //00001000
- 0x10, //00010000
- 0x20, //00100000
- 0x40, //01000000
- 0x80 //10000000
- };
-
-
-class bloom_filter
-{
-protected:
-
- typedef unsigned int bloom_type;
- typedef unsigned char cell_type;
-
-public:
-
- bloom_filter(const std::size_t& predicted_inserted_element_count,
- const double& false_positive_probability,
- const std::size_t& random_seed)
- : bit_table_(0),
- predicted_inserted_element_count_(predicted_inserted_element_count),
- inserted_element_count_(0),
- random_seed_((random_seed) ? random_seed : 0xA5A5A5A5),
- desired_false_positive_probability_(false_positive_probability)
- {
- find_optimal_parameters();
- generate_unique_salt();
- raw_table_size_ = table_size_ / bits_per_char;
- bit_table_ = new cell_type[raw_table_size_];
- std::fill_n(bit_table_,raw_table_size_,0x00);
- }
-
- bloom_filter(const bloom_filter& filter)
- {
- this->operator=(filter);
- }
-
- bloom_filter& operator = (const bloom_filter& filter)
- {
- if (this != &filter) {
- salt_count_ = filter.salt_count_;
- table_size_ = filter.table_size_;
- raw_table_size_ = filter.raw_table_size_;
- predicted_inserted_element_count_ = filter.predicted_inserted_element_count_;
- inserted_element_count_ = filter.inserted_element_count_;
- random_seed_ = filter.random_seed_;
- desired_false_positive_probability_ = filter.desired_false_positive_probability_;
- delete[] bit_table_;
- bit_table_ = new cell_type[raw_table_size_];
- std::copy(filter.bit_table_,filter.bit_table_ + raw_table_size_,bit_table_);
- salt_ = filter.salt_;
- }
- return *this;
- }
-
- virtual ~bloom_filter()
- {
- delete[] bit_table_;
- }
-
- inline bool operator!() const
- {
- return (0 == table_size_);
- }
-
- inline void clear()
- {
- std::fill_n(bit_table_,raw_table_size_,0x00);
- inserted_element_count_ = 0;
- }
-
- inline void insert(const unsigned char* key_begin, const std::size_t& length)
- {
- std::size_t bit_index = 0;
- std::size_t bit = 0;
- for (std::size_t i = 0; i < salt_.size(); ++i)
- {
- compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
- bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
- }
- ++inserted_element_count_;
- }
-
- template<typename T>
- inline void insert(const T& t)
- {
- // Note: T must be a C++ POD type.
- insert(reinterpret_cast<const unsigned char*>(&t),sizeof(T));
- }
-
- inline void insert(const std::string& key)
- {
- insert(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
- }
-
- inline void insert(const char* data, const std::size_t& length)
- {
- insert(reinterpret_cast<const unsigned char*>(data),length);
- }
-
- template<typename InputIterator>
- inline void insert(const InputIterator begin, const InputIterator end)
- {
- InputIterator itr = begin;
- while (end != itr)
- {
- insert(*(itr++));
- }
- }
-
- inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
- {
- std::size_t bit_index = 0;
- std::size_t bit = 0;
- for (std::size_t i = 0; i < salt_.size(); ++i)
- {
- compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
- if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
- {
- return false;
- }
- }
- return true;
- }
-
- template<typename T>
- inline bool contains(const T& t) const
- {
- return contains(reinterpret_cast<const unsigned char*>(&t),static_cast<std::size_t>(sizeof(T)));
- }
-
- inline bool contains(const std::string& key) const
- {
- return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
- }
-
- inline bool contains(const char* data, const std::size_t& length) const
- {
- return contains(reinterpret_cast<const unsigned char*>(data),length);
- }
-
- template<typename InputIterator>
- inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
- {
- InputIterator itr = begin;
- while (end != itr)
- {
- if (!contains(*itr))
- {
- return itr;
- }
- ++itr;
- }
- return end;
- }
-
- template<typename InputIterator>
- inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
- {
- InputIterator itr = begin;
- while (end != itr)
- {
- if (contains(*itr))
- {
- return itr;
- }
- ++itr;
- }
- return end;
- }
-
- inline virtual std::size_t size() const
- {
- return table_size_;
- }
-
- inline std::size_t element_count() const
- {
- return inserted_element_count_;
- }
-
- inline double effective_fpp() const
- {
- /*
- Note:
- The effective false positive probability is calculated using the
- designated table size and hash function count in conjunction with
- the current number of inserted elements - not the user defined
- predicated/expected number of inserted elements.
- */
- return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size());
- }
-
- inline bloom_filter& operator &= (const bloom_filter& filter)
- {
- /* intersection */
- if (
- (salt_count_ == filter.salt_count_) &&
- (table_size_ == filter.table_size_) &&
- (random_seed_ == filter.random_seed_)
- )
- {
- for (std::size_t i = 0; i < raw_table_size_; ++i)
- {
- bit_table_[i] &= filter.bit_table_[i];
- }
- }
- return *this;
- }
-
- inline bloom_filter& operator |= (const bloom_filter& filter)
- {
- /* union */
- if (
- (salt_count_ == filter.salt_count_) &&
- (table_size_ == filter.table_size_) &&
- (random_seed_ == filter.random_seed_)
- )
- {
- for (std::size_t i = 0; i < raw_table_size_; ++i)
- {
- bit_table_[i] |= filter.bit_table_[i];
- }
- }
- return *this;
- }
-
- inline bloom_filter& operator ^= (const bloom_filter& filter)
- {
- /* difference */
- if (
- (salt_count_ == filter.salt_count_) &&
- (table_size_ == filter.table_size_) &&
- (random_seed_ == filter.random_seed_)
- )
- {
- for (std::size_t i = 0; i < raw_table_size_; ++i)
- {
- bit_table_[i] ^= filter.bit_table_[i];
- }
- }
- return *this;
- }
-
- inline const cell_type* table() const
- {
- return bit_table_;
- }
-
-protected:
-
- inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
- {
- bit_index = hash % table_size_;
- bit = bit_index % bits_per_char;
- }
-
- void generate_unique_salt()
- {
- /*
- Note:
- A distinct hash function need not be implementation-wise
- distinct. In the current implementation "seeding" a common
- hash function with different values seems to be adequate.
- */
- const unsigned int predef_salt_count = 128;
- static const bloom_type predef_salt[predef_salt_count] =
- {
- 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
- 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
- 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
- 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
- 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
- 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
- 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
- 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
- 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
- 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
- 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
- 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
- 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
- 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
- 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
- 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
- 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
- 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
- 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
- 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
- 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
- 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
- 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
- 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
- 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
- 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
- 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
- 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
- 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
- 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
- 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
- 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
- };
-
- if (salt_count_ <= predef_salt_count)
- {
- std::copy(predef_salt,
- predef_salt + salt_count_,
- std::back_inserter(salt_));
- for (unsigned int i = 0; i < salt_.size(); ++i)
- {
- /*
- Note:
- This is done to integrate the user defined random seed,
- so as to allow for the generation of unique bloom filter
- instances.
- */
- salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_;
- }
- }
- else
- {
- std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_));
- srand(static_cast<unsigned int>(random_seed_));
- while (salt_.size() < salt_count_)
- {
- bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
- if (0 == current_salt) continue;
- if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
- {
- salt_.push_back(current_salt);
- }
- }
- }
- }
-
- void find_optimal_parameters()
- {
- /*
- Note:
- The following will attempt to find the number of hash functions
- and minimum amount of storage bits required to construct a bloom
- filter consistent with the user defined false positive probability
- and estimated element insertion count.
- */
-
- double min_m = std::numeric_limits<double>::infinity();
- double min_k = 0.0;
- double curr_m = 0.0;
- double k = 1.0;
- while (k < 1000.0)
- {
- double numerator = (- k * predicted_inserted_element_count_);
- double denominator = std::log(1.0 - std::pow(desired_false_positive_probability_, 1.0 / k));
- curr_m = numerator / denominator;
-
- if (curr_m < min_m)
- {
- min_m = curr_m;
- min_k = k;
- }
- k += 1.0;
- }
-
- salt_count_ = static_cast<std::size_t>(min_k);
- table_size_ = static_cast<std::size_t>(min_m);
- table_size_ += (((table_size_ % bits_per_char) != 0) ? (bits_per_char - (table_size_ % bits_per_char)) : 0);
- }
-
- inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
- {
- const unsigned char* itr = begin;
-
- while (remaining_length >= 4)
- {
- hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
- hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
- hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
- hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
- remaining_length -= 4;
- }
-
- while (remaining_length >= 2)
- {
- hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
- hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
- remaining_length -= 2;
- }
-
- if (remaining_length)
- {
- hash ^= (hash << 7) ^ (*itr) * (hash >> 3);
- }
-
- return hash;
- }
-
- std::vector<bloom_type> salt_;
- unsigned char* bit_table_;
- std::size_t salt_count_;
- std::size_t table_size_;
- std::size_t raw_table_size_;
- std::size_t predicted_inserted_element_count_;
- std::size_t inserted_element_count_;
- std::size_t random_seed_;
- double desired_false_positive_probability_;
-};
-
-inline bloom_filter operator & (const bloom_filter& a, const bloom_filter& b)
-{
- bloom_filter result = a;
- result &= b;
- return result;
-}
-
-inline bloom_filter operator | (const bloom_filter& a, const bloom_filter& b)
-{
- bloom_filter result = a;
- result |= b;
- return result;
-}
-
-inline bloom_filter operator ^ (const bloom_filter& a, const bloom_filter& b)
-{
- bloom_filter result = a;
- result ^= b;
- return result;
-}
-
-
-class compressible_bloom_filter : public bloom_filter
-{
-public:
-
- compressible_bloom_filter(const std::size_t& predicted_element_count,
- const double& false_positive_probability,
- const std::size_t& random_seed)
- : bloom_filter(predicted_element_count,false_positive_probability,random_seed)
- {
- size_list.push_back(table_size_);
- }
-
- inline virtual std::size_t size() const
- {
- return size_list.back();
- }
-
- inline bool compress(const double& percentage)
- {
- if ((0.0 >= percentage) || (percentage >= 100.0))
- {
- return false;
- }
-
- std::size_t original_table_size = size_list.back();
- std::size_t new_table_size = static_cast<std::size_t>((size_list.back() * (1.0 - (percentage / 100.0))));
- new_table_size -= (((new_table_size % bits_per_char) != 0) ? (new_table_size % bits_per_char) : 0);
-
- if ((bits_per_char > new_table_size) || (new_table_size >= original_table_size))
- {
- return false;
- }
-
- desired_false_positive_probability_ = effective_fpp();
- cell_type* tmp = new cell_type[new_table_size / bits_per_char];
- std::copy(bit_table_, bit_table_ + (new_table_size / bits_per_char), tmp);
- cell_type* itr = bit_table_ + (new_table_size / bits_per_char);
- cell_type* end = bit_table_ + (original_table_size / bits_per_char);
- cell_type* itr_tmp = tmp;
-
- while (end != itr)
- {
- *(itr_tmp++) |= (*itr++);
- }
-
- delete[] bit_table_;
- bit_table_ = tmp;
- size_list.push_back(new_table_size);
-
- return true;
- }
-
-private:
-
- inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
- {
- bit_index = hash;
- for (std::size_t i = 0; i < size_list.size(); ++i)
- {
- bit_index %= size_list[i];
- }
- bit = bit_index % bits_per_char;
- }
-
- std::vector<std::size_t> size_list;
-};
-
-#endif
-
-
-/*
- Note 1:
- If it can be guaranteed that bits_per_char will be of the form 2^n then
- the following optimization can be used:
-
- hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)];
-
- Note 2:
- For performance reasons where possible when allocating memory it should
- be aligned (aligned_alloc) according to the architecture being used.
-*/
diff --git a/src/include/buffer.h b/src/include/buffer.h
index 8e637d658c5..ffa3d6e1b97 100644
--- a/src/include/buffer.h
+++ b/src/include/buffer.h
@@ -11,7 +11,6 @@
* Foundation. See file COPYING.
*
*/
-
#ifndef CEPH_BUFFER_H
#define CEPH_BUFFER_H
@@ -20,7 +19,6 @@
#include <linux/types.h>
#elif defined(__FreeBSD__)
#include <sys/types.h>
-#include "include/inttypes.h"
#include <stdlib.h>
#endif
@@ -46,6 +44,7 @@ void *valloc(size_t);
#include <malloc.h>
#endif
+#include <inttypes.h>
#include <stdint.h>
#include <string.h>
@@ -420,15 +419,7 @@ public:
ssize_t read_fd(int fd, size_t len);
int write_file(const char *fn, int mode=0644);
int write_fd(int fd) const;
- __u32 crc32c(__u32 crc) {
- for (std::list<ptr>::const_iterator it = _buffers.begin();
- it != _buffers.end();
- ++it)
- if (it->length())
- crc = ceph_crc32c(crc, (unsigned char*)it->c_str(), it->length());
- return crc;
- }
-
+ uint32_t crc32c(uint32_t crc) const;
};
/*
@@ -436,7 +427,7 @@ public:
*/
class hash {
- __u32 crc;
+ uint32_t crc;
public:
hash() : crc(0) { }
@@ -445,7 +436,7 @@ public:
crc = bl.crc32c(crc);
}
- __u32 digest() {
+ uint32_t digest() {
return crc;
}
};
diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h
index 362a459bde6..c0f01cc5430 100644
--- a/src/include/ceph_features.h
+++ b/src/include/ceph_features.h
@@ -39,6 +39,7 @@
#define CEPH_FEATURE_OSD_SNAPMAPPER (1ULL<<32)
#define CEPH_FEATURE_MON_SCRUB (1ULL<<33)
#define CEPH_FEATURE_OSD_PACKED_RECOVERY (1ULL<<34)
+#define CEPH_FEATURE_OSD_CACHEPOOL (1ULL<<35)
/*
* The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature
@@ -101,6 +102,7 @@ static inline unsigned long long ceph_sanitize_features(unsigned long long f) {
CEPH_FEATURE_OSD_SNAPMAPPER | \
CEPH_FEATURE_MON_SCRUB | \
CEPH_FEATURE_OSD_PACKED_RECOVERY | \
+ CEPH_FEATURE_OSD_CACHEPOOL | \
0ULL)
#define CEPH_FEATURES_SUPPORTED_DEFAULT CEPH_FEATURES_ALL
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h
index 6c41d14f5da..ba0b5eb0f19 100644
--- a/src/include/ceph_fs.h
+++ b/src/include/ceph_fs.h
@@ -224,6 +224,7 @@ struct ceph_mon_subscribe_ack {
* mdsmap flags
*/
#define CEPH_MDSMAP_DOWN (1<<0) /* cluster deliberately down */
+#define CEPH_MDSMAP_ALLOW_SNAPS (1<<1) /* cluster allowed to create snapshots */
/*
* mds states
diff --git a/src/include/crc32c.h b/src/include/crc32c.h
index d5f7388be56..49d68474d68 100644
--- a/src/include/crc32c.h
+++ b/src/include/crc32c.h
@@ -1,7 +1,7 @@
#ifndef CEPH_CRC32C_H
#define CEPH_CRC32C_H
-#include "include/inttypes.h"
+#include <inttypes.h>
#include <string.h>
typedef uint32_t (*ceph_crc32c_func_t)(uint32_t crc, unsigned char const *data, unsigned length);
diff --git a/src/include/encoding.h b/src/include/encoding.h
index 67c9af59d2b..fedc8b31299 100644
--- a/src/include/encoding.h
+++ b/src/include/encoding.h
@@ -11,13 +11,13 @@
* Foundation. See file COPYING.
*
*/
-
#ifndef CEPH_ENCODING_H
#define CEPH_ENCODING_H
+#include "include/int_types.h"
+
#include <tr1/memory>
-#include "inttypes.h"
#include "byteorder.h"
#include "buffer.h"
#include "assert.h"
@@ -562,6 +562,17 @@ inline void decode(std::map<T,U>& m, bufferlist::iterator& p)
}
}
template<class T, class U>
+inline void decode_noclear(std::map<T,U>& m, bufferlist::iterator& p)
+{
+ __u32 n;
+ decode(n, p);
+ while (n--) {
+ T k;
+ decode(k, p);
+ decode(m[k], p);
+ }
+}
+template<class T, class U>
inline void encode_nohead(const std::map<T,U>& m, bufferlist& bl)
{
for (typename std::map<T,U>::const_iterator p = m.begin(); p != m.end(); ++p) {
diff --git a/src/include/histogram.h b/src/include/histogram.h
new file mode 100644
index 00000000000..c817b1ec175
--- /dev/null
+++ b/src/include/histogram.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ * Copyright 2013 Inktank
+ */
+
+#ifndef HISTOGRAM_H_
+#define HISTOGRAM_H_
+
+/**
+ * power of 2 histogram
+ */
+struct pow2_hist_t { //
+ /**
+ * histogram
+ *
+ * bin size is 2^index
+ * value is count of elements that are <= the current bin but > the previous bin.
+ */
+ vector<int32_t> h;
+
+private:
+ /// expand to at least another's size
+ void _expand_to(unsigned s) {
+ if (s > h.size())
+ h.resize(s, 0);
+ }
+ /// drop useless trailing 0's
+ void _contract() {
+ unsigned p = h.size();
+ while (p > 0 && h[p-1] == 0)
+ --p;
+ h.resize(p);
+ }
+
+public:
+ void clear() {
+ h.clear();
+ }
+ void set(int bin, int32_t v) {
+ _expand_to(bin + 1);
+ h[bin] = v;
+ _contract();
+ }
+
+ void add(const pow2_hist_t& o) {
+ _expand_to(o.h.size());
+ for (unsigned p = 0; p < o.h.size(); ++p)
+ h[p] += o.h[p];
+ _contract();
+ }
+ void sub(const pow2_hist_t& o) {
+ _expand_to(o.h.size());
+ for (unsigned p = 0; p < o.h.size(); ++p)
+ h[p] -= o.h[p];
+ _contract();
+ }
+
+ int32_t upper_bound() const {
+ return 1 << h.size();
+ }
+
+ void dump(Formatter *f) const;
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::iterator &bl);
+ static void generate_test_instances(std::list<pow2_hist_t*>& o);
+};
+WRITE_CLASS_ENCODER(pow2_hist_t)
+
+#endif /* HISTOGRAM_H_ */
diff --git a/src/include/int_types.h b/src/include/int_types.h
new file mode 100644
index 00000000000..f290f9d8d23
--- /dev/null
+++ b/src/include/int_types.h
@@ -0,0 +1,75 @@
+#ifndef CEPH_INTTYPES_H
+#define CEPH_INTTYPES_H
+
+#include "acconfig.h"
+
+#if defined(__linux__)
+#include <linux/types.h>
+#endif
+
+/*
+ * Get 64b integers either from inttypes.h or glib.h
+ */
+#ifdef HAVE_INTTYPES_H
+# include <inttypes.h>
+//#else
+//# ifdef HAVE_GLIB
+//# include <glib.h>
+//# endif
+#endif
+
+/*
+ * C99 says inttypes.h includes stdint.h, but that's not true on all
+ * systems. If it's there, include it always - just in case.
+ */
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+
+/*
+ * Emergency replacements for PRI*64 modifiers. Some systems have
+ * an inttypes.h that doesn't define all the PRI[doxu]64 macros.
+ */
+#if !defined(PRIu64)
+# if defined(HAVE_INTTYPES_H) || defined(HAVE_GLIB)
+/* If we have inttypes or glib, assume we have 64-bit long long int */
+# define PRIu64 "llu"
+# define PRIi64 "lli"
+# define PRIx64 "llx"
+# define PRIX64 "llX"
+# define PRIo64 "llo"
+# define PRId64 "lld"
+# else
+/* Assume that we don't have long long, so use long int modifiers */
+# define PRIu64 "lu"
+# define PRIi64 "li"
+# define PRIx64 "lx"
+# define PRIX64 "lX"
+# define PRIo64 "lo"
+# define PRId64 "ld"
+# endif
+#endif
+
+#if defined(__FreeBSD__)
+#include <sys/types.h>
+
+typedef int8_t __s8;
+typedef uint8_t __u8;
+typedef int16_t __s16;
+typedef uint16_t __u16;
+typedef int32_t __s32;
+typedef uint32_t __u32;
+typedef int64_t __s64;
+typedef uint64_t __u64;
+
+#define __bitwise__
+
+typedef __u16 __bitwise__ __le16;
+typedef __u16 __bitwise__ __be16;
+typedef __u32 __bitwise__ __le32;
+typedef __u32 __bitwise__ __be32;
+typedef __u64 __bitwise__ __le64;
+typedef __u64 __bitwise__ __be64;
+
+#endif
+#endif
diff --git a/src/include/inttypes.h b/src/include/inttypes.h
deleted file mode 100644
index 656e2bb0a0f..00000000000
--- a/src/include/inttypes.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef CEPH_INTTYPES_H
-#define CEPH_INTTYPES_H
-
-#include <stdint.h>
-#if defined(__linux__)
-#include <linux/types.h>
-#elif defined(__FreeBSD__)
-#include <sys/types.h>
-typedef int8_t __s8;
-typedef uint8_t __u8;
-typedef int16_t __s16;
-typedef uint16_t __u16;
-typedef int32_t __s32;
-typedef uint32_t __u32;
-typedef int64_t __s64;
-typedef uint64_t __u64;
-
-#define __bitwise__
-
-typedef __u16 __bitwise__ __le16;
-typedef __u16 __bitwise__ __be16;
-typedef __u32 __bitwise__ __le32;
-typedef __u32 __bitwise__ __be32;
-typedef __u64 __bitwise__ __le64;
-typedef __u64 __bitwise__ __be64;
-
-#endif
-#endif
diff --git a/src/include/linux_fiemap.h b/src/include/linux_fiemap.h
index ba5cb6657fa..352208b2207 100644
--- a/src/include/linux_fiemap.h
+++ b/src/include/linux_fiemap.h
@@ -7,15 +7,15 @@
* Kalpak Shah <kalpak.shah@sun.com>
* Andreas Dilger <adilger@sun.com>
*/
-
#ifndef _LINUX_FIEMAP_H
#define _LINUX_FIEMAP_H
+#include "include/int_types.h"
+
#if defined(__linux__)
#include <linux/types.h>
#elif defined(__FreeBSD_)
#include <sys/types.h>
-#include "include/inttypes.h"
#endif
struct fiemap_extent {
diff --git a/src/include/rados.h b/src/include/rados.h
index de9b449ed15..e7a32b5afef 100644
--- a/src/include/rados.h
+++ b/src/include/rados.h
@@ -217,6 +217,11 @@ enum {
CEPH_OSD_OP_OMAPRMKEYS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24,
CEPH_OSD_OP_OMAP_CMP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25,
+ CEPH_OSD_OP_COPY_FROM = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 26,
+ CEPH_OSD_OP_COPY_GET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 27,
+ CEPH_OSD_OP_UNDIRTY = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 28,
+ CEPH_OSD_OP_ISDIRTY = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 29,
+
/** multi **/
CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1,
CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2,
@@ -398,9 +403,20 @@ struct ceph_osd_op {
__u8 flag; /* 0 = unwatch, 1 = watch */
} __attribute__ ((packed)) watch;
struct {
+ __le64 unused;
+ __le64 ver;
+ } __attribute__ ((packed)) assert_ver;
+ struct {
__le64 offset, length;
__le64 src_offset;
} __attribute__ ((packed)) clonerange;
+ struct {
+ __le64 max; /* max data in reply */
+ } __attribute__ ((packed)) copy_get;
+ struct {
+ __le64 snapid;
+ __le64 src_version;
+ } __attribute__ ((packed)) copy_from;
};
__le32 payload_len;
} __attribute__ ((packed));
diff --git a/src/include/rados/librados.h b/src/include/rados/librados.h
index 6f5e454c8e8..515663c2335 100644
--- a/src/include/rados/librados.h
+++ b/src/include/rados/librados.h
@@ -10,7 +10,6 @@ extern "C" {
#include <linux/types.h>
#elif defined(__FreeBSD__)
#include <sys/types.h>
-#include "include/inttypes.h"
#endif
#include <string.h>
#include "rados_types.h"
@@ -565,7 +564,7 @@ int rados_pool_create_with_auid(rados_t cluster, const char *pool_name, uint64_t
* @returns 0 on success, negative error code on failure
*/
int rados_pool_create_with_crush_rule(rados_t cluster, const char *pool_name,
- __u8 crush_rule_num);
+ uint8_t crush_rule_num);
/**
* Create a pool with a specific CRUSH rule and auid
@@ -580,7 +579,7 @@ int rados_pool_create_with_crush_rule(rados_t cluster, const char *pool_name,
* @returns 0 on success, negative error code on failure
*/
int rados_pool_create_with_all(rados_t cluster, const char *pool_name, uint64_t auid,
- __u8 crush_rule_num);
+ uint8_t crush_rule_num);
/**
* Delete a pool and all data inside it
diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp
index cf193b36d72..c8de9f9df33 100644
--- a/src/include/rados/librados.hpp
+++ b/src/include/rados/librados.hpp
@@ -96,7 +96,8 @@ namespace librados
bool is_complete_and_cb();
bool is_safe_and_cb();
int get_return_value();
- int get_version();
+ int get_version(); ///< DEPRECATED get_version() only returns 32-bits
+ uint64_t get_version64();
void release();
AioCompletionImpl *pc;
};
@@ -132,11 +133,16 @@ namespace librados
* BALANCE_READS and LOCALIZE_READS should only be used
* when reading from data you're certain won't change,
* like a snapshot, or where eventual consistency is ok.
+ *
+ * ORDER_READS_WRITES will order reads the same way writes are
+ * ordered (e.g., waiting for degraded objects). In particular, it
+ * will preserve the order of a write followed by a read.
*/
enum ObjectOperationGlobalFlags {
OPERATION_NOFLAG = 0,
OPERATION_BALANCE_READS = 1,
OPERATION_LOCALIZE_READS = 2,
+ OPERATION_ORDER_READS_WRITES = 4,
};
/*
@@ -264,6 +270,26 @@ namespace librados
*/
void omap_rm_keys(const std::set<std::string> &to_rm);
+ /**
+ * Copy an object
+ *
+ * Copies an object from another location. The operation is atomic in that
+ * the copy either succeeds in its entirety or fails (e.g., because the
+ * source object was modified while the copy was in progress).
+ *
+ * @param src source object name
+ * @param src_ioctx ioctx for the source object
+ * @param src_version current version of the source object
+ */
+ void copy_from(const std::string& src, const IoCtx& src_ioctx, uint64_t src_version);
+
+ /**
+ * undirty an object
+ *
+ * Clear an object's dirty flag
+ */
+ void undirty();
+
friend class IoCtx;
};
@@ -382,6 +408,14 @@ namespace librados
*/
void list_snaps(snap_set_t *out_snaps, int *prval);
+ /**
+ * query dirty state of an object
+ *
+ * @param isdirty [out] pointer to resulting bool
+ * @param prval [out] place error code in prval upon completion
+ */
+ void is_dirty(bool *isdirty, int *prval);
+
};
/* IoCtx : This is a context in which we can perform I/O.
@@ -425,8 +459,23 @@ namespace librados
int create(const std::string& oid, bool exclusive);
int create(const std::string& oid, bool exclusive, const std::string& category);
+ /**
+ * write bytes to an object at a specified offset
+ *
+ * NOTE: this call steals the contents of @param bl.
+ */
int write(const std::string& oid, bufferlist& bl, size_t len, uint64_t off);
+ /**
+ * append bytes to an object
+ *
+ * NOTE: this call steals the contents of @param bl.
+ */
int append(const std::string& oid, bufferlist& bl, size_t len);
+ /**
+ * replace object contents with provided data
+ *
+ * NOTE: this call steals the contents of @param bl.
+ */
int write_full(const std::string& oid, bufferlist& bl);
int clone_range(const std::string& dst_oid, uint64_t dst_off,
const std::string& src_oid, uint64_t src_off,
@@ -443,7 +492,17 @@ namespace librados
int stat(const std::string& oid, uint64_t *psize, time_t *pmtime);
int exec(const std::string& oid, const char *cls, const char *method,
bufferlist& inbl, bufferlist& outbl);
+ /**
+ * modify object tmap based on encoded update sequence
+ *
+ * NOTE: this call steals the contents of @param cmdbl
+ */
int tmap_update(const std::string& oid, bufferlist& cmdbl);
+ /**
+ * replace object contents with provided encoded tmap data
+ *
+ * NOTE: this call steals the contents of @param bl
+ */
int tmap_put(const std::string& oid, bufferlist& bl);
int tmap_get(const std::string& oid, bufferlist& bl);
@@ -673,6 +732,7 @@ namespace librados
IoCtx(IoCtxImpl *io_ctx_impl_);
friend class Rados; // Only Rados can use our private constructor to create IoCtxes.
+ friend class ObjectWriteOperation; // copy_from needs to see our IoCtxImpl
IoCtxImpl *io_ctx_impl;
};
@@ -729,7 +789,12 @@ namespace librados
int cluster_stat(cluster_stat_t& result);
int cluster_fsid(std::string *fsid);
- /* pool aio */
+ /*
+ * pool aio
+ *
+ * It is up to the caller to release the completion handler, even if the pool_create_async()
+ * and/or pool_delete_async() fails and does not send the async request
+ */
static PoolAsyncCompletion *pool_async_create_completion();
// -- aio --
diff --git a/src/include/types.h b/src/include/types.h
index 7e6ddb7117e..5a9e6f6d4c9 100644
--- a/src/include/types.h
+++ b/src/include/types.h
@@ -11,12 +11,11 @@
* Foundation. See file COPYING.
*
*/
-
#ifndef CEPH_TYPES_H
#define CEPH_TYPES_H
// this is needed for ceph_fs to compile in userland
-#include "inttypes.h"
+#include "int_types.h"
#include "byteorder.h"
#include "uuid.h"
@@ -380,7 +379,7 @@ inline ostream& operator<<(ostream& out, const prettybyte_t& b)
if (b.v > bump_after << 20)
return out << (b.v >> 20) << " MB";
if (b.v > bump_after << 10)
- return out << (b.v >> 10) << " KB";
+ return out << (b.v >> 10) << " kB";
return out << b.v << " bytes";
}
@@ -403,7 +402,7 @@ inline ostream& operator<<(ostream& out, const si_t& b)
if (b.v > bump_after << 20)
return out << (b.v >> 20) << "M";
if (b.v > bump_after << 10)
- return out << (b.v >> 10) << "K";
+ return out << (b.v >> 10) << "k";
return out << b.v;
}
@@ -426,7 +425,7 @@ inline ostream& operator<<(ostream& out, const pretty_si_t& b)
if (b.v > bump_after << 20)
return out << (b.v >> 20) << " M";
if (b.v > bump_after << 10)
- return out << (b.v >> 10) << " K";
+ return out << (b.v >> 10) << " k";
return out << b.v << " ";
}
@@ -446,7 +445,7 @@ inline ostream& operator<<(ostream& out, const kb_t& kb)
return out << (kb.v >> 20) << " GB";
if (kb.v > bump_after << 10)
return out << (kb.v >> 10) << " MB";
- return out << kb.v << " KB";
+ return out << kb.v << " kB";
}
inline ostream& operator<<(ostream& out, const ceph_mon_subscribe_item& i)
diff --git a/src/init-ceph.in b/src/init-ceph.in
index 3a404a46c6f..46877d75558 100644
--- a/src/init-ceph.in
+++ b/src/init-ceph.in
@@ -80,7 +80,7 @@ stop_daemon() {
action=$5
[ -z "$action" ] && action="Stopping"
echo -n "$action Ceph $name on $host..."
- do_cmd "while [ 1 ]; do
+ do_cmd "while [ 1 ]; do
[ -e $pidfile ] || break
pid=\`cat $pidfile\`
while [ -e /proc/\$pid ] && grep -q $daemon /proc/\$pid/cmdline ; do
@@ -172,6 +172,14 @@ command=$1
get_local_name_list
get_name_list "$@"
+# Reverse the order if we are stopping
+if [ "$command" = "stop" ]; then
+ for f in $what; do
+ new_order="$f $new_order"
+ done
+ what="$new_order"
+fi
+
for name in $what; do
type=`echo $name | cut -c 1-3` # e.g. 'mon', if $item is 'mon1'
id=`echo $name | cut -c 4- | sed 's/^\\.//'`
@@ -251,18 +259,18 @@ for name in $what; do
wrap=""
runmode=""
runarg=""
-
+
[ -z "$docrun" ] && get_conf_bool docrun "0" "restart on core dump"
[ "$docrun" -eq 1 ] && wrap="$BINDIR/ceph-run"
-
+
[ -z "$dovalgrind" ] && get_conf_bool valgrind "" "valgrind"
[ -n "$valgrind" ] && wrap="$wrap valgrind $valgrind"
-
+
[ -n "$wrap" ] && runmode="-f &" && runarg="-f"
[ -n "$max_open_files" ] && files="ulimit -n $max_open_files;"
cmd="$files $wrap $cmd $runmode"
-
+
if [ $dofsmount -eq 1 ] && [ -n "$fs_devs" ]; then
get_conf pre_mount "true" "pre mount command"
get_conf fs_type "" "osd mkfs type"
@@ -361,7 +369,7 @@ for name in $what; do
[ -n "$post_start" ] && do_cmd "$post_start"
[ -n "$lockfile" ] && [ "$?" -eq 0 ] && touch $lockfile
;;
-
+
stop)
get_conf pre_stop "" "pre stop command"
get_conf post_stop "" "post stop command"
@@ -402,13 +410,13 @@ for name in $what; do
[ -n "$post_forcestop" ] && do_cmd "$post_forcestop"
[ -n "$lockfile" ] && [ "$?" -eq 0 ] && rm -f $lockfile
;;
-
+
killall)
echo "killall ceph-$type on $host"
do_cmd "pkill ^ceph-$type || true"
[ -n "$lockfile" ] && [ "$?" -eq 0 ] && rm -f $lockfile
;;
-
+
force-reload | reload)
signal_daemon $name ceph-$type $pid_file -1 "Reloading"
;;
diff --git a/src/init-radosgw b/src/init-radosgw
index a526441c3a0..de1e01e0018 100644
--- a/src/init-radosgw
+++ b/src/init-radosgw
@@ -83,7 +83,7 @@ case "$1" in
echo "$RADOSGW is running."
else
echo "$RADOSGW is not running."
- RETVAL=1
+ exit 1
fi
;;
*)
diff --git a/src/init-radosgw.sysv b/src/init-radosgw.sysv
index e08e1bd1b1b..017e3f9bc07 100644
--- a/src/init-radosgw.sysv
+++ b/src/init-radosgw.sysv
@@ -90,7 +90,7 @@ case "$1" in
echo "$RADOSGW is running."
else
echo "$RADOSGW is not running."
- RETVAL=1
+ exit 1
fi
;;
*)
diff --git a/src/java/Makefile.am b/src/java/Makefile.am
index ac7e86cf9fa..8b28f839e46 100644
--- a/src/java/Makefile.am
+++ b/src/java/Makefile.am
@@ -64,7 +64,6 @@ BUILT_SOURCES = $(JAVA_H)
if HAVE_JUNIT4
JAVA_TEST_CLASSES = $(JAVA_TEST_SRC:test/%.java=%.class)
-ESCAPED_JAVA_TEST_CLASSES = com/ceph/fs/CephAllTests\$$1.class
CEPH_TEST_PROXY=test/com/ceph/fs/CephMountTest.class
@@ -73,7 +72,7 @@ $(CEPH_TEST_PROXY): $(JAVA_TEST_SRC) $(CEPH_PROXY)
$(JAVAC) -source 1.5 -target 1.5 -Xlint:-options test/com/ceph/fs/*.java
libcephfs-test.jar: $(CEPH_TEST_PROXY)
- $(JAR) cf $@ $(JAVA_TEST_CLASSES:%=-C test %) $(ESCAPED_JAVA_TEST_CLASSES:%=-C test %)
+ $(JAR) cf $@ $(JAVA_TEST_CLASSES:%=-C test %)
java_DATA += libcephfs-test.jar
diff --git a/src/java/test/com/ceph/fs/CephAllTests.java b/src/java/test/com/ceph/fs/CephAllTests.java
index 71c2ddfee96..039ad6da3b7 100644
--- a/src/java/test/com/ceph/fs/CephAllTests.java
+++ b/src/java/test/com/ceph/fs/CephAllTests.java
@@ -23,7 +23,6 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.UUID;
import org.junit.*;
-import org.junit.rules.ExternalResource;
import org.junit.runners.Suite;
import org.junit.runner.RunWith;
import static org.junit.Assert.*;
@@ -42,16 +41,4 @@ import static org.junit.Assert.*;
*/
public class CephAllTests{
- @Rule
- public static ExternalResource testRule = new ExternalResource(){
- @Override
- protected void before() throws Throwable{
- // Add debugging messages or setup code here
- };
-
- @Override
- protected void after(){
- // Add debugging messages or cleanup code here
- };
- };
}
diff --git a/src/json_spirit/Makefile.am b/src/json_spirit/Makefile.am
new file mode 100644
index 00000000000..43025be0e71
--- /dev/null
+++ b/src/json_spirit/Makefile.am
@@ -0,0 +1,18 @@
+libjson_spirit_la_SOURCES = \
+ json_spirit/json_spirit_reader.cpp \
+ json_spirit/json_spirit_writer.cpp \
+ json_spirit/json_spirit_value.cpp
+noinst_LTLIBRARIES += libjson_spirit.la
+
+noinst_HEADERS += \
+ json_spirit/json_spirit.h \
+ json_spirit/json_spirit_error_position.h \
+ json_spirit/json_spirit_reader.h \
+ json_spirit/json_spirit_reader_template.h \
+ json_spirit/json_spirit_stream_reader.h \
+ json_spirit/json_spirit_utils.h \
+ json_spirit/json_spirit_value.h \
+ json_spirit/json_spirit_writer.h \
+ json_spirit/json_spirit_writer_options.h \
+ json_spirit/json_spirit_writer_template.h
+
diff --git a/src/key_value_store/Makefile.am b/src/key_value_store/Makefile.am
new file mode 100644
index 00000000000..e5bae00155f
--- /dev/null
+++ b/src/key_value_store/Makefile.am
@@ -0,0 +1,10 @@
+libcls_kvs_la_SOURCES = key_value_store/cls_kvs.cc
+libcls_kvs_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libcls_kvs_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*'
+radoslib_LTLIBRARIES += libcls_kvs.la
+
+noinst_HEADERS += \
+ key_value_store/key_value_structure.h \
+ key_value_store/kv_flat_btree_async.h \
+ key_value_store/kvs_arg_types.h
+
diff --git a/src/librados/AioCompletionImpl.h b/src/librados/AioCompletionImpl.h
index a40282a7c25..63a56db8aa8 100644
--- a/src/librados/AioCompletionImpl.h
+++ b/src/librados/AioCompletionImpl.h
@@ -32,7 +32,7 @@ struct librados::AioCompletionImpl {
int ref, rval;
bool released;
bool ack, safe;
- eversion_t objver;
+ version_t objver;
rados_callback_t callback_complete, callback_safe;
void *callback_complete_arg, *callback_safe_arg;
@@ -49,6 +49,7 @@ struct librados::AioCompletionImpl {
AioCompletionImpl() : lock("AioCompletionImpl lock", false, false),
ref(1), rval(0), released(false), ack(false), safe(false),
+ objver(0),
callback_complete(0),
callback_safe(0),
callback_complete_arg(0),
@@ -130,9 +131,9 @@ struct librados::AioCompletionImpl {
}
uint64_t get_version() {
lock.Lock();
- eversion_t v = objver;
+ version_t v = objver;
lock.Unlock();
- return v.version;
+ return v;
}
void get() {
diff --git a/src/librados/IoCtxImpl.cc b/src/librados/IoCtxImpl.cc
index c7900585458..aaa987316a3 100644
--- a/src/librados/IoCtxImpl.cc
+++ b/src/librados/IoCtxImpl.cc
@@ -502,7 +502,7 @@ int librados::IoCtxImpl::operate(const object_t& oid, ::ObjectOperation *o,
Cond cond;
bool done;
int r;
- eversion_t ver;
+ version_t ver;
Context *oncommit = new C_SafeCond(&mylock, &cond, &done, &r);
@@ -536,7 +536,7 @@ int librados::IoCtxImpl::operate_read(const object_t& oid,
Cond cond;
bool done;
int r;
- eversion_t ver;
+ version_t ver;
Context *onack = new C_SafeCond(&mylock, &cond, &done, &r);
@@ -609,7 +609,6 @@ int librados::IoCtxImpl::aio_read(const object_t oid, AioCompletionImpl *c,
return -EDOM;
Context *onack = new C_aio_Ack(c);
- eversion_t ver;
c->is_read = true;
c->io = this;
@@ -1002,7 +1001,7 @@ int librados::IoCtxImpl::getxattrs(const object_t& oid,
return r;
}
-void librados::IoCtxImpl::set_sync_op_version(eversion_t& ver)
+void librados::IoCtxImpl::set_sync_op_version(version_t ver)
{
last_objver = ver;
}
@@ -1016,7 +1015,7 @@ int librados::IoCtxImpl::watch(const object_t& oid, uint64_t ver,
bool done;
int r;
Context *onfinish = new C_SafeCond(&mylock, &cond, &done, &r);
- eversion_t objver;
+ version_t objver;
lock->Lock();
@@ -1071,7 +1070,7 @@ int librados::IoCtxImpl::unwatch(const object_t& oid, uint64_t cookie)
bool done;
int r;
Context *oncommit = new C_SafeCond(&mylock, &cond, &done, &r);
- eversion_t ver;
+ version_t ver;
lock->Lock();
client->unregister_watcher(cookie);
@@ -1102,7 +1101,7 @@ int librados::IoCtxImpl::notify(const object_t& oid, uint64_t ver, bufferlist& b
bool done, done_all;
int r;
Context *onack = new C_SafeCond(&mylock, &cond, &done, &r);
- eversion_t objver;
+ version_t objver;
uint64_t cookie;
C_NotifyComplete *ctx = new C_NotifyComplete(&mylock_all, &cond_all, &done_all);
@@ -1144,7 +1143,7 @@ int librados::IoCtxImpl::notify(const object_t& oid, uint64_t ver, bufferlist& b
return r;
}
-eversion_t librados::IoCtxImpl::last_version()
+version_t librados::IoCtxImpl::last_version()
{
return last_objver;
}
diff --git a/src/librados/IoCtxImpl.h b/src/librados/IoCtxImpl.h
index 74ca1d09880..ccecd4e8184 100644
--- a/src/librados/IoCtxImpl.h
+++ b/src/librados/IoCtxImpl.h
@@ -37,7 +37,7 @@ struct librados::IoCtxImpl {
::SnapContext snapc;
uint64_t assert_ver;
map<object_t, uint64_t> assert_src_version;
- eversion_t last_objver;
+ version_t last_objver;
uint32_t notify_timeout;
object_locator_t oloc;
@@ -183,7 +183,7 @@ struct librados::IoCtxImpl {
int pool_change_auid(unsigned long long auid);
int pool_change_auid_async(unsigned long long auid, PoolAsyncCompletionImpl *c);
- void set_sync_op_version(eversion_t& ver);
+ void set_sync_op_version(version_t ver);
int watch(const object_t& oid, uint64_t ver, uint64_t *cookie, librados::WatchCtx *ctx);
int unwatch(const object_t& oid, uint64_t cookie);
int notify(const object_t& oid, uint64_t ver, bufferlist& bl);
@@ -191,7 +191,7 @@ struct librados::IoCtxImpl {
const object_t& oid, uint64_t notify_id, uint64_t ver,
uint64_t cookie);
- eversion_t last_version();
+ version_t last_version();
void set_assert_version(uint64_t ver);
void set_assert_src_version(const object_t& oid, uint64_t ver);
void set_notify_timeout(uint32_t timeout);
diff --git a/src/librados/Makefile.am b/src/librados/Makefile.am
new file mode 100644
index 00000000000..570aa91af93
--- /dev/null
+++ b/src/librados/Makefile.am
@@ -0,0 +1,20 @@
+librados_la_SOURCES = \
+ librados/librados.cc \
+ librados/RadosClient.cc \
+ librados/IoCtxImpl.cc \
+ librados/snap_set_diff.cc
+
+# We need this to avoid basename conflicts with the librados build tests in test/Makefile.am
+librados_la_CFLAGS = ${AM_CFLAGS}
+
+LIBRADOS_DEPS += libcls_lock_client.la $(LIBOSDC) $(LIBCOMMON)
+librados_la_LIBADD = $(LIBRADOS_DEPS) $(PTHREAD_LIBS) $(CRYPTO_LIBS) $(EXTRALIBS)
+librados_la_LDFLAGS = ${AM_LDFLAGS} -version-info 2:0:0 -export-symbols-regex '^rados_.*'
+lib_LTLIBRARIES += librados.la
+
+noinst_HEADERS += \
+ librados/snap_set_diff.h \
+ librados/AioCompletionImpl.h \
+ librados/IoCtxImpl.h \
+ librados/PoolAsyncCompletionImpl.h \
+ librados/RadosClient.h
diff --git a/src/librados/PoolAsyncCompletionImpl.h b/src/librados/PoolAsyncCompletionImpl.h
index efb89641466..443b2c23a17 100644
--- a/src/librados/PoolAsyncCompletionImpl.h
+++ b/src/librados/PoolAsyncCompletionImpl.h
@@ -94,6 +94,9 @@ namespace librados {
C_PoolAsync_Safe(PoolAsyncCompletionImpl *_c) : c(_c) {
c->get();
}
+ ~C_PoolAsync_Safe() {
+ c->put();
+ }
void finish(int r) {
c->lock.Lock();
@@ -109,7 +112,7 @@ namespace librados {
c->lock.Lock();
}
- c->put_unlock();
+ c->lock.Unlock();
}
};
}
diff --git a/src/librados/RadosClient.cc b/src/librados/RadosClient.cc
index 8a5f499ec15..1be3ebd10f9 100644
--- a/src/librados/RadosClient.cc
+++ b/src/librados/RadosClient.cc
@@ -459,7 +459,6 @@ int librados::RadosClient::pool_create_async(string& name, PoolAsyncCompletionIm
Context *onfinish = new C_PoolAsync_Safe(c);
int r = objecter->create_pool(name, onfinish, auid, crush_rule);
if (r < 0) {
- delete c;
delete onfinish;
}
return r;
@@ -505,7 +504,6 @@ int librados::RadosClient::pool_delete_async(const char *name, PoolAsyncCompleti
Context *onfinish = new C_PoolAsync_Safe(c);
int r = objecter->delete_pool(tmp_pool_id, onfinish);
if (r < 0) {
- delete c;
delete onfinish;
}
return r;
diff --git a/src/librados/librados.cc b/src/librados/librados.cc
index 0a36092a3d9..217a0a7bfb2 100644
--- a/src/librados/librados.cc
+++ b/src/librados/librados.cc
@@ -269,6 +269,14 @@ void librados::ObjectReadOperation::list_snaps(
o->list_snaps(out_snaps, prval);
}
+void librados::ObjectReadOperation::is_dirty(bool *is_dirty, int *prval)
+{
+ ::ObjectOperation *o = (::ObjectOperation *)impl;
+ o->is_dirty(is_dirty, prval);
+}
+
+
+
int librados::IoCtx::omap_get_vals(const std::string& oid,
const std::string& start_after,
const std::string& filter_prefix,
@@ -382,6 +390,20 @@ void librados::ObjectWriteOperation::omap_rm_keys(
o->omap_rm_keys(to_rm);
}
+void librados::ObjectWriteOperation::copy_from(const std::string& src,
+ const IoCtx& src_ioctx,
+ uint64_t src_version)
+{
+ ::ObjectOperation *o = (::ObjectOperation *)impl;
+ o->copy_from(object_t(src), src_ioctx.io_ctx_impl->snap_seq, src_ioctx.io_ctx_impl->oloc, src_version);
+}
+
+void librados::ObjectWriteOperation::undirty()
+{
+ ::ObjectOperation *o = (::ObjectOperation *)impl;
+ o->undirty();
+}
+
void librados::ObjectWriteOperation::tmap_put(const bufferlist &bl)
{
::ObjectOperation *o = (::ObjectOperation *)impl;
@@ -592,6 +614,12 @@ int librados::AioCompletion::AioCompletion::get_version()
return c->get_version();
}
+uint64_t librados::AioCompletion::AioCompletion::get_version64()
+{
+ AioCompletionImpl *c = (AioCompletionImpl *)pc;
+ return c->get_version();
+}
+
void librados::AioCompletion::AioCompletion::release()
{
AioCompletionImpl *c = (AioCompletionImpl *)pc;
@@ -944,6 +972,8 @@ int librados::IoCtx::aio_operate(const std::string& oid, AioCompletion *c,
op_flags |= CEPH_OSD_FLAG_BALANCE_READS;
if (flags & OPERATION_LOCALIZE_READS)
op_flags |= CEPH_OSD_FLAG_LOCALIZE_READS;
+ if (flags & OPERATION_ORDER_READS_WRITES)
+ op_flags |= CEPH_OSD_FLAG_RWORDERED;
return io_ctx_impl->aio_operate_read(obj, (::ObjectOperation*)o->impl, c->pc,
op_flags, pbl);
@@ -1108,8 +1138,7 @@ const librados::ObjectIterator& librados::IoCtx::objects_end() const
uint64_t librados::IoCtx::get_last_version()
{
- eversion_t ver = io_ctx_impl->last_version();
- return ver.version;
+ return io_ctx_impl->last_version();
}
int librados::IoCtx::aio_read(const std::string& oid, librados::AioCompletion *c,
@@ -2142,8 +2171,7 @@ extern "C" int rados_read(rados_ioctx_t io, const char *o, char *buf, size_t len
extern "C" uint64_t rados_get_last_version(rados_ioctx_t io)
{
librados::IoCtxImpl *ctx = (librados::IoCtxImpl *)io;
- eversion_t ver = ctx->last_version();
- return ver.version;
+ return ctx->last_version();
}
extern "C" int rados_pool_create(rados_t cluster, const char *name)
diff --git a/src/librbd/AioRequest.h b/src/librbd/AioRequest.h
index 7625bdd7ff3..cf50ee2c049 100644
--- a/src/librbd/AioRequest.h
+++ b/src/librbd/AioRequest.h
@@ -3,9 +3,9 @@
#ifndef CEPH_LIBRBD_AIOREQUEST_H
#define CEPH_LIBRBD_AIOREQUEST_H
-#include <map>
+#include "include/int_types.h"
-#include "inttypes.h"
+#include <map>
#include "common/snap_types.h"
#include "include/buffer.h"
diff --git a/src/librbd/ImageCtx.h b/src/librbd/ImageCtx.h
index 29ca2f197ea..c9e74393e13 100644
--- a/src/librbd/ImageCtx.h
+++ b/src/librbd/ImageCtx.h
@@ -3,7 +3,7 @@
#ifndef CEPH_LIBRBD_IMAGECTX_H
#define CEPH_LIBRBD_IMAGECTX_H
-#include <inttypes.h>
+#include "include/int_types.h"
#include <map>
#include <set>
diff --git a/src/librbd/Makefile.am b/src/librbd/Makefile.am
new file mode 100644
index 00000000000..d4e2455c8c1
--- /dev/null
+++ b/src/librbd/Makefile.am
@@ -0,0 +1,24 @@
+librbd_la_SOURCES = \
+ librbd/librbd.cc \
+ librbd/AioCompletion.cc \
+ librbd/AioRequest.cc \
+ librbd/ImageCtx.cc \
+ librbd/internal.cc \
+ librbd/LibrbdWriteback.cc \
+ librbd/WatchCtx.cc
+librbd_la_LIBADD = \
+ $(LIBRADOS) $(LIBOSDC) \
+ libcls_rbd_client.la libcls_lock_client.la \
+ $(PTHREAD_LIBS) $(EXTRALIBS)
+librbd_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '^rbd_.*'
+lib_LTLIBRARIES += librbd.la
+
+noinst_HEADERS += \
+ librbd/AioCompletion.h \
+ librbd/AioRequest.h \
+ librbd/ImageCtx.h \
+ librbd/internal.h \
+ librbd/LibrbdWriteback.h \
+ librbd/parent_types.h \
+ librbd/SnapInfo.h \
+ librbd/WatchCtx.h
diff --git a/src/librbd/SnapInfo.h b/src/librbd/SnapInfo.h
index 5a7852c3c66..44dd4cf208e 100644
--- a/src/librbd/SnapInfo.h
+++ b/src/librbd/SnapInfo.h
@@ -3,7 +3,7 @@
#ifndef CEPH_LIBRBD_SNAPINFO_H
#define CEPH_LIBRBD_SNAPINFO_H
-#include <inttypes.h>
+#include "include/int_types.h"
#include "include/rados/librados.hpp"
diff --git a/src/librbd/WatchCtx.h b/src/librbd/WatchCtx.h
index 0c0802a4035..9872c84307b 100644
--- a/src/librbd/WatchCtx.h
+++ b/src/librbd/WatchCtx.h
@@ -3,7 +3,7 @@
#ifndef CEPH_LIBRBD_WATCHCTX_H
#define CEPH_LIBRBD_WATCHCTX_H
-#include <inttypes.h>
+#include "include/int_types.h"
#include "common/Mutex.h"
#include "include/buffer.h"
diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc
index 9c9ae16dfa4..b8a757ab333 100644
--- a/src/librbd/internal.cc
+++ b/src/librbd/internal.cc
@@ -1,5 +1,7 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
+#include "include/int_types.h"
+
#include <errno.h>
#include <limits.h>
@@ -8,7 +10,6 @@
#include "common/errno.h"
#include "common/Throttle.h"
#include "cls/lock/cls_lock_client.h"
-#include "include/inttypes.h"
#include "include/stringify.h"
#include "cls/rbd/cls_rbd.h"
@@ -2845,7 +2846,7 @@ reprotect_and_return_err:
{
CephContext *cct = ictx->cct;
ldout(cct, 20) << "aio_write " << ictx << " off = " << off << " len = "
- << len << " buf = " << &buf << dendl;
+ << len << " buf = " << (void*)buf << dendl;
if (!len)
return 0;
diff --git a/src/librbd/internal.h b/src/librbd/internal.h
index 33f5191706d..43458886b43 100644
--- a/src/librbd/internal.h
+++ b/src/librbd/internal.h
@@ -3,7 +3,7 @@
#ifndef CEPH_LIBRBD_INTERNAL_H
#define CEPH_LIBRBD_INTERNAL_H
-#include <inttypes.h>
+#include "include/int_types.h"
#include <map>
#include <set>
diff --git a/src/librbd/librbd.cc b/src/librbd/librbd.cc
index af413dda04f..cad0c5eb172 100644
--- a/src/librbd/librbd.cc
+++ b/src/librbd/librbd.cc
@@ -11,9 +11,9 @@
* Foundation. See file COPYING.
*
*/
+#include "include/int_types.h"
#include <errno.h>
-#include <inttypes.h>
#include "common/Cond.h"
#include "common/dout.h"
diff --git a/src/log/Makefile.am b/src/log/Makefile.am
new file mode 100644
index 00000000000..b66e6cf84ce
--- /dev/null
+++ b/src/log/Makefile.am
@@ -0,0 +1,11 @@
+liblog_la_SOURCES = \
+ log/Log.cc \
+ log/SubsystemMap.cc
+noinst_LTLIBRARIES += liblog.la
+
+noinst_HEADERS += \
+ log/Entry.h \
+ log/EntryQueue.h \
+ log/Log.h \
+ log/SubsystemMap.h
+
diff --git a/src/mds/CDentry.cc b/src/mds/CDentry.cc
index 5ff6e61fbe0..05766587930 100644
--- a/src/mds/CDentry.cc
+++ b/src/mds/CDentry.cc
@@ -567,4 +567,14 @@ void CDentry::remove_client_lease(ClientLease *l, Locker *locker)
locker->eval_gather(&lock);
}
-
+void CDentry::_put()
+{
+ if (get_num_ref() <= (int)is_dirty() + 1) {
+ CDentry::linkage_t *dnl = get_projected_linkage();
+ if (dnl->is_primary()) {
+ CInode *in = dnl->get_inode();
+ if (get_num_ref() == (int)is_dirty() + !!in->get_num_ref())
+ in->mdcache->maybe_eval_stray(in, true);
+ }
+ }
+}
diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h
index 0d2445a525f..e40854adfaa 100644
--- a/src/mds/CDentry.h
+++ b/src/mds/CDentry.h
@@ -76,6 +76,8 @@ public:
static const int STATE_FRAGMENTING = (1<<1);
static const int STATE_PURGING = (1<<2);
static const int STATE_BADREMOTEINO = (1<<3);
+ // stray dentry needs notification of releasing reference
+ static const int STATE_STRAY = STATE_NOTIFYREF;
// -- pins --
static const int PIN_INODEPIN = 1; // linked inode is pinned
@@ -146,6 +148,7 @@ protected:
public:
elist<CDentry*>::item item_dirty;
+ elist<CDentry*>::item item_stray;
protected:
int auth_pins, nested_auth_pins;
@@ -254,6 +257,7 @@ public:
void last_put() {
lru_unpin();
}
+ void _put();
// auth pins
bool can_auth_pin();
diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index 2b991d78fde..4a5e636d9a6 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -27,7 +27,7 @@
#include "MDLog.h"
#include "LogSegment.h"
-#include "include/bloom_filter.hpp"
+#include "common/bloom_filter.hpp"
#include "include/Context.h"
#include "common/Clock.h"
@@ -655,6 +655,14 @@ void CDir::remove_null_dentries() {
assert(get_num_any() == items.size());
}
+void CDir::touch_dentries_bottom() {
+ dout(12) << "touch_dentries_bottom " << *this << dendl;
+
+ for (CDir::map_t::iterator p = items.begin();
+ p != items.end();
+ ++p)
+ inode->mdcache->touch_dentry_bottom(p->second);
+}
bool CDir::try_trim_snap_dentry(CDentry *dn, const set<snapid_t>& snaps)
{
@@ -1461,6 +1469,7 @@ void CDir::_fetched(bufferlist &bl, const string& want_dn)
}
bool purged_any = false;
+ bool stray = inode->is_stray();
//int num_new_inodes_loaded = 0;
loff_t baseoff = p.get_off();
@@ -1605,6 +1614,12 @@ void CDir::_fetched(bufferlist &bl, const string& want_dn)
if (in->inode.is_dirty_rstat())
in->mark_dirty_rstat();
+ if (stray) {
+ dn->state_set(CDentry::STATE_STRAY);
+ if (in->inode.nlink == 0)
+ in->state_set(CInode::STATE_ORPHAN);
+ }
+
//in->hack_accessed = false;
//in->hack_load_stamp = ceph_clock_now(g_ceph_context);
//num_new_inodes_loaded++;
diff --git a/src/mds/CDir.h b/src/mds/CDir.h
index 7cf2b6a43d7..86da4e5dfd3 100644
--- a/src/mds/CDir.h
+++ b/src/mds/CDir.h
@@ -357,6 +357,7 @@ private:
void remove_null_dentries();
void purge_stale_snap_data(const set<snapid_t>& snaps);
public:
+ void touch_dentries_bottom();
bool try_trim_snap_dentry(CDentry *dn, const set<snapid_t>& snaps);
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index dd483263b6d..7accc5a4dba 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -12,7 +12,8 @@
*
*/
-#include <inttypes.h>
+#include "include/int_types.h"
+
#include <string>
#include <stdio.h>
@@ -681,6 +682,12 @@ void CInode::last_put()
parent->put(CDentry::PIN_INODEPIN);
}
+void CInode::_put()
+{
+ if (get_num_ref() == (int)is_dirty() + (int)is_dirty_parent())
+ mdcache->maybe_eval_stray(this, true);
+}
+
void CInode::add_remote_parent(CDentry *p)
{
if (remote_parents.empty())
@@ -1072,7 +1079,6 @@ void CInode::_stored_backtrace(version_t v, Context *fin)
clear_dirty_parent();
if (fin)
fin->complete(0);
- mdcache->maybe_eval_stray(this);
}
void CInode::_mark_dirty_parent(LogSegment *ls, bool dirty_pool)
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
index 8e760220c14..1c2a9339c1c 100644
--- a/src/mds/CInode.h
+++ b/src/mds/CInode.h
@@ -156,6 +156,8 @@ public:
static const int STATE_STRAYPINNED = (1<<16);
static const int STATE_FROZENAUTHPIN = (1<<17);
static const int STATE_DIRTYPOOL = (1<<18);
+ // orphan inode needs notification of releasing reference
+ static const int STATE_ORPHAN = STATE_NOTIFYREF;
static const int MASK_STATE_EXPORTED =
(STATE_DIRTY|STATE_NEEDSRECOVER|STATE_DIRTYPARENT|STATE_DIRTYPOOL);
@@ -812,6 +814,7 @@ public:
}
void first_get();
void last_put();
+ void _put();
// -- hierarchy stuff --
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 99bd761e0f7..19c9176f414 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -1640,9 +1640,6 @@ void Locker::file_update_finish(CInode *in, Mutation *mut, bool share, client_t
share_inode_max_size(in);
}
issue_caps_set(need_issue);
-
- // unlinked stray? may need to purge (e.g., after all caps are released)
- mdcache->maybe_eval_stray(in);
}
Capability* Locker::issue_new_caps(CInode *in,
@@ -3011,8 +3008,6 @@ void Locker::remove_client_cap(CInode *in, client_t client)
}
try_eval(in, CEPH_CAP_LOCKS);
-
- mds->mdcache->maybe_eval_stray(in);
}
diff --git a/src/mds/LogEvent.cc b/src/mds/LogEvent.cc
index b775b6d9501..16e7f803196 100644
--- a/src/mds/LogEvent.cc
+++ b/src/mds/LogEvent.cc
@@ -46,10 +46,16 @@ LogEvent *LogEvent::decode(bufferlist& bl)
::decode(type, p);
if (EVENT_NEW_ENCODING == type) {
- DECODE_START(1, p);
- ::decode(type, p);
- event = decode_event(bl, p, type);
- DECODE_FINISH(p);
+ try {
+ DECODE_START(1, p);
+ ::decode(type, p);
+ event = decode_event(bl, p, type);
+ DECODE_FINISH(p);
+ }
+ catch (const buffer::error &e) {
+ generic_dout(0) << "failed to decode LogEvent (type maybe " << type << ")" << dendl;
+ return NULL;
+ }
} else { // we are using classic encoding
event = decode_event(bl, p, type);
}
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 86b380f2827..0188d418e0d 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -127,7 +127,8 @@ long g_num_caps = 0;
set<int> SimpleLock::empty_gather_set;
-MDCache::MDCache(MDS *m)
+MDCache::MDCache(MDS *m) :
+ delayed_eval_stray(member_offset(CDentry, item_stray))
{
mds = m;
migrator = new Migrator(mds, this);
@@ -631,7 +632,7 @@ void MDCache::populate_mydir()
CDir *dir = strays[i]->get_dirfrag(fg);
if (!dir)
dir = strays[i]->get_or_open_dirfrag(this, fg);
- if (!dir->is_complete()) {
+ if (dir->get_version() == 0) {
dir->fetch(new C_MDS_RetryOpenRoot(this));
return;
}
@@ -652,6 +653,8 @@ void MDCache::populate_mydir()
assert(!open);
open = true;
mds->queue_waiters(waiting_for_open);
+
+ scan_stray_dir();
}
void MDCache::open_foreign_mdsdir(inodeno_t ino, Context *fin)
@@ -676,6 +679,7 @@ CDentry *MDCache::get_or_create_stray_dentry(CInode *in)
} else
assert(straydn->get_projected_linkage()->is_null());
+ straydn->state_set(CDentry::STATE_STRAY);
return straydn;
}
@@ -5934,8 +5938,9 @@ void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
mds->mdlog->start_entry(le);
- le->metablob.add_dir_context(in->get_parent_dir());
- le->metablob.add_primary_dentry(in->get_projected_parent_dn(), in, true);
+ CDentry *dn = in->get_projected_parent_dn();
+ le->metablob.add_dir_context(dn->get_dir());
+ le->metablob.add_primary_dentry(dn, in, true);
le->metablob.add_truncate_finish(in->ino(), ls->offset);
journal_dirty_inode(mut, &le->metablob, in);
@@ -6017,8 +6022,15 @@ bool MDCache::trim(int max)
}
dout(7) << "trim max=" << max << " cur=" << lru.lru_get_size() << dendl;
- map<int, MCacheExpire*> expiremap;
+ // process delayed eval_stray()
+ for (elist<CDentry*>::iterator p = delayed_eval_stray.begin(); !p.end(); ) {
+ CDentry *dn = *p;
+ ++p;
+ dn->item_stray.remove_myself();
+ eval_stray(dn);
+ }
+ map<int, MCacheExpire*> expiremap;
bool is_standby_replay = mds->is_standby_replay();
int unexpirable = 0;
list<CDentry*> unexpirables;
@@ -6026,13 +6038,12 @@ bool MDCache::trim(int max)
while (lru.lru_get_size() + unexpirable > (unsigned)max) {
CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
if (!dn) break;
- if (is_standby_replay && dn->get_linkage() &&
- dn->get_linkage()->inode->item_open_file.is_on_list()) {
+ if ((is_standby_replay && dn->get_linkage() &&
+ dn->get_linkage()->inode->item_open_file.is_on_list()) ||
+ trim_dentry(dn, expiremap)) {
unexpirables.push_back(dn);
++unexpirable;
- continue;
}
- trim_dentry(dn, expiremap);
}
for(list<CDentry*>::iterator i = unexpirables.begin();
i != unexpirables.end();
@@ -6087,7 +6098,7 @@ void MDCache::send_expire_messages(map<int, MCacheExpire*>& expiremap)
}
-void MDCache::trim_dentry(CDentry *dn, map<int, MCacheExpire*>& expiremap)
+bool MDCache::trim_dentry(CDentry *dn, map<int, MCacheExpire*>& expiremap)
{
dout(12) << "trim_dentry " << *dn << dendl;
@@ -6142,6 +6153,9 @@ void MDCache::trim_dentry(CDentry *dn, map<int, MCacheExpire*>& expiremap)
CInode *in = dnl->get_inode();
assert(in);
trim_inode(dn, in, con, expiremap);
+ // purging stray instead of trimming ?
+ if (dn->get_num_ref() > 0)
+ return true;
}
else {
assert(dnl->is_null());
@@ -6160,6 +6174,7 @@ void MDCache::trim_dentry(CDentry *dn, map<int, MCacheExpire*>& expiremap)
migrator->export_empty_import(dir);
if (mds->logger) mds->logger->inc(l_mds_iex);
+ return false;
}
@@ -6222,7 +6237,14 @@ void MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map<int, MCacheExpi
trim_dirfrag(*p, con ? con:*p, expiremap); // if no container (e.g. root dirfrag), use *p
// INODE
- if (!in->is_auth()) {
+ if (in->is_auth()) {
+ // eval stray after closing dirfrags
+ if (dn) {
+ maybe_eval_stray(in);
+ if (dn->get_num_ref() > 0)
+ return;
+ }
+ } else {
pair<int,int> auth = in->authority();
dirfrag_t df;
@@ -6305,6 +6327,12 @@ void MDCache::trim_non_auth()
// add back into lru (at the top)
lru.lru_insert_top(dn);
+ if (dn->get_dir()->get_inode()->is_stray()) {
+ dn->state_set(CDentry::STATE_STRAY);
+ if (dnl->is_primary() && dnl->get_inode()->inode.nlink == 0)
+ dnl->get_inode()->state_set(CInode::STATE_ORPHAN);
+ }
+
if (!first_auth) {
first_auth = dn;
} else {
@@ -6725,9 +6753,6 @@ void MDCache::inode_remove_replica(CInode *in, int from, set<SimpleLock *>& gath
if (in->nestlock.remove_replica(from)) gather_locks.insert(&in->nestlock);
if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
-
- // trim?
- maybe_eval_stray(in);
}
void MDCache::dentry_remove_replica(CDentry *dn, int from, set<SimpleLock *>& gather_locks)
@@ -6737,10 +6762,6 @@ void MDCache::dentry_remove_replica(CDentry *dn, int from, set<SimpleLock *>& ga
// fix lock
if (dn->lock.remove_replica(from))
gather_locks.insert(&dn->lock);
-
- CDentry::linkage_t *dnl = dn->get_projected_linkage();
- if (dnl->is_primary())
- maybe_eval_stray(dnl->get_inode());
}
void MDCache::trim_client_leases()
@@ -9116,19 +9137,34 @@ void MDCache::_snaprealm_create_finish(MDRequest *mdr, Mutation *mut, CInode *in
// -------------------------------------------------------------------------------
// STRAYS
-void MDCache::scan_stray_dir()
+struct C_MDC_RetryScanStray : public Context {
+ MDCache *cache;
+ dirfrag_t next;
+ C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : cache(c), next(n) { }
+ void finish(int r) {
+ cache->scan_stray_dir(next);
+ }
+};
+
+void MDCache::scan_stray_dir(dirfrag_t next)
{
- dout(10) << "scan_stray_dir" << dendl;
-
+ dout(10) << "scan_stray_dir " << next << dendl;
+
list<CDir*> ls;
for (int i = 0; i < NUM_STRAY; ++i) {
- if (strays[i]) {
- strays[i]->get_dirfrags(ls);
- }
+ if (strays[i]->ino() < next.ino)
+ continue;
+ strays[i]->get_dirfrags(ls);
}
for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
CDir *dir = *p;
+ if (dir->dirfrag() < next)
+ continue;
+ if (!dir->is_complete()) {
+ dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
+ return;
+ }
for (CDir::map_t::iterator q = dir->items.begin(); q != dir->items.end(); ++q) {
CDentry *dn = q->second;
CDentry::linkage_t *dnl = dn->get_projected_linkage();
@@ -9147,7 +9183,7 @@ struct C_MDC_EvalStray : public Context {
}
};
-void MDCache::eval_stray(CDentry *dn)
+void MDCache::eval_stray(CDentry *dn, bool delay)
{
dout(10) << "eval_stray " << *dn << dendl;
CDentry::linkage_t *dnl = dn->get_projected_linkage();
@@ -9211,9 +9247,13 @@ void MDCache::eval_stray(CDentry *dn)
dout(20) << " too many dn refs" << dendl;
return;
}
- purge_stray(dn);
+ if (delay) {
+ if (!dn->item_stray.is_on_list())
+ delayed_eval_stray.push_back(&dn->item_stray);
+ } else
+ purge_stray(dn);
}
- else if (in->inode.nlink == 1) {
+ else if (in->inode.nlink >= 1) {
// trivial reintegrate?
if (!in->remote_parents.empty()) {
CDentry *rlink = *in->remote_parents.begin();
@@ -9257,14 +9297,6 @@ void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Conte
mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
}
-void MDCache::remove_backtrace(inodeno_t ino, int64_t pool, Context *fin)
-{
- SnapContext snapc;
- object_t oid = CInode::get_object_name(ino, frag_t(), "");
- mds->objecter->removexattr(oid, object_locator_t(pool), "parent", snapc,
- ceph_clock_now(g_ceph_context), 0, NULL, fin);
-}
-
class C_MDC_PurgeStrayPurged : public Context {
MDCache *cache;
CDentry *dn;
@@ -9276,94 +9308,6 @@ public:
}
};
-class C_MDC_PurgeForwardingPointers : public Context {
- MDCache *cache;
- CDentry *dn;
-public:
- bufferlist bl;
- C_MDC_PurgeForwardingPointers(MDCache *c, CDentry *d) :
- cache(c), dn(d) {}
- void finish(int r) {
- cache->_purge_forwarding_pointers(bl, dn, r);
- }
-};
-
-class C_MDC_PurgeStray : public Context {
- MDCache *cache;
- CDentry *dn;
-public:
- C_MDC_PurgeStray(MDCache *c, CDentry *d) :
- cache(c), dn(d) {}
- void finish(int r) {
- cache->_purge_stray(dn, r);
- }
-};
-
-void MDCache::_purge_forwarding_pointers(bufferlist& bl, CDentry *dn, int r)
-{
- assert(r == 0 || r == -ENOENT || r == -ENODATA);
- inode_backtrace_t backtrace;
- if (r == 0)
- ::decode(backtrace, bl);
-
- // setup gathering context
- C_GatherBuilder gather_bld(g_ceph_context);
-
- // remove all the objects with forwarding pointer backtraces (aka sentinels)
- for (set<int64_t>::const_iterator i = backtrace.old_pools.begin();
- i != backtrace.old_pools.end();
- ++i) {
- SnapContext snapc;
- object_t oid = CInode::get_object_name(backtrace.ino, frag_t(), "");
- object_locator_t oloc(*i);
-
- mds->objecter->remove(oid, oloc, snapc, ceph_clock_now(g_ceph_context), 0,
- NULL, gather_bld.new_sub());
- }
-
- if (gather_bld.has_subs()) {
- gather_bld.set_finisher(new C_MDC_PurgeStray(this, dn));
- gather_bld.activate();
- } else {
- _purge_stray(dn, r);
- }
-}
-
-void MDCache::_purge_stray(CDentry *dn, int r)
-{
- // purge the strays
- CDentry::linkage_t *dnl = dn->get_projected_linkage();
- CInode *in = dnl->get_inode();
- dout(10) << "_purge_stray " << *dn << " " << *in << dendl;
-
- SnapRealm *realm = in->find_snaprealm();
- SnapContext nullsnap;
- const SnapContext *snapc;
- if (realm) {
- dout(10) << " realm " << *realm << dendl;
- snapc = &realm->get_snap_context();
- } else {
- dout(10) << " NO realm, using null context" << dendl;
- snapc = &nullsnap;
- assert(in->last == CEPH_NOSNAP);
- }
-
- uint64_t period = (uint64_t)in->inode.layout.fl_object_size * (uint64_t)in->inode.layout.fl_stripe_count;
- uint64_t cur_max_size = in->inode.get_max_size();
- uint64_t to = MAX(in->inode.size, cur_max_size);
- if (to && period) {
- uint64_t num = (to + period - 1) / period;
- dout(10) << "purge_stray 0~" << to << " objects 0~" << num << " snapc " << snapc << " on " << *in << dendl;
- mds->filer->purge_range(in->inode.ino, &in->inode.layout, *snapc,
- 0, num, ceph_clock_now(g_ceph_context), 0,
- new C_MDC_PurgeStrayPurged(this, dn));
-
- } else {
- dout(10) << "purge_stray 0 objects snapc " << snapc << " on " << *in << dendl;
- _purge_stray_purged(dn);
- }
-}
-
void MDCache::purge_stray(CDentry *dn)
{
CDentry::linkage_t *dnl = dn->get_projected_linkage();
@@ -9381,24 +9325,90 @@ void MDCache::purge_stray(CDentry *dn)
dn->get(CDentry::PIN_PURGING);
in->state_set(CInode::STATE_PURGING);
-
+ if (dn->item_stray.is_on_list())
+ dn->item_stray.remove_myself();
+
+ if (in->is_dirty_parent())
+ in->clear_dirty_parent();
+
// CHEAT. there's no real need to journal our intent to purge, since
// that is implicit in the dentry's presence and non-use in the stray
// dir. on recovery, we'll need to re-eval all strays anyway.
+ SnapContext nullsnapc;
+ C_GatherBuilder gather(g_ceph_context, new C_MDC_PurgeStrayPurged(this, dn));
+
if (in->is_dir()) {
- dout(10) << "purge_stray dir ... implement me!" << dendl; // FIXME XXX
- // remove the backtrace
- remove_backtrace(in->ino(), mds->mdsmap->get_metadata_pool(),
- new C_MDC_PurgeStrayPurged(this, dn));
- } else if (in->is_file()) {
- // get the backtrace before blowing away the object
- C_MDC_PurgeForwardingPointers *fin = new C_MDC_PurgeForwardingPointers(this, dn);
- fetch_backtrace(in->ino(), in->get_inode().layout.fl_pg_pool, fin->bl, fin);
+ object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+ list<frag_t> ls;
+ if (!in->dirfragtree.is_leaf(frag_t()))
+ in->dirfragtree.get_leaves(ls);
+ ls.push_back(frag_t());
+ for (list<frag_t>::iterator p = ls.begin();
+ p != ls.end();
+ ++p) {
+ object_t oid = CInode::get_object_name(in->inode.ino, *p, "");
+ dout(10) << "purge_stray remove dirfrag " << oid << dendl;
+ mds->objecter->remove(oid, oloc, nullsnapc, ceph_clock_now(g_ceph_context),
+ 0, NULL, gather.new_sub());
+ }
+ assert(gather.has_subs());
+ gather.activate();
+ return;
+ }
+
+ const SnapContext *snapc;
+ SnapRealm *realm = in->find_snaprealm();
+ if (realm) {
+ dout(10) << " realm " << *realm << dendl;
+ snapc = &realm->get_snap_context();
} else {
- // not a dir or file; purged!
- _purge_stray_purged(dn);
+ dout(10) << " NO realm, using null context" << dendl;
+ snapc = &nullsnapc;
+ assert(in->last == CEPH_NOSNAP);
}
+
+ if (in->is_file()) {
+ uint64_t period = (uint64_t)in->inode.layout.fl_object_size *
+ (uint64_t)in->inode.layout.fl_stripe_count;
+ uint64_t to = in->inode.get_max_size();
+ to = MAX(in->inode.size, to);
+ // when truncating a file, the filer does not delete stripe objects that are
+ // truncated to zero. so we need to purge stripe objects up to the max size
+ // the file has ever been.
+ to = MAX(in->inode.max_size_ever, to);
+ if (to && period) {
+ uint64_t num = (to + period - 1) / period;
+ dout(10) << "purge_stray 0~" << to << " objects 0~" << num
+ << " snapc " << snapc << " on " << *in << dendl;
+ mds->filer->purge_range(in->inode.ino, &in->inode.layout, *snapc,
+ 0, num, ceph_clock_now(g_ceph_context), 0,
+ gather.new_sub());
+ }
+ }
+
+ inode_t *pi = in->get_projected_inode();
+ object_t oid = CInode::get_object_name(pi->ino, frag_t(), "");
+ // remove the backtrace object if it was not purged
+ if (!gather.has_subs()) {
+ object_locator_t oloc(pi->layout.fl_pg_pool);
+ dout(10) << "purge_stray remove backtrace object " << oid
+ << " pool " << oloc.pool << " snapc " << snapc << dendl;
+ mds->objecter->remove(oid, oloc, *snapc, ceph_clock_now(g_ceph_context), 0,
+ NULL, gather.new_sub());
+ }
+ // remove old backtrace objects
+ for (vector<int64_t>::iterator p = pi->old_pools.begin();
+ p != pi->old_pools.end();
+ ++p) {
+ object_locator_t oloc(*p);
+ dout(10) << "purge_stray remove backtrace object " << oid
+ << " old pool " << *p << " snapc " << snapc << dendl;
+ mds->objecter->remove(oid, oloc, *snapc, ceph_clock_now(g_ceph_context), 0,
+ NULL, gather.new_sub());
+ }
+ assert(gather.has_subs());
+ gather.activate();
}
class C_MDC_PurgeStrayLogged : public Context {
@@ -9480,9 +9490,6 @@ void MDCache::_purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls)
CInode *in = dn->get_linkage()->get_inode();
dout(10) << "_purge_stray_logged " << *dn << " " << *in << dendl;
- dn->state_clear(CDentry::STATE_PURGING);
- dn->put(CDentry::PIN_PURGING);
-
assert(!in->state_test(CInode::STATE_RECOVERING));
// unlink
@@ -9493,11 +9500,13 @@ void MDCache::_purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls)
dn->dir->pop_and_dirty_projected_fnode(ls);
+ in->state_clear(CInode::STATE_ORPHAN);
+ dn->state_clear(CDentry::STATE_PURGING);
+ dn->put(CDentry::PIN_PURGING);
+
// drop inode
if (in->is_dirty())
in->mark_clean();
- if (in->is_dirty_parent())
- in->clear_dirty_parent();
remove_inode(in);
@@ -10639,7 +10648,7 @@ void MDCache::handle_dentry_unlink(MDentryUnlink *m)
!in->state_test(CInode::STATE_EXPORTINGCAPS))
migrator->export_caps(in);
- lru.lru_bottouch(straydn); // move stray to end of lru
+ touch_dentry_bottom(straydn); // move stray to end of lru
straydn = NULL;
} else {
assert(!straydn);
@@ -10649,7 +10658,7 @@ void MDCache::handle_dentry_unlink(MDentryUnlink *m)
assert(dnl->is_null());
// move to bottom of lru
- lru.lru_bottouch(dn);
+ touch_dentry_bottom(dn);
}
}
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index b4b57da84b2..416c6454292 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -19,6 +19,7 @@
#include "include/types.h"
#include "include/filepath.h"
+#include "include/elist.h"
#include "CInode.h"
#include "CDentry.h"
@@ -564,7 +565,7 @@ public:
// trimming
bool trim(int max = -1); // trim cache
- void trim_dentry(CDentry *dn, map<int, MCacheExpire*>& expiremap);
+ bool trim_dentry(CDentry *dn, map<int, MCacheExpire*>& expiremap);
void trim_dirfrag(CDir *dir, CDir *con,
map<int, MCacheExpire*>& expiremap);
void trim_inode(CDentry *dn, CInode *in, CDir *con,
@@ -646,6 +647,15 @@ public:
}
void touch_dentry_bottom(CDentry *dn) {
lru.lru_bottouch(dn);
+ if (dn->get_projected_linkage()->is_primary()) {
+ CInode *in = dn->get_projected_linkage()->get_inode();
+ if (in->has_dirfrags()) {
+ list<CDir*> ls;
+ in->get_dirfrags(ls);
+ for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p)
+ (*p)->touch_dentries_bottom();
+ }
+ }
}
protected:
@@ -858,31 +868,29 @@ public:
// -- stray --
public:
- void scan_stray_dir();
- void eval_stray(CDentry *dn);
+ elist<CDentry*> delayed_eval_stray;
+
+ void eval_stray(CDentry *dn, bool delay=false);
void eval_remote(CDentry *dn);
- void maybe_eval_stray(CInode *in) {
+ void maybe_eval_stray(CInode *in, bool delay=false) {
if (in->inode.nlink > 0 || in->is_base())
return;
CDentry *dn = in->get_projected_parent_dn();
- if (dn->get_projected_linkage()->is_primary() &&
- dn->get_dir()->get_inode()->is_stray() &&
- !dn->is_replicated())
- eval_stray(dn);
+ if (!dn->state_test(CDentry::STATE_PURGING) &&
+ dn->get_projected_linkage()->is_primary() &&
+ dn->get_dir()->get_inode()->is_stray())
+ eval_stray(dn, delay);
}
protected:
+ void scan_stray_dir(dirfrag_t next=dirfrag_t());
void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin);
- void remove_backtrace(inodeno_t ino, int64_t pool, Context *fin);
- void _purge_forwarding_pointers(bufferlist& bl, CDentry *dn, int r);
- void _purge_stray(CDentry *dn, int r);
void purge_stray(CDentry *dn);
void _purge_stray_purged(CDentry *dn, int r=0);
void _purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls);
void _purge_stray_logged_truncate(CDentry *dn, LogSegment *ls);
+ friend class C_MDC_RetryScanStray;
friend class C_MDC_FetchedBacktrace;
- friend class C_MDC_PurgeForwardingPointers;
- friend class C_MDC_PurgeStray;
friend class C_MDC_PurgeStrayLogged;
friend class C_MDC_PurgeStrayLoggedTruncate;
friend class C_MDC_PurgeStrayPurged;
diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc
index 1ace72e0ac3..cacbebfd3f6 100644
--- a/src/mds/MDLog.cc
+++ b/src/mds/MDLog.cc
@@ -499,7 +499,11 @@ void MDLog::_replay_thread()
if (journaler->get_error()) {
r = journaler->get_error();
dout(0) << "_replay journaler got error " << r << ", aborting" << dendl;
- if (r == -EINVAL) {
+ if (r == -ENOENT) {
+ // journal has been trimmed by somebody else?
+ assert(journaler->is_readonly());
+ r = -EAGAIN;
+ } else if (r == -EINVAL) {
if (journaler->get_read_pos() < journaler->get_expire_pos()) {
// this should only happen if you're following somebody else
assert(journaler->is_readonly());
@@ -605,7 +609,7 @@ void MDLog::_replay_thread()
}
dout(10) << "_replay_thread kicking waiters" << dendl;
- finish_contexts(g_ceph_context, waitfor_replay, 0);
+ finish_contexts(g_ceph_context, waitfor_replay, r);
dout(10) << "_replay_thread finish" << dendl;
mds->mds_lock.Unlock();
diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
index fc05ca0ecb7..83722274981 100644
--- a/src/mds/MDS.cc
+++ b/src/mds/MDS.cc
@@ -800,7 +800,9 @@ void MDS::handle_command(MMonCommand *m)
clog.info() << "tcmalloc not enabled, can't use heap profiler commands\n";
else {
ostringstream ss;
- ceph_heap_profiler_handle_command(m->cmd, ss);
+ vector<std::string> cmdargs;
+ cmdargs.insert(cmdargs.begin(), m->cmd.begin()+1, m->cmd.end());
+ ceph_heap_profiler_handle_command(cmdargs, ss);
clog.info() << ss.str();
}
} else dout(0) << "unrecognized command! " << m->cmd << dendl;
@@ -1523,7 +1525,6 @@ void MDS::active_start()
mdcache->open_root();
mdcache->clean_open_file_lists();
- mdcache->scan_stray_dir();
mdcache->export_remaining_imported_caps();
finish_contexts(g_ceph_context, waiting_for_replay); // kick waiters
finish_contexts(g_ceph_context, waiting_for_active); // kick waiters
diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc
index 1646a134ad5..f1ab9b112d8 100644
--- a/src/mds/MDSMap.cc
+++ b/src/mds/MDSMap.cc
@@ -470,7 +470,7 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
::encode(cas_pool, bl);
// kclient ignores everything from here
- __u16 ev = 5;
+ __u16 ev = 6;
::encode(ev, bl);
::encode(compat, bl);
::encode(metadata_pool, bl);
@@ -483,6 +483,8 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
::encode(failed, bl);
::encode(stopped, bl);
::encode(last_failure_osd_epoch, bl);
+ ::encode(ever_allowed_snaps, bl);
+ ::encode(explicitly_allowed_snaps, bl);
ENCODE_FINISH(bl);
}
}
@@ -540,5 +542,12 @@ void MDSMap::decode(bufferlist::iterator& p)
::decode(stopped, p);
if (ev >= 4)
::decode(last_failure_osd_epoch, p);
+ if (ev >= 6) {
+ ::decode(ever_allowed_snaps, p);
+ ::decode(explicitly_allowed_snaps, p);
+ } else {
+ ever_allowed_snaps = true;
+ explicitly_allowed_snaps = false;
+ }
DECODE_FINISH(p);
}
diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h
index 5bfc7cc20d5..5eadf156a95 100644
--- a/src/mds/MDSMap.h
+++ b/src/mds/MDSMap.h
@@ -175,6 +175,9 @@ protected:
map<int32_t,uint64_t> up; // who is in those roles
map<uint64_t,mds_info_t> mds_info;
+ bool ever_allowed_snaps; //< the cluster has ever allowed snap creation
+ bool explicitly_allowed_snaps; //< the user has explicitly enabled snap creation
+
public:
CompatSet compat;
@@ -188,7 +191,9 @@ public:
max_file_size(0),
cas_pool(-1),
metadata_pool(0),
- max_mds(0)
+ max_mds(0),
+ ever_allowed_snaps(false),
+ explicitly_allowed_snaps(false)
{ }
utime_t get_session_timeout() {
@@ -201,6 +206,14 @@ public:
void set_flag(int f) { flags |= f; }
void clear_flag(int f) { flags &= ~f; }
+ // Turn on the CEPH_MDSMAP_ALLOW_SNAPS flag and record both that snapshots
+ // have been allowed at some point (ever_allowed_snaps) and that this was
+ // requested explicitly (explicitly_allowed_snaps).
+ void set_snaps_allowed() {
+ set_flag(CEPH_MDSMAP_ALLOW_SNAPS);
+ ever_allowed_snaps = true;
+ explicitly_allowed_snaps = true;
+ }
+ // True when the CEPH_MDSMAP_ALLOW_SNAPS flag is currently set.
+ bool allows_snaps() { return test_flag(CEPH_MDSMAP_ALLOW_SNAPS); }
+ // Clear only the CEPH_MDSMAP_ALLOW_SNAPS flag; ever_allowed_snaps and
+ // explicitly_allowed_snaps are left as they were.
+ void clear_snaps_allowed() { clear_flag(CEPH_MDSMAP_ALLOW_SNAPS); }
+
epoch_t get_epoch() const { return epoch; }
void inc_epoch() { epoch++; }
diff --git a/src/mds/Makefile.am b/src/mds/Makefile.am
new file mode 100644
index 00000000000..05d0e1a2429
--- /dev/null
+++ b/src/mds/Makefile.am
@@ -0,0 +1,92 @@
+libmds_la_SOURCES = \
+ mds/Anchor.cc \
+ mds/Capability.cc \
+ mds/Dumper.cc \
+ mds/Resetter.cc \
+ mds/MDS.cc \
+ mds/flock.cc \
+ mds/locks.c \
+ mds/journal.cc \
+ mds/Server.cc \
+ mds/Mutation.cc \
+ mds/MDCache.cc \
+ mds/Locker.cc \
+ mds/Migrator.cc \
+ mds/MDBalancer.cc \
+ mds/CDentry.cc \
+ mds/CDir.cc \
+ mds/CInode.cc \
+ mds/LogEvent.cc \
+ mds/MDSTable.cc \
+ mds/InoTable.cc \
+ mds/MDSTableClient.cc \
+ mds/MDSTableServer.cc \
+ mds/AnchorServer.cc \
+ mds/AnchorClient.cc \
+ mds/SnapRealm.cc \
+ mds/SnapServer.cc \
+ mds/snap.cc \
+ mds/SessionMap.cc \
+ mds/MDLog.cc
+libmds_la_LIBADD = $(LIBOSDC)
+noinst_LTLIBRARIES += libmds.la
+
+noinst_HEADERS += \
+ mds/inode_backtrace.h \
+ mds/flock.h \
+ mds/locks.c \
+ mds/locks.h \
+ mds/Anchor.h \
+ mds/AnchorClient.h \
+ mds/AnchorServer.h \
+ mds/CDentry.h \
+ mds/CDir.h \
+ mds/CInode.h \
+ mds/Capability.h \
+ mds/Dumper.h \
+ mds/InoTable.h \
+ mds/LocalLock.h \
+ mds/Locker.h \
+ mds/LogEvent.h \
+ mds/LogSegment.h \
+ mds/MDBalancer.h \
+ mds/MDCache.h \
+ mds/MDLog.h \
+ mds/MDS.h \
+ mds/MDSMap.h \
+ mds/MDSTable.h \
+ mds/MDSTableServer.h \
+ mds/MDSTableClient.h \
+ mds/Mutation.h \
+ mds/Migrator.h \
+ mds/Resetter.h \
+ mds/ScatterLock.h \
+ mds/Server.h \
+ mds/SessionMap.h \
+ mds/SimpleLock.h \
+ mds/SnapClient.h \
+ mds/SnapRealm.h \
+ mds/SnapServer.h \
+ mds/inode_backtrace.h \
+ mds/mds_table_types.h \
+ mds/mdstypes.h \
+ mds/snap.h
+
+noinst_HEADERS += \
+ mds/events/ECommitted.h \
+ mds/events/EExport.h \
+ mds/events/EFragment.h \
+ mds/events/EImportFinish.h \
+ mds/events/EImportStart.h \
+ mds/events/EMetaBlob.h \
+ mds/events/EOpen.h \
+ mds/events/EResetJournal.h \
+ mds/events/ESession.h \
+ mds/events/ESessions.h \
+ mds/events/ESlaveUpdate.h \
+ mds/events/ESubtreeMap.h \
+ mds/events/ETableClient.h \
+ mds/events/ETableServer.h \
+ mds/events/EUpdate.h
+
+
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 466d4818456..41862847e27 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -1167,10 +1167,11 @@ void Server::dispatch_client_request(MDRequest *mdr)
// inodes ops.
case CEPH_MDS_OP_LOOKUP:
- case CEPH_MDS_OP_LOOKUPSNAP:
handle_client_getattr(mdr, true);
break;
+ case CEPH_MDS_OP_LOOKUPSNAP:
+ // lookupsnap does not reference a CDentry; treat it as a getattr
case CEPH_MDS_OP_GETATTR:
handle_client_getattr(mdr, false);
break;
@@ -3085,6 +3086,7 @@ void Server::handle_client_file_readlock(MDRequest *mdr)
checking_lock.length = req->head.args.filelock_change.length;
checking_lock.client = req->get_orig_source().num();
checking_lock.pid = req->head.args.filelock_change.pid;
+ checking_lock.pid_namespace = req->head.args.filelock_change.pid_namespace;
checking_lock.type = req->head.args.filelock_change.type;
// get the appropriate lock state
@@ -4909,8 +4911,10 @@ void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn)
inode_t *pi = in->project_inode();
mdr->add_projected_inode(in); // do this _after_ my dn->pre_dirty().. we apply that one manually.
pi->version = in->pre_dirty();
- pi->nlink--;
pi->ctime = mdr->now;
+ pi->nlink--;
+ if (pi->nlink == 0)
+ in->state_set(CInode::STATE_ORPHAN);
if (dnl->is_primary()) {
// primary link. add stray dentry.
@@ -6054,8 +6058,10 @@ void Server::_rename_prepare(MDRequest *mdr,
pi->nlink--;
}
if (tpi) {
- tpi->nlink--;
tpi->ctime = mdr->now;
+ tpi->nlink--;
+ if (tpi->nlink == 0)
+ oldin->state_set(CInode::STATE_ORPHAN);
}
}
@@ -7157,6 +7163,12 @@ struct C_MDS_mksnap_finish : public Context {
/* This function takes responsibility for the passed mdr*/
void Server::handle_client_mksnap(MDRequest *mdr)
{
+ if (!mds->mdsmap->allows_snaps()) {
+ // you can't make snapshots until you set an option right now
+ reply_request(mdr, -EPERM);
+ return;
+ }
+
MClientRequest *req = mdr->client_request;
CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
if (!diri || diri->state_test(CInode::STATE_PURGING)) {
diff --git a/src/mds/flock.h b/src/mds/flock.h
index ae93d1660f0..b767fe58507 100644
--- a/src/mds/flock.h
+++ b/src/mds/flock.h
@@ -12,7 +12,7 @@
inline ostream& operator<<(ostream& out, ceph_filelock& l) {
out << "start: " << l.start << ", length: " << l.length
<< ", client: " << l.client << ", pid: " << l.pid
- << ", type: " << (int)l.type
+ << ", pid_ns: " << l.pid_namespace << ", type: " << (int)l.type
<< std::endl;
return out;
}
diff --git a/src/mds/locks.c b/src/mds/locks.c
index 37e3f5ea764..f367eda29d2 100644
--- a/src/mds/locks.c
+++ b/src/mds/locks.c
@@ -1,15 +1,15 @@
-
// there must be a better way?
typedef char bool;
#define false 0
#define true 1
+#include "include/int_types.h"
+
#include <netinet/in.h>
#if defined(__linux__)
#include <linux/types.h>
#elif defined(__FreeBSD__)
#include <sys/types.h>
-#include "include/inttypes.h"
#endif
#include <string.h>
#include <fcntl.h>
diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc
index 6886786f27e..362f74774c4 100644
--- a/src/mds/mdstypes.cc
+++ b/src/mds/mdstypes.cc
@@ -204,7 +204,7 @@ ostream& operator<<(ostream& out, const client_writeable_range_t& r)
*/
void inode_t::encode(bufferlist &bl) const
{
- ENCODE_START(7, 6, bl);
+ ENCODE_START(8, 6, bl);
::encode(ino, bl);
::encode(rdev, bl);
@@ -238,6 +238,7 @@ void inode_t::encode(bufferlist &bl) const
::encode(xattr_version, bl);
::encode(backtrace_version, bl);
::encode(old_pools, bl);
+ ::encode(max_size_ever, bl);
ENCODE_FINISH(bl);
}
@@ -294,6 +295,8 @@ void inode_t::decode(bufferlist::iterator &p)
::decode(backtrace_version, p);
if (struct_v >= 7)
::decode(old_pools, p);
+ if (struct_v >= 8)
+ ::decode(max_size_ever, p);
DECODE_FINISH(p);
}
diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h
index 5537407a75d..bd53c85b48d 100644
--- a/src/mds/mdstypes.h
+++ b/src/mds/mdstypes.h
@@ -3,7 +3,8 @@
#ifndef CEPH_MDSTYPES_H
#define CEPH_MDSTYPES_H
-#include <inttypes.h>
+#include "include/int_types.h"
+
#include <math.h>
#include <ostream>
#include <set>
@@ -328,6 +329,7 @@ struct inode_t {
ceph_file_layout layout;
vector <int64_t> old_pools;
uint64_t size; // on directory, # dentries
+ uint64_t max_size_ever; // max size the file has ever been
uint32_t truncate_seq;
uint64_t truncate_size, truncate_from;
uint32_t truncate_pending;
@@ -352,7 +354,8 @@ struct inode_t {
inode_t() : ino(0), rdev(0),
mode(0), uid(0), gid(0),
nlink(0), anchored(false),
- size(0), truncate_seq(0), truncate_size(0), truncate_from(0),
+ size(0), max_size_ever(0),
+ truncate_seq(0), truncate_size(0), truncate_from(0),
truncate_pending(0),
time_warp_seq(0),
version(0), file_data_version(0), xattr_version(0), backtrace_version(0) {
@@ -368,6 +371,8 @@ struct inode_t {
bool is_truncating() const { return (truncate_pending > 0); }
void truncate(uint64_t old_size, uint64_t new_size) {
assert(new_size < old_size);
+ if (old_size > max_size_ever)
+ max_size_ever = old_size;
truncate_from = old_size;
size = new_size;
rstat.rbytes = new_size;
@@ -474,6 +479,7 @@ struct fnode_t {
void decode(bufferlist::iterator& bl);
void dump(Formatter *f) const;
static void generate_test_instances(list<fnode_t*>& ls);
+ fnode_t() : version(0) {};
};
WRITE_CLASS_ENCODER(fnode_t)
@@ -1132,8 +1138,9 @@ class MDSCacheObject {
// -- state --
const static int STATE_AUTH = (1<<30);
const static int STATE_DIRTY = (1<<29);
- const static int STATE_REJOINING = (1<<28); // replica has not joined w/ primary copy
- const static int STATE_REJOINUNDEF = (1<<27); // contents undefined.
+ const static int STATE_NOTIFYREF = (1<<28); // when set, put() also calls the _put() hook
+ const static int STATE_REJOINING = (1<<27); // replica has not joined w/ primary copy
+ const static int STATE_REJOINUNDEF = (1<<26); // contents undefined.
// -- wait --
@@ -1219,6 +1226,7 @@ protected:
#endif
assert(ref > 0);
}
+ // Hook called from put() when STATE_NOTIFYREF is set; the default does nothing.
+ virtual void _put() {}
void put(int by) {
#ifdef MDS_REF_SET
if (ref == 0 || ref_map[by] == 0) {
@@ -1234,6 +1242,8 @@ protected:
#endif
if (ref == 0)
last_put();
+ if (state_test(STATE_NOTIFYREF))
+ _put();
}
}
diff --git a/src/messages/MOSDOpReply.h b/src/messages/MOSDOpReply.h
index 65f7b23987c..c0e989f7c3a 100644
--- a/src/messages/MOSDOpReply.h
+++ b/src/messages/MOSDOpReply.h
@@ -31,7 +31,7 @@
class MOSDOpReply : public Message {
- static const int HEAD_VERSION = 4;
+ static const int HEAD_VERSION = 6;
static const int COMPAT_VERSION = 2;
object_t oid;
@@ -39,9 +39,12 @@ class MOSDOpReply : public Message {
vector<OSDOp> ops;
int64_t flags;
int32_t result;
- eversion_t reassert_version;
+ eversion_t bad_replay_version;
+ eversion_t replay_version;
+ version_t user_version;
epoch_t osdmap_epoch;
int32_t retry_attempt;
+ request_redirect_t redirect;
public:
object_t get_oid() const { return oid; }
@@ -52,10 +55,42 @@ public:
bool is_onnvram() const { return get_flags() & CEPH_OSD_FLAG_ONNVRAM; }
int get_result() const { return result; }
- eversion_t get_version() { return reassert_version; }
+ eversion_t get_replay_version() const { return replay_version; }
+ version_t get_user_version() const { return user_version; }
void set_result(int r) { result = r; }
- void set_version(eversion_t v) { reassert_version = v; }
+
+ /**
+  * Record the versions reported by this reply.
+  *
+  * @param v  stored as replay_version
+  * @param uv stored as user_version
+  *
+  * bad_replay_version starts as a copy of v; when uv is non-zero its
+  * version member is overwritten with uv.
+  */
+ void set_reply_versions(eversion_t v, version_t uv) {
+ replay_version = v;
+ user_version = uv;
+ /* We go through some shenanigans here for backwards compatibility
+ * with old clients, who do not look at our replay_version and
+ * user_version but instead see what we now call the
+ * bad_replay_version. On pools without caching
+ * the user_version infrastructure is a slightly-laggy copy of
+ * the regular pg version/at_version infrastructure; the difference
+ * being it is not updated on watch ops like that is -- but on updates
+ * it is set equal to at_version. This means that for non-watch write ops
+ * on classic pools, all three of replay_version, user_version, and
+ * bad_replay_version are identical. But for watch ops the replay_version
+ * has been updated, while the user_at_version has not, and the semantics
+ * we promised old clients are that the version they see is not an update.
+ * So set the bad_replay_version to be the same as the user_at_version. */
+ bad_replay_version = v;
+ if (uv) {
+ bad_replay_version.version = uv;
+ }
+ }
+
+ /* Don't fill in replay_version for non-write ops: only user_version (from
+  * uv) and bad_replay_version (from v) are assigned here; replay_version is
+  * left untouched. */
+ void set_enoent_reply_versions(eversion_t v, version_t uv) {
+ user_version = uv;
+ bad_replay_version = v;
+ }
+
+ // Store a copy of redir as this reply's redirect.
+ void set_redirect(const request_redirect_t& redir) { redirect = redir; }
+ // Read-only access to the stored redirect.
+ const request_redirect_t& get_redirect() const { return redirect; }
+ // True when the stored redirect reports itself as non-empty.
+ bool is_redirect_reply() const { return !redirect.empty(); }
void add_flags(int f) { flags |= f; }
@@ -79,7 +114,7 @@ public:
}
// osdmap
- epoch_t get_map_epoch() { return osdmap_epoch; }
+ epoch_t get_map_epoch() const { return osdmap_epoch; }
/*osd_reqid_t get_reqid() { return osd_reqid_t(get_dest(),
head.client_inc,
@@ -99,7 +134,7 @@ public:
oid = req->oid;
pgid = req->pgid;
osdmap_epoch = e;
- reassert_version = req->reassert_version;
+ user_version = 0;
retry_attempt = req->get_retry_attempt();
// zero out ops payload_len
@@ -121,6 +156,7 @@ public:
head.layout.ol_pgid = pgid.get_old_pg().v;
head.flags = flags;
head.osdmap_epoch = osdmap_epoch;
+ head.reassert_version = bad_replay_version;
head.result = result;
head.num_ops = ops.size();
head.object_len = oid.name.length();
@@ -134,7 +170,7 @@ public:
::encode(pgid, payload);
::encode(flags, payload);
::encode(result, payload);
- ::encode(reassert_version, payload);
+ ::encode(bad_replay_version, payload);
::encode(osdmap_epoch, payload);
__u32 num_ops = ops.size();
@@ -146,6 +182,10 @@ public:
for (unsigned i = 0; i < num_ops; i++)
::encode(ops[i].rval, payload);
+
+ ::encode(replay_version, payload);
+ ::encode(user_version, payload);
+ ::encode(redirect, payload);
}
}
virtual void decode_payload() {
@@ -161,7 +201,8 @@ public:
pgid = pg_t(head.layout.ol_pgid);
result = head.result;
flags = head.flags;
- reassert_version = head.reassert_version;
+ replay_version = head.reassert_version;
+ user_version = replay_version.version;
osdmap_epoch = head.osdmap_epoch;
retry_attempt = -1;
} else {
@@ -169,7 +210,7 @@ public:
::decode(pgid, p);
::decode(flags, p);
::decode(result, p);
- ::decode(reassert_version, p);
+ ::decode(bad_replay_version, p);
::decode(osdmap_epoch, p);
__u32 num_ops = ops.size();
@@ -189,6 +230,17 @@ public:
OSDOp::split_osd_op_vector_out_data(ops, data);
}
+
+ if (header.version >= 5) {
+ ::decode(replay_version, p);
+ ::decode(user_version, p);
+ } else {
+ replay_version = bad_replay_version;
+ user_version = replay_version.version;
+ }
+
+ if (header.version >= 6)
+ ::decode(redirect, p);
}
}
@@ -196,7 +248,9 @@ public:
void print(ostream& out) const {
out << "osd_op_reply(" << get_tid()
- << " " << oid << " " << ops;
+ << " " << oid << " " << ops
+ << " v" << get_replay_version()
+ << " uv" << get_user_version();
if (is_ondisk())
out << " ondisk";
else if (is_onnvram())
@@ -208,6 +262,9 @@ public:
char buf[80];
out << " (" << strerror_r(-get_result(), buf, sizeof(buf)) << ")";
}
+ if (is_redirect_reply()) {
+ out << " redirect: { " << redirect << " }";
+ }
out << ")";
}
diff --git a/src/messages/MOSDSubOp.h b/src/messages/MOSDSubOp.h
index 50b1a926957..4169e01325e 100644
--- a/src/messages/MOSDSubOp.h
+++ b/src/messages/MOSDSubOp.h
@@ -25,7 +25,7 @@
class MOSDSubOp : public Message {
- static const int HEAD_VERSION = 7;
+ static const int HEAD_VERSION = 8;
static const int COMPAT_VERSION = 1;
public:
@@ -86,6 +86,9 @@ public:
// indicates that we must fix hobject_t encoding
bool hobject_incorrect_pool;
+ hobject_t new_temp_oid; ///< new temp object that we must now start tracking
+ hobject_t discard_temp_oid; ///< previously used temp object that we can now stop tracking
+
int get_cost() const {
if (ops.size() == 1 && ops[0].op.op == CEPH_OSD_OP_PULL)
return ops[0].op.extent.length;
@@ -150,6 +153,11 @@ public:
poid.pool = pgid.pool();
hobject_incorrect_pool = true;
}
+
+ if (header.version >= 8) {
+ ::decode(new_temp_oid, p);
+ ::decode(discard_temp_oid, p);
+ }
}
virtual void encode_payload(uint64_t features) {
@@ -194,6 +202,8 @@ public:
::encode(current_progress, payload);
::encode(omap_entries, payload);
::encode(omap_header, payload);
+ ::encode(new_temp_oid, payload);
+ ::encode(discard_temp_oid, payload);
}
MOSDSubOp()
diff --git a/src/messages/Makefile.am b/src/messages/Makefile.am
new file mode 100644
index 00000000000..c503d3fca9b
--- /dev/null
+++ b/src/messages/Makefile.am
@@ -0,0 +1,113 @@
+
+noinst_HEADERS += \
+ messages/MAuth.h \
+ messages/MAuthReply.h \
+ messages/MCacheExpire.h \
+ messages/MClientCaps.h \
+ messages/MClientCapRelease.h \
+ messages/MClientLease.h \
+ messages/MClientReconnect.h \
+ messages/MClientReply.h \
+ messages/MClientRequest.h \
+ messages/MClientRequestForward.h \
+ messages/MClientSession.h \
+ messages/MClientSnap.h \
+ messages/MCommand.h \
+ messages/MCommandReply.h \
+ messages/MDentryLink.h \
+ messages/MDentryUnlink.h \
+ messages/MDirUpdate.h \
+ messages/MDiscover.h \
+ messages/MDiscoverReply.h \
+ messages/MExportCaps.h \
+ messages/MExportCapsAck.h \
+ messages/MExportDir.h \
+ messages/MExportDirAck.h \
+ messages/MExportDirCancel.h \
+ messages/MExportDirDiscover.h \
+ messages/MExportDirDiscoverAck.h \
+ messages/MExportDirFinish.h \
+ messages/MExportDirNotify.h \
+ messages/MExportDirNotifyAck.h \
+ messages/MExportDirPrep.h \
+ messages/MExportDirPrepAck.h \
+ messages/MGenericMessage.h \
+ messages/MGetPoolStats.h \
+ messages/MGetPoolStatsReply.h \
+ messages/MHeartbeat.h \
+ messages/MInodeFileCaps.h \
+ messages/MLock.h \
+ messages/MLog.h \
+ messages/MLogAck.h \
+ messages/MMDSBeacon.h \
+ messages/MMDSCacheRejoin.h \
+ messages/MMDSLoadTargets.h \
+ messages/MMDSFindIno.h \
+ messages/MMDSFindInoReply.h \
+ messages/MMDSFragmentNotify.h \
+ messages/MMDSMap.h \
+ messages/MMDSOpenIno.h \
+ messages/MMDSOpenInoReply.h \
+ messages/MMDSResolve.h \
+ messages/MMDSResolveAck.h \
+ messages/MMDSSlaveRequest.h \
+ messages/MMDSTableRequest.h \
+ messages/MMonCommand.h \
+ messages/MMonCommandAck.h \
+ messages/MMonElection.h \
+ messages/MMonGetMap.h \
+ messages/MMonGetVersion.h \
+ messages/MMonGetVersionReply.h \
+ messages/MMonGlobalID.h \
+ messages/MMonHealth.h \
+ messages/MMonJoin.h \
+ messages/MMonMap.h \
+ messages/MMonPaxos.h \
+ messages/MMonProbe.h \
+ messages/MMonScrub.h \
+ messages/MMonSubscribe.h \
+ messages/MMonSubscribeAck.h \
+ messages/MMonSync.h \
+ messages/MOSDAlive.h \
+ messages/MOSDBoot.h \
+ messages/MOSDFailure.h \
+ messages/MOSDMarkMeDown.h \
+ messages/MOSDMap.h \
+ messages/MOSDOp.h \
+ messages/MOSDOpReply.h \
+ messages/MOSDPGBackfill.h \
+ messages/MOSDPGCreate.h \
+ messages/MOSDPGPush.h \
+ messages/MOSDPGPull.h \
+ messages/MOSDPGPushReply.h \
+ messages/MOSDPGInfo.h \
+ messages/MOSDPGLog.h \
+ messages/MOSDPGMissing.h \
+ messages/MOSDPGNotify.h \
+ messages/MOSDPGQuery.h \
+ messages/MOSDPGRemove.h \
+ messages/MOSDPGScan.h \
+ messages/MBackfillReserve.h \
+ messages/MRecoveryReserve.h \
+ messages/MMonQuorumService.h \
+ messages/MOSDPGTemp.h \
+ messages/MOSDPGTrim.h \
+ messages/MOSDPing.h \
+ messages/MOSDRepScrub.h \
+ messages/MOSDScrub.h \
+ messages/MOSDSubOp.h \
+ messages/MOSDSubOpReply.h \
+ messages/MPGStats.h \
+ messages/MPGStatsAck.h \
+ messages/MPing.h \
+ messages/MPoolOp.h \
+ messages/MPoolOpReply.h \
+ messages/MRemoveSnaps.h \
+ messages/MRoute.h \
+ messages/MForward.h \
+ messages/MStatfs.h \
+ messages/MStatfsReply.h \
+ messages/MTimeCheck.h \
+ messages/MWatchNotify.h \
+ messages/PaxosServiceMessage.h
+
diff --git a/src/mon/DataHealthService.cc b/src/mon/DataHealthService.cc
index 6e8aa313a36..5fc745ce11d 100644
--- a/src/mon/DataHealthService.cc
+++ b/src/mon/DataHealthService.cc
@@ -81,6 +81,18 @@ health_status_t DataHealthService::get_health(
health_detail = "low disk space!";
}
+ if (stats.store_stats.bytes_total >= g_conf->mon_leveldb_size_warn) {
+ if (health_status > HEALTH_WARN)
+ health_status = HEALTH_WARN;
+ if (!health_detail.empty())
+ health_detail.append("; ");
+ stringstream ss;
+ ss << "store is getting too big! "
+ << prettybyte_t(stats.store_stats.bytes_total)
+ << " >= " << prettybyte_t(g_conf->mon_leveldb_size_warn);
+ health_detail.append(ss.str());
+ }
+
if (overall_status > health_status)
overall_status = health_status;
@@ -95,18 +107,15 @@ health_status_t DataHealthService::get_health(
if (f) {
f->open_object_section("mon");
f->dump_string("name", mon_name.c_str());
- f->dump_int("kb_total", stats.kb_total);
- f->dump_int("kb_used", stats.kb_used);
- f->dump_int("kb_avail", stats.kb_avail);
- f->dump_int("avail_percent", stats.latest_avail_percent);
- f->dump_stream("last_updated") << stats.last_update;
+ // leave this unenclosed by an object section to avoid breaking backward-compatibility
+ stats.dump(f);
f->dump_stream("health") << health_status;
if (health_status != HEALTH_OK)
- f->dump_string("health_detail", health_detail);
+ f->dump_string("health_detail", health_detail);
f->close_section();
}
}
-
+
if (f) {
f->close_section(); // mons
f->close_section(); // data_health
@@ -115,6 +124,22 @@ health_status_t DataHealthService::get_health(
return overall_status;
}
+/**
+ * Refresh the monitor-store size figures held in @p ours.
+ *
+ * Asks the store for its estimated size, which also fills a map of named
+ * byte counts, then copies the total and the "sst", "log" and "misc"
+ * entries into ours.store_stats and sets ours.last_update from
+ * ceph_clock_now().
+ *
+ * @param ours stats record updated in place
+ * @return always 0; an estimated size of zero trips the assert instead
+ */
+int DataHealthService::update_store_stats(DataStats &ours)
+{
+  map<string,uint64_t> breakdown;
+  const uint64_t total_bytes = mon->store->get_estimated_size(breakdown);
+  assert(total_bytes > 0);
+
+  // map::operator[] yields 0 for any key the store did not fill in
+  const uint64_t sst_bytes = breakdown["sst"];
+  const uint64_t log_bytes = breakdown["log"];
+  const uint64_t misc_bytes = breakdown["misc"];
+
+  ours.store_stats.bytes_total = total_bytes;
+  ours.store_stats.bytes_sst = sst_bytes;
+  ours.store_stats.bytes_log = log_bytes;
+  ours.store_stats.bytes_misc = misc_bytes;
+  ours.last_update = ceph_clock_now(g_ceph_context);
+
+  return 0;
+}
+
+
int DataHealthService::update_stats()
{
struct statfs stbuf;
@@ -135,7 +160,8 @@ int DataHealthService::update_stats()
<< " total " << ours.kb_total << " used " << ours.kb_used << " avail " << ours.kb_avail
<< dendl;
ours.last_update = ceph_clock_now(g_ceph_context);
- return 0;
+
+ return update_store_stats(ours);
}
void DataHealthService::share_stats()
diff --git a/src/mon/DataHealthService.h b/src/mon/DataHealthService.h
index 337e7a450f7..750c58e5f80 100644
--- a/src/mon/DataHealthService.h
+++ b/src/mon/DataHealthService.h
@@ -34,6 +34,7 @@ class DataHealthService :
int last_warned_percent;
void handle_tell(MMonHealth *m);
+ int update_store_stats(DataStats &ours);
int update_stats();
void share_stats();
diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc
index 9988d8c8402..48c1c99d584 100644
--- a/src/mon/MDSMonitor.cc
+++ b/src/mon/MDSMonitor.cc
@@ -920,6 +920,36 @@ bool MDSMonitor::prepare_command(MMonCommand *m)
r = 0;
}
+ } else if (prefix == "mds set") {
+ string key;
+ cmd_getval(g_ceph_context, cmdmap, "key", key);
+ string sure;
+ cmd_getval(g_ceph_context, cmdmap, "sure", sure);
+ if (key == "allow_new_snaps") {
+ if (sure != "--yes-i-really-mean-it") {
+ ss << "Snapshots are unstable and will probably break your FS! Add --yes-i-really-mean-it if you are sure";
+ r = -EPERM;
+ } else {
+ pending_mdsmap.set_snaps_allowed();
+ ss << "turned on snaps";
+ r = 0;
+ }
+ }
+ } else if (prefix == "mds unset") {
+ string key;
+ cmd_getval(g_ceph_context, cmdmap, "key", key);
+ string sure;
+ cmd_getval(g_ceph_context, cmdmap, "sure", sure);
+ if (key == "allow_new_snaps") {
+ if (sure != "--yes-i-really-mean-it") {
+ ss << "this won't get rid of snapshots or restore the cluster if it's broken. Add --yes-i-really-mean-it if you are sure";
+ r = -EPERM;
+ } else {
+ pending_mdsmap.clear_snaps_allowed();
+ ss << "disabled new snapshots";
+ r = 0;
+ }
+ }
} else if (prefix == "mds add_data_pool") {
int64_t poolid;
cmd_getval(g_ceph_context, cmdmap, "poolid", poolid);
@@ -947,6 +977,7 @@ bool MDSMonitor::prepare_command(MMonCommand *m)
ss << "this is DANGEROUS and will wipe out the mdsmap's fs, and may clobber data in the new pools you specify. add --yes-i-really-mean-it if you do.";
r = -EPERM;
} else {
+ newmap.inc = pending_mdsmap.inc;
pending_mdsmap = newmap;
pending_mdsmap.epoch = mdsmap.epoch + 1;
create_new_fs(pending_mdsmap, metadata, data);
diff --git a/src/mon/Makefile.am b/src/mon/Makefile.am
new file mode 100644
index 00000000000..d9094a81b0e
--- /dev/null
+++ b/src/mon/Makefile.am
@@ -0,0 +1,45 @@
+libmon_la_SOURCES = \
+ mon/Monitor.cc \
+ mon/Paxos.cc \
+ mon/PaxosService.cc \
+ mon/OSDMonitor.cc \
+ mon/MDSMonitor.cc \
+ mon/MonmapMonitor.cc \
+ mon/PGMonitor.cc \
+ mon/PGMap.cc \
+ mon/LogMonitor.cc \
+ mon/AuthMonitor.cc \
+ mon/Elector.cc \
+ mon/MonitorStore.cc \
+ mon/HealthMonitor.cc \
+ mon/DataHealthService.cc \
+ mon/ConfigKeyService.cc
+libmon_la_LIBADD = $(LIBAUTH) $(LIBCOMMON) $(LIBOS)
+noinst_LTLIBRARIES += libmon.la
+
+noinst_HEADERS += \
+ mon/AuthMonitor.h \
+ mon/DataHealthService.h \
+ mon/Elector.h \
+ mon/LogMonitor.h \
+ mon/ConfigKeyService.h \
+ mon/HealthMonitor.h \
+ mon/HealthService.h \
+ mon/MDSMonitor.h \
+ mon/MonmapMonitor.h \
+ mon/MonCap.h \
+ mon/MonClient.h \
+ mon/MonCommands.h \
+ mon/MonMap.h \
+ mon/Monitor.h \
+ mon/MonitorStore.h \
+ mon/MonitorDBStore.h \
+ mon/OSDMonitor.h \
+ mon/PGMap.h \
+ mon/PGMonitor.h \
+ mon/Paxos.h \
+ mon/PaxosService.h \
+ mon/QuorumService.h \
+ mon/Session.h \
+ mon/mon_types.h
+
diff --git a/src/mon/MonClient.cc b/src/mon/MonClient.cc
index 299379ac249..38bead8f29f 100644
--- a/src/mon/MonClient.cc
+++ b/src/mon/MonClient.cc
@@ -328,6 +328,12 @@ void MonClient::shutdown()
version_requests.erase(version_requests.begin());
}
+ while (!waiting_for_session.empty()) {
+ ldout(cct, 20) << __func__ << " discarding pending message " << *waiting_for_session.front() << dendl;
+ waiting_for_session.front()->put();
+ waiting_for_session.pop_front();
+ }
+
monc_lock.Unlock();
if (initialized) {
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index ec1ee71c9e1..d9fc56e032d 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -59,7 +59,7 @@
* CephString: optional badchars
* CephSocketpath: validation involves "is it S_ISSOCK"
* CephIPAddr: v4 or v6 addr with optional port, syntax validated
- * CephEntityAddr: CephIPAddr + '/nonce'
+ * CephEntityAddr: CephIPAddr + optional '/nonce'
* CephPoolname: Plainold string
* CephObjectname: Another plainold string
* CephPgid: n.xxx where n is an int > 0, xxx is a hex number > 0
@@ -210,8 +210,8 @@ COMMAND("quorum_status", "report status of monitor quorum", \
"mon", "r", "cli,rest")
COMMAND("mon_status", "report status of monitors", "mon", "r", "cli,rest")
COMMAND("sync force " \
- "name=validate1,type=CephChoices,strings=--yes-i-really-mean-it " \
- "name=validate2,type=CephChoices,strings=--i-know-what-i-am-doing", \
+ "name=validate1,type=CephChoices,strings=--yes-i-really-mean-it,req=false " \
+ "name=validate2,type=CephChoices,strings=--i-know-what-i-am-doing,req=false", \
"force sync of and clear monitor store", "mon", "rw", "cli,rest")
COMMAND("heap " \
"name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
@@ -274,6 +274,15 @@ COMMAND("mds compat rm_compat " \
COMMAND("mds compat rm_incompat " \
"name=feature,type=CephInt,range=0", \
"remove incompatible feature", "mds", "rw", "cli,rest")
+COMMAND("mds set " \
+ "name=key,type=CephChoices,strings=allow_new_snaps " \
+ "name=sure,type=CephString,req=false", \
+ "set <key>", \
+ "mds", "w", "cli,rest")
+COMMAND("mds unset " \
+ "name=key,type=CephChoices,strings=allow_new_snaps " \
+ "name=sure,type=CephString,req=false", \
+ "unset <key>", "mds", "w", "cli,rest")
COMMAND("mds add_data_pool " \
"name=poolid,type=CephInt,range=0", \
"add data pool <poolid>", "mds", "rw", "cli,rest")
@@ -283,14 +292,14 @@ COMMAND("mds remove_data_pool " \
COMMAND("mds newfs " \
"name=metadata,type=CephInt,range=0 " \
"name=data,type=CephInt,range=0 " \
- "name=sure,type=CephChoices,strings=--yes-i-really-mean-it", \
+ "name=sure,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
"make new filesystom using pools <metadata> and <data>", \
"mds", "rw", "cli,rest")
/*
* Monmap commands
*/
COMMAND("mon dump " \
- "name=epoch,type=CephInt,req=false", \
+ "name=epoch,type=CephInt,range=0,req=false", \
"dump formatted monmap (optionally from epoch)", \
"mon", "r", "cli,rest")
COMMAND("mon stat", "summarize monitor status", "mon", "r", "cli,rest")
@@ -384,7 +393,7 @@ COMMAND("osd crush create-or-move " \
"create entry or move existing entry for <name> <weight> at/to location <args>", \
"osd", "rw", "cli,rest")
COMMAND("osd crush move " \
- "name=id,type=CephOsdName " \
+ "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] " \
"name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=]", \
"move existing entry for <name> to location <args>", \
"osd", "rw", "cli,rest")
@@ -456,7 +465,7 @@ COMMAND("osd reweight " \
"reweight osd to 0.0 < <weight> < 1.0", "osd", "rw", "cli,rest")
COMMAND("osd lost " \
"name=id,type=CephInt,range=0 " \
- "name=sure,type=CephChoices,strings=--yes-i-really-mean-it", \
+ "name=sure,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
"mark osd as permanently lost. THIS DESTROYS DATA IF NO MORE REPLICAS EXIST, BE CAREFUL", \
"osd", "rw", "cli,rest")
COMMAND("osd create " \
@@ -479,13 +488,14 @@ COMMAND("osd pool rmsnap " \
COMMAND("osd pool create " \
"name=pool,type=CephPoolname " \
"name=pg_num,type=CephInt,range=0 " \
- "name=pgp_num,type=CephInt,range=0,req=false", \
+ "name=pgp_num,type=CephInt,range=0,req=false " \
+ "name=properties,type=CephString,n=N,req=false,goodchars=[A-Za-z0-9-_.=]", \
"create pool", "osd", "rw", "cli,rest")
COMMAND("osd pool delete " \
"name=pool,type=CephPoolname " \
- "name=pool2,type=CephPoolname " \
- "name=sure,type=CephChoices,strings=--yes-i-really-really-mean-it", \
- "delete pool (say pool twice, add --yes-i-really-really-mean-it)", \
+ "name=pool2,type=CephPoolname,req=false " \
+ "name=sure,type=CephChoices,strings=--yes-i-really-really-mean-it,req=false", \
+ "delete pool", \
"osd", "rw", "cli,rest")
COMMAND("osd pool rename " \
"name=srcpool,type=CephPoolname " \
@@ -497,8 +507,8 @@ COMMAND("osd pool get " \
"get pool parameter <var>", "osd", "r", "cli,rest")
COMMAND("osd pool set " \
"name=pool,type=CephPoolname " \
- "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset " \
- "name=val,type=CephInt", \
+ "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool " \
+ "name=val,type=CephString", \
"set pool parameter <var> to <val>", "osd", "rw", "cli,rest")
// 'val' is a CephString because it can include a unit. Perhaps
// there should be a Python type for validation/conversion of strings
@@ -516,6 +526,27 @@ COMMAND("osd thrash " \
"name=num_epochs,type=CephInt,range=0", \
"thrash OSDs for <num_epochs>", "osd", "rw", "cli,rest")
+// tiering
+// "osd tier add" / "osd tier remove": both take the base pool followed by the
+// tier pool. Every continued line of an entry ends in " \" to match the rest
+// of this table.
+COMMAND("osd tier add " \
+ "name=pool,type=CephPoolname " \
+ "name=tierpool,type=CephPoolname", \
+ "add the tier <tierpool> to base pool <pool>", "osd", "rw", "cli,rest")
+COMMAND("osd tier remove " \
+ "name=pool,type=CephPoolname " \
+ "name=tierpool,type=CephPoolname", \
+ "remove the tier <tierpool> from base pool <pool>", "osd", "rw", "cli,rest")
+COMMAND("osd tier cache-mode " \
+ "name=pool,type=CephPoolname " \
+ "name=mode,type=CephChoices,strings=none|writeback|invalidate+forward|readonly", \
+ "specify the caching mode for cache tier <pool>", "osd", "rw", "cli,rest")
+COMMAND("osd tier set-overlay " \
+ "name=pool,type=CephPoolname " \
+ "name=overlaypool,type=CephPoolname", \
+ "set the overlay pool for base pool <pool> to be <overlaypool>", "osd", "rw", "cli,rest")
+COMMAND("osd tier remove-overlay " \
+ "name=pool,type=CephPoolname ", \
+ "remove the overlay pool for base pool <pool>", "osd", "rw", "cli,rest")
+
/*
* mon/ConfigKeyService.cc
*/
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 45ca02027fc..d8c90bc3d76 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -1292,7 +1292,8 @@ void Monitor::handle_probe_reply(MMonProbe *m)
dout(10) << "handle_probe_reply " << m->get_source_inst() << *m << dendl;
dout(10) << " monmap is " << *monmap << dendl;
- if (!is_probing()) {
+ // discover name and addrs during probing or electing states.
+ if (!is_probing() && !is_electing()) {
m->put();
return;
}
@@ -1326,6 +1327,12 @@ void Monitor::handle_probe_reply(MMonProbe *m)
<< peer_name << " -> " << m->name << " in my monmap"
<< dendl;
monmap->rename(peer_name, m->name);
+
+ if (is_electing()) {
+ m->put();
+ bootstrap();
+ return;
+ }
} else {
dout(10) << " peer name is " << peer_name << dendl;
}
@@ -1342,6 +1349,12 @@ void Monitor::handle_probe_reply(MMonProbe *m)
}
}
+ // end discover phase
+ if (!is_probing()) {
+ m->put();
+ return;
+ }
+
assert(paxos != NULL);
if (is_synchronizing()) {
@@ -1841,13 +1854,7 @@ void Monitor::get_status(stringstream &ss, Formatter *f)
}
#undef COMMAND
-struct MonCommand {
- string cmdstring;
- string helpstring;
- string module;
- string req_perms;
- string availability;
-} mon_commands[] = {
+MonCommand mon_commands[] = {
#define COMMAND(parsesig, helptext, modulename, req_perms, avail) \
{parsesig, helptext, modulename, req_perms, avail},
#include <mon/MonCommands.h>
@@ -1896,6 +1903,26 @@ bool Monitor::_allowed_command(MonSession *s, string &module, string &prefix,
return capable;
}
+// Serialize a table of monitor commands through the given formatter.
+//
+// Every entry becomes a "cmdNNN" sub-section (NNN is the entry's
+// zero-padded position in the table) of one "command_descriptions"
+// object section; the formatter output is then flushed into *rdata.
+//
+// @param commands       first entry of the command table
+// @param commands_size  number of entries in the table
+// @param f              formatter that receives the descriptions
+// @param rdata          buffer the formatter is flushed into
+void get_command_descriptions(const MonCommand *commands,
+                              unsigned commands_size,
+                              Formatter *f,
+                              bufferlist *rdata) {
+  f->open_object_section("command_descriptions");
+  for (unsigned idx = 0; idx < commands_size; ++idx) {
+    const MonCommand &entry = commands[idx];
+
+    // section name: "cmd" followed by the index padded to three digits
+    ostringstream section;
+    section << "cmd" << setfill('0') << std::setw(3) << idx;
+    dump_cmddesc_to_json(f, section.str(),
+                         entry.cmdstring, entry.helpstring, entry.module,
+                         entry.req_perms, entry.availability);
+  }
+  f->close_section(); // command_descriptions
+
+  f->flush(*rdata);
+}
void Monitor::handle_command(MMonCommand *m)
{
@@ -1940,23 +1967,9 @@ void Monitor::handle_command(MMonCommand *m)
cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
if (prefix == "get_command_descriptions") {
- int cmdnum = 0;
- Formatter *f = new_formatter("json");
- f->open_object_section("command_descriptions");
- for (MonCommand *cp = mon_commands;
- cp < &mon_commands[ARRAY_SIZE(mon_commands)]; cp++) {
-
- ostringstream secname;
- secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
- dump_cmddesc_to_json(f, secname.str(),
- cp->cmdstring, cp->helpstring, cp->module,
- cp->req_perms, cp->availability);
- cmdnum++;
- }
- f->close_section(); // command_descriptions
-
bufferlist rdata;
- f->flush(rdata);
+ Formatter *f = new_formatter("json");
+ get_command_descriptions(mon_commands, ARRAY_SIZE(mon_commands), f, &rdata);
delete f;
reply_command(m, 0, "", rdata, 0);
return;
@@ -2548,67 +2561,98 @@ bool Monitor::_ms_dispatch(Message *m)
EntityName entity_name;
bool src_is_mon;
- src_is_mon = !connection || (connection->get_peer_type() & CEPH_ENTITY_TYPE_MON);
-
- if (connection) {
- bool reuse_caps = false;
- dout(20) << "have connection" << dendl;
- s = static_cast<MonSession *>(connection->get_priv());
- if (s && s->closed) {
- caps = s->caps;
- reuse_caps = true;
- s->put();
- s = NULL;
+ // regardless of who we are or who the sender is, the message must
+ // have a connection associated. If it doesn't then something fishy
+ // is going on.
+ assert(connection);
+
+ src_is_mon = (connection->get_peer_type() & CEPH_ENTITY_TYPE_MON);
+
+ bool reuse_caps = false;
+ dout(20) << "have connection" << dendl;
+ s = static_cast<MonSession *>(connection->get_priv());
+ if (s && s->closed) {
+ caps = s->caps;
+ reuse_caps = true;
+ s->put();
+ s = NULL;
+ }
+ if (!s) {
+ // if the sender is not a monitor, make sure their first message for a
+ // session is an MAuth. If it is not, assume it's a stray message,
+ // and considering that we are creating a new session it is safe to
+ // assume that the sender hasn't authenticated yet, so we have no way
+ // of assessing whether we should handle it or not.
+ if (!src_is_mon && (m->get_type() != CEPH_MSG_AUTH &&
+ m->get_type() != CEPH_MSG_MON_GET_MAP)) {
+ dout(1) << __func__ << " dropping stray message " << *m
+ << " from " << m->get_source_inst() << dendl;
+ return false;
}
- if (!s) {
- if (!exited_quorum.is_zero() && !src_is_mon) {
- waitlist_or_zap_client(m);
- return true;
- }
- dout(10) << "do not have session, making new one" << dendl;
- s = session_map.new_session(m->get_source_inst(), m->get_connection().get());
- m->get_connection()->set_priv(s->get());
- dout(10) << "ms_dispatch new session " << s << " for " << s->inst << dendl;
-
- if (m->get_connection()->get_peer_type() != CEPH_ENTITY_TYPE_MON) {
- dout(10) << "setting timeout on session" << dendl;
- // set an initial timeout here, so we will trim this session even if they don't
- // do anything.
- s->until = ceph_clock_now(g_ceph_context);
- s->until += g_conf->mon_subscribe_interval;
- } else {
- //give it monitor caps; the peer type has been authenticated
- reuse_caps = false;
- dout(5) << "setting monitor caps on this connection" << dendl;
- if (!s->caps.is_allow_all()) //but no need to repeatedly copy
- s->caps = *mon_caps;
- }
- if (reuse_caps)
- s->caps = caps;
+
+ if (!exited_quorum.is_zero() && !src_is_mon) {
+ waitlist_or_zap_client(m);
+ return true;
+ }
+
+ dout(10) << "do not have session, making new one" << dendl;
+ s = session_map.new_session(m->get_source_inst(), m->get_connection().get());
+ m->get_connection()->set_priv(s->get());
+ dout(10) << "ms_dispatch new session " << s << " for " << s->inst << dendl;
+
+ if (!src_is_mon) {
+ dout(10) << "setting timeout on session" << dendl;
+ // set an initial timeout here, so we will trim this session even if they don't
+ // do anything.
+ s->until = ceph_clock_now(g_ceph_context);
+ s->until += g_conf->mon_subscribe_interval;
} else {
- dout(20) << "ms_dispatch existing session " << s << " for " << s->inst << dendl;
+ //give it monitor caps; the peer type has been authenticated
+ reuse_caps = false;
+ dout(5) << "setting monitor caps on this connection" << dendl;
+ if (!s->caps.is_allow_all()) //but no need to repeatedly copy
+ s->caps = *mon_caps;
}
+ if (reuse_caps)
+ s->caps = caps;
+ } else {
+ dout(20) << "ms_dispatch existing session " << s << " for " << s->inst << dendl;
+ }
+
+ if (s) {
if (s->auth_handler) {
entity_name = s->auth_handler->get_entity_name();
}
- }
-
- if (s)
dout(20) << " caps " << s->caps.get_str() << dendl;
+ }
if (is_synchronizing() && !src_is_mon) {
waitlist_or_zap_client(m);
return true;
}
- {
- switch (m->get_type()) {
-
+ ret = dispatch(s, m, src_is_mon);
+
+ if (s) {
+ s->put();
+ }
+
+ return ret;
+}
+
+bool Monitor::dispatch(MonSession *s, Message *m, const bool src_is_mon)
+{
+ bool ret = true;
+
+ assert(m != NULL);
+
+ switch (m->get_type()) {
+
case MSG_ROUTE:
handle_route(static_cast<MRoute*>(m));
break;
- // misc
+ // misc
case CEPH_MSG_MON_GET_MAP:
handle_mon_get_map(static_cast<MMonGetMap*>(m));
break;
@@ -2634,12 +2678,11 @@ bool Monitor::_ms_dispatch(Message *m)
case MSG_MON_SYNC:
handle_sync(static_cast<MMonSync*>(m));
break;
-
case MSG_MON_SCRUB:
handle_scrub(static_cast<MMonScrub*>(m));
break;
- // OSDs
+ // OSDs
case MSG_OSD_MARK_ME_DOWN:
case MSG_OSD_FAILURE:
case MSG_OSD_BOOT:
@@ -2652,20 +2695,20 @@ bool Monitor::_ms_dispatch(Message *m)
paxos_service[PAXOS_OSDMAP]->dispatch((PaxosServiceMessage*)m);
break;
- // MDSs
+ // MDSs
case MSG_MDS_BEACON:
case MSG_MDS_OFFLOAD_TARGETS:
paxos_service[PAXOS_MDSMAP]->dispatch((PaxosServiceMessage*)m);
break;
- // auth
+ // auth
case MSG_MON_GLOBAL_ID:
case CEPH_MSG_AUTH:
/* no need to check caps here */
paxos_service[PAXOS_AUTH]->dispatch((PaxosServiceMessage*)m);
break;
- // pg
+ // pg
case CEPH_MSG_STATFS:
case MSG_PGSTATS:
case MSG_GETPOOLSTATS:
@@ -2676,7 +2719,7 @@ bool Monitor::_ms_dispatch(Message *m)
paxos_service[PAXOS_OSDMAP]->dispatch((PaxosServiceMessage*)m);
break;
- // log
+ // log
case MSG_LOG:
paxos_service[PAXOS_LOG]->dispatch((PaxosServiceMessage*)m);
break;
@@ -2685,60 +2728,60 @@ bool Monitor::_ms_dispatch(Message *m)
clog.handle_log_ack((MLogAck*)m);
break;
- // monmap
+ // monmap
case MSG_MON_JOIN:
paxos_service[PAXOS_MONMAP]->dispatch((PaxosServiceMessage*)m);
break;
- // paxos
+ // paxos
case MSG_MON_PAXOS:
{
- MMonPaxos *pm = static_cast<MMonPaxos*>(m);
- if (!src_is_mon &&
- !s->is_capable("mon", MON_CAP_X)) {
- //can't send these!
- pm->put();
- break;
- }
+ MMonPaxos *pm = static_cast<MMonPaxos*>(m);
+ if (!src_is_mon ||
+ !s->is_capable("mon", MON_CAP_X)) {
+ //can't send these!
+ pm->put();
+ break;
+ }
- if (state == STATE_SYNCHRONIZING) {
- // we are synchronizing. These messages would do us no
- // good, thus just drop them and ignore them.
- dout(10) << __func__ << " ignore paxos msg from "
- << pm->get_source_inst() << dendl;
- pm->put();
- break;
- }
+ if (state == STATE_SYNCHRONIZING) {
+ // we are synchronizing. These messages would do us no
+ // good, thus just drop them and ignore them.
+ dout(10) << __func__ << " ignore paxos msg from "
+ << pm->get_source_inst() << dendl;
+ pm->put();
+ break;
+ }
- // sanitize
- if (pm->epoch > get_epoch()) {
- bootstrap();
- pm->put();
- break;
- }
- if (pm->epoch != get_epoch()) {
- pm->put();
- break;
- }
+ // sanitize
+ if (pm->epoch > get_epoch()) {
+ bootstrap();
+ pm->put();
+ break;
+ }
+ if (pm->epoch != get_epoch()) {
+ pm->put();
+ break;
+ }
- paxos->dispatch((PaxosServiceMessage*)m);
+ paxos->dispatch((PaxosServiceMessage*)m);
}
break;
- // elector messages
+ // elector messages
case MSG_MON_ELECTION:
//check privileges here for simplicity
if (s &&
- !s->is_capable("mon", MON_CAP_X)) {
- dout(0) << "MMonElection received from entity without enough caps!"
- << s->caps << dendl;
- m->put();
- break;
+ !s->is_capable("mon", MON_CAP_X)) {
+ dout(0) << "MMonElection received from entity without enough caps!"
+ << s->caps << dendl;
+ m->put();
+ break;
}
if (!is_probing() && !is_synchronizing()) {
- elector.dispatch(m);
+ elector.dispatch(m);
} else {
- m->put();
+ m->put();
}
break;
@@ -2756,10 +2799,6 @@ bool Monitor::_ms_dispatch(Message *m)
default:
ret = false;
- }
- }
- if (s) {
- s->put();
}
return ret;
diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h
index df4a751361a..2c1c2cdeb19 100644
--- a/src/mon/Monitor.h
+++ b/src/mon/Monitor.h
@@ -700,6 +700,8 @@ public:
lock.Unlock();
return ret;
}
+ // dissociate message handling from session and connection logic
+ bool dispatch(MonSession *s, Message *m, const bool src_is_mon);
//mon_caps is used for un-connected messages from monitors
MonCap * mon_caps;
bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new);
@@ -844,5 +846,17 @@ public:
long parse_pos_long(const char *s, ostream *pss = NULL);
+// One entry of a monitor command table. The fields correspond, in order,
+// to the arguments of the COMMAND(parsesig, helptext, modulename,
+// req_perms, avail) macro that is used to populate mon_commands[].
+struct MonCommand {
+ string cmdstring;     // command signature (COMMAND's parsesig)
+ string helpstring;    // help text (COMMAND's helptext)
+ string module;        // module name (COMMAND's modulename), e.g. "osd"
+ string req_perms;     // required permissions, e.g. "r" or "rw"
+ string availability;  // COMMAND's avail argument, e.g. "cli,rest"
+};
+
+// Dump commands_size entries of the commands table through formatter f
+// as a "command_descriptions" object section, then flush the formatter
+// into *rdata.
+void get_command_descriptions(const MonCommand *commands,
+ unsigned commands_size,
+ Formatter *f,
+ bufferlist *rdata);
#endif
diff --git a/src/mon/MonitorDBStore.h b/src/mon/MonitorDBStore.h
index 276620f7516..eda5aaf1802 100644
--- a/src/mon/MonitorDBStore.h
+++ b/src/mon/MonitorDBStore.h
@@ -41,7 +41,8 @@ class MonitorDBStore
string key, endkey;
bufferlist bl;
- Op() { }
+ Op()
+ : type(0) { }
Op(int t, string p, string k)
: type(t), prefix(p), key(k) { }
Op(int t, const string& p, string k, bufferlist& b)
@@ -509,6 +510,10 @@ class MonitorDBStore
db->compact_prefix(prefix);
}
+ // Forward to the backing store's get_estimated_size() and return its
+ // result; extras is handed through untouched.
+ // NOTE(review): presumably extras is filled with a per-category size
+ // breakdown by the backing store -- confirm against its implementation.
+ uint64_t get_estimated_size(map<string, uint64_t> &extras) {
+ return db->get_estimated_size(extras);
+ }
+
MonitorDBStore(const string& path) :
db(0), do_dump(false), dump_fd(-1) {
string::const_reverse_iterator rit;
@@ -523,8 +528,8 @@ class MonitorDBStore
LevelDBStore *db_ptr = new LevelDBStore(g_ceph_context, full_path);
if (!db_ptr) {
- std::cout << __func__ << " error initializing level db back storage in "
- << full_path << std::endl;
+ derr << __func__ << " error initializing level db back storage in "
+ << full_path << dendl;
assert(0 != "MonitorDBStore: error initializing level db back storage");
}
db.reset(db_ptr);
diff --git a/src/mon/MonmapMonitor.cc b/src/mon/MonmapMonitor.cc
index 799f19df154..ca855592445 100644
--- a/src/mon/MonmapMonitor.cc
+++ b/src/mon/MonmapMonitor.cc
@@ -298,20 +298,45 @@ bool MonmapMonitor::prepare_command(MMonCommand *m)
addr.set_port(CEPH_MON_PORT);
}
- if (pending_map.contains(addr) ||
- pending_map.contains(name)) {
+ /**
+ * If we have a monitor with the same name and different addr, then EEXIST
+ * If we have a monitor with the same addr and different name, then EEXIST
+ * If we have a monitor with the same addr and same name, then return as if
+ * we had just added the monitor.
+ * If we don't have the monitor, add it.
+ */
+
+ err = 0;
+ if (!ss.str().empty())
+ ss << "; ";
+
+ do {
+ if (pending_map.contains(addr)) {
+ string n = pending_map.get_name(addr);
+ if (n == name)
+ break;
+ } else if (pending_map.contains(name)) {
+ entity_addr_t tmp_addr = pending_map.get_addr(name);
+ if (tmp_addr == addr)
+ break;
+ } else {
+ break;
+ }
err = -EEXIST;
- if (!ss.str().empty())
- ss << "; ";
- ss << "mon " << name << " " << addr << " already exists";
+ ss << "mon." << name << " at " << addr << " already exists";
+ goto out;
+ } while (false);
+
+ ss << "added mon." << name << " at " << addr;
+ if (pending_map.contains(name)) {
goto out;
}
pending_map.add(name, addr);
pending_map.last_changed = ceph_clock_now(g_ceph_context);
- ss << "added mon." << name << " at " << addr;
getline(ss, rs);
- wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, get_last_committed()));
+ wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+ get_last_committed()));
return true;
} else if (prefix == "mon remove") {
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 32413c111d3..425375b29e2 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -120,7 +120,12 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap)
* We will possibly have a stashed latest that *we* wrote, and we will
* always be sure to have the oldest full map in the first..last range
* due to encode_trim_extra(), which includes the oldest full map in the trim
- * transaction. Start with whichever is newer.
+ * transaction.
+ *
+ * encode_trim_extra() does not however write the full map's
+ * version to 'full_latest'. This is only done when we are building the
+ * full maps from the incremental versions. But don't panic! We make sure
+ * that the following conditions find whichever full map version is newer.
*/
version_t latest_full = get_version_latest_full();
if (latest_full == 0 && get_first_committed() > 1)
@@ -179,32 +184,49 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap)
}
// walk through incrementals
- MonitorDBStore::Transaction t;
+ MonitorDBStore::Transaction *t = NULL;
+ size_t tx_size = 0;
while (version > osdmap.epoch) {
bufferlist inc_bl;
int err = get_version(osdmap.epoch+1, inc_bl);
assert(err == 0);
assert(inc_bl.length());
-
+
dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1 << dendl;
OSDMap::Incremental inc(inc_bl);
err = osdmap.apply_incremental(inc);
assert(err == 0);
+ if (t == NULL)
+ t = new MonitorDBStore::Transaction;
+
// write out the full map for all past epochs
bufferlist full_bl;
osdmap.encode(full_bl);
- put_version_full(&t, osdmap.epoch, full_bl);
+ tx_size += full_bl.length();
+
+ put_version_full(t, osdmap.epoch, full_bl);
+ put_version_latest_full(t, osdmap.epoch);
// share
dout(1) << osdmap << dendl;
if (osdmap.epoch == 1) {
- t.erase("mkfs", "osdmap");
+ t->erase("mkfs", "osdmap");
+ }
+
+ if (tx_size > g_conf->mon_sync_max_payload_size*2) {
+ mon->store->apply_transaction(*t);
+ delete t;
+ t = NULL;
+ tx_size = 0;
}
}
- if (!t.empty())
- mon->store->apply_transaction(t);
+
+ if (t != NULL) {
+ mon->store->apply_transaction(*t);
+ delete t;
+ }
for (int o = 0; o < osdmap.get_max_osd(); o++) {
if (osdmap.is_down(o)) {
@@ -620,7 +642,6 @@ void OSDMonitor::encode_trim_extra(MonitorDBStore::Transaction *tx, version_t fi
bufferlist bl;
get_version_full(first, bl);
put_version_full(tx, first, bl);
- put_version_latest_full(tx, first);
}
// -------------
@@ -1457,17 +1478,15 @@ bool OSDMonitor::prepare_remove_snaps(MRemoveSnaps *m)
if (!pi.removed_snaps.contains(*q) &&
(!pending_inc.new_pools.count(p->first) ||
!pending_inc.new_pools[p->first].removed_snaps.contains(*q))) {
- if (pending_inc.new_pools.count(p->first) == 0)
- pending_inc.new_pools[p->first] = pi;
- pg_pool_t& newpi = pending_inc.new_pools[p->first];
- newpi.removed_snaps.insert(*q);
+ pg_pool_t *newpi = pending_inc.get_new_pool(p->first, &pi);
+ newpi->removed_snaps.insert(*q);
dout(10) << " pool " << p->first << " removed_snaps added " << *q
- << " (now " << newpi.removed_snaps << ")" << dendl;
- if (*q > newpi.get_snap_seq()) {
- dout(10) << " pool " << p->first << " snap_seq " << newpi.get_snap_seq() << " -> " << *q << dendl;
- newpi.set_snap_seq(*q);
+ << " (now " << newpi->removed_snaps << ")" << dendl;
+ if (*q > newpi->get_snap_seq()) {
+ dout(10) << " pool " << p->first << " snap_seq " << newpi->get_snap_seq() << " -> " << *q << dendl;
+ newpi->set_snap_seq(*q);
}
- newpi.set_snap_epoch(pending_inc.epoch);
+ newpi->set_snap_epoch(pending_inc.epoch);
}
}
}
@@ -2156,7 +2175,7 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
osdmap.get_inst(i));
}
r = 0;
- ss << " instructed to " << whostr;
+ ss << " instructed to " << pvec.back();
} else {
long osd = parse_osd_id(whostr.c_str(), &ss);
if (osd < 0) {
@@ -2326,9 +2345,7 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
void OSDMonitor::update_pool_flags(int64_t pool_id, uint64_t flags)
{
const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
- if (pending_inc.new_pools.count(pool_id) == 0)
- pending_inc.new_pools[pool_id] = *pool;
- pending_inc.new_pools[pool_id].flags = flags;
+ pending_inc.get_new_pool(pool_id, pool)->flags = flags;
}
bool OSDMonitor::update_pools_status()
@@ -2477,10 +2494,11 @@ int OSDMonitor::prepare_new_pool(MPoolOp *m)
MonSession *session = m->get_session();
if (!session)
return -EPERM;
+ vector<string> properties;
if (m->auid)
- return prepare_new_pool(m->name, m->auid, m->crush_rule, 0, 0);
+ return prepare_new_pool(m->name, m->auid, m->crush_rule, 0, 0, properties);
else
- return prepare_new_pool(m->name, session->auid, m->crush_rule, 0, 0);
+ return prepare_new_pool(m->name, session->auid, m->crush_rule, 0, 0, properties);
}
/**
@@ -2489,11 +2507,13 @@ int OSDMonitor::prepare_new_pool(MPoolOp *m)
* @param crush_rule The crush rule to use. If <0, will use the system default
* @param pg_num The pg_num to use. If set to 0, will use the system default
* @param pgp_num The pgp_num to use. If set to 0, will use the system default
+ * @param properties An opaque list of key[=value] pairs for pool configuration
*
* @return 0 in all cases. That's silly.
*/
int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, int crush_rule,
- unsigned pg_num, unsigned pgp_num)
+ unsigned pg_num, unsigned pgp_num,
+ const vector<string> &properties)
{
for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
p != pending_inc.new_pool_names.end();
@@ -2505,22 +2525,36 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, int crush_rule,
if (-1 == pending_inc.new_pool_max)
pending_inc.new_pool_max = osdmap.pool_max;
int64_t pool = ++pending_inc.new_pool_max;
- pending_inc.new_pools[pool].type = pg_pool_t::TYPE_REP;
- pending_inc.new_pools[pool].flags = g_conf->osd_pool_default_flags;
+ pg_pool_t empty;
+ pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
+ pi->type = pg_pool_t::TYPE_REP;
+ pi->flags = g_conf->osd_pool_default_flags;
if (g_conf->osd_pool_default_flag_hashpspool)
- pending_inc.new_pools[pool].flags |= pg_pool_t::FLAG_HASHPSPOOL;
+ pi->flags |= pg_pool_t::FLAG_HASHPSPOOL;
- pending_inc.new_pools[pool].size = g_conf->osd_pool_default_size;
- pending_inc.new_pools[pool].min_size = g_conf->get_osd_pool_default_min_size();
+ pi->size = g_conf->osd_pool_default_size;
+ pi->min_size = g_conf->get_osd_pool_default_min_size();
if (crush_rule >= 0)
- pending_inc.new_pools[pool].crush_ruleset = crush_rule;
+ pi->crush_ruleset = crush_rule;
else
- pending_inc.new_pools[pool].crush_ruleset = g_conf->osd_pool_default_crush_rule;
- pending_inc.new_pools[pool].object_hash = CEPH_STR_HASH_RJENKINS;
- pending_inc.new_pools[pool].set_pg_num(pg_num ? pg_num : g_conf->osd_pool_default_pg_num);
- pending_inc.new_pools[pool].set_pgp_num(pgp_num ? pgp_num : g_conf->osd_pool_default_pgp_num);
- pending_inc.new_pools[pool].last_change = pending_inc.epoch;
- pending_inc.new_pools[pool].auid = auid;
+ pi->crush_ruleset = g_conf->osd_pool_default_crush_rule;
+ pi->object_hash = CEPH_STR_HASH_RJENKINS;
+ pi->set_pg_num(pg_num ? pg_num : g_conf->osd_pool_default_pg_num);
+ pi->set_pgp_num(pgp_num ? pgp_num : g_conf->osd_pool_default_pgp_num);
+ pi->last_change = pending_inc.epoch;
+ pi->auid = auid;
+  // Record the caller-supplied pool properties. Each element is either
+  // "key" (stored with an empty value) or "key=value", split on the
+  // first '='.
+  for (vector<string>::const_iterator i = properties.begin();
+       i != properties.end();
+       ++i) {
+    size_t equal = i->find('=');
+    if (equal == string::npos)
+      pi->properties[*i] = string();
+    else {
+      const string key = i->substr(0, equal);
+      // start one past the '=' so the separator is not stored as the
+      // first character of the value
+      const string value = i->substr(equal + 1);
+      pi->properties[key] = value;
+    }
+  }
pending_inc.new_pool_names[pool] = name;
return 0;
}
@@ -2584,6 +2618,125 @@ void OSDMonitor::parse_loc_map(const vector<string>& args, map<string,string> *
}
}
+/**
+ * Apply an "osd pool set <pool> <var> <val>" request to the pending
+ * incremental.
+ *
+ * The pool is looked up by name in the current osdmap. The update starts
+ * from the pending copy of the pool if there is one, otherwise from the
+ * committed one. On success the modified pool is stored in
+ * pending_inc.new_pools with last_change set to the pending epoch.
+ *
+ * @param cmdmap parsed command arguments ("pool", "var" and "val")
+ * @param ss     receives the status or error message for the caller
+ *
+ * @return 0 on success; -ENOENT for an unknown pool or crush ruleset,
+ *         -EINVAL for an unparsable or out-of-range value or an unknown
+ *         variable, -EAGAIN while pgs are still being created
+ */
+int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
+                                         stringstream& ss)
+{
+  string poolstr;
+  cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
+  if (pool < 0) {
+    ss << "unrecognized pool '" << poolstr << "'";
+    return -ENOENT;
+  }
+  string var;
+  cmd_getval(g_ceph_context, cmdmap, "var", var);
+
+  // work on a copy; prefer the pending version so that earlier changes
+  // queued in this epoch are not lost
+  pg_pool_t p = *osdmap.get_pg_pool(pool);
+  if (pending_inc.new_pools.count(pool))
+    p = pending_inc.new_pools[pool];
+
+  // accept val as a json string or int, and parse out int or float
+  // values from the string as needed
+  string val;
+  cmd_getval(g_ceph_context, cmdmap, "val", val);
+  string interr;
+  int64_t n = 0;
+  if (!cmd_getval(g_ceph_context, cmdmap, "val", n))
+    n = strict_strtoll(val.c_str(), 10, &interr);
+  // the float form is parsed alongside the integer one; none of the
+  // variables handled below consumes it
+  string floaterr;
+  float f;
+  if (!cmd_getval(g_ceph_context, cmdmap, "val", f))
+    f = strict_strtod(val.c_str(), &floaterr);
+
+  if (var == "size") {
+    if (interr.length()) {
+      ss << "error parsing integer value '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    // reject negative sizes too, not just zero
+    if (n <= 0 || n > 10) {
+      ss << "pool size must be between 1 and 10";
+      return -EINVAL;
+    }
+    p.size = n;
+    // min_size may never exceed size
+    if (n < p.min_size)
+      p.min_size = n;
+    ss << "set pool " << pool << " size to " << n;
+  } else if (var == "min_size") {
+    if (interr.length()) {
+      ss << "error parsing integer value '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    p.min_size = n;
+    ss << "set pool " << pool << " min_size to " << n;
+  } else if (var == "crash_replay_interval") {
+    if (interr.length()) {
+      ss << "error parsing integer value '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    p.crash_replay_interval = n;
+    ss << "set pool " << pool << " to crash_replay_interval to " << n;
+  } else if (var == "pg_num") {
+    if (interr.length()) {
+      ss << "error parsing integer value '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    // pg_num can only grow; a non-increasing value is reported but is
+    // not treated as an error
+    if (n <= (int)p.get_pg_num()) {
+      ss << "specified pg_num " << n << " <= current " << p.get_pg_num();
+    } else if (!mon->pgmon()->pg_map.creating_pgs.empty()) {
+      ss << "currently creating pgs, wait";
+      return -EAGAIN;
+    } else {
+      p.set_pg_num(n);
+      ss << "set pool " << pool << " pg_num to " << n;
+    }
+  } else if (var == "pgp_num") {
+    if (interr.length()) {
+      ss << "error parsing integer value '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    // pgp_num may not exceed pg_num; reported but not an error
+    if (n > (int)p.get_pg_num()) {
+      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
+    } else if (!mon->pgmon()->pg_map.creating_pgs.empty()) {
+      ss << "still creating pgs, wait";
+      return -EAGAIN;
+    } else {
+      p.set_pgp_num(n);
+      ss << "set pool " << pool << " pgp_num to " << n;
+    }
+  } else if (var == "crush_ruleset") {
+    if (interr.length()) {
+      ss << "error parsing integer value '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    if (osdmap.crush->rule_exists(n)) {
+      p.crush_ruleset = n;
+      ss << "set pool " << pool << " crush_ruleset to " << n;
+    } else {
+      ss << "crush ruleset " << n << " does not exist";
+      return -ENOENT;
+    }
+  } else if (var == "hashpspool") {
+    if (val == "true") {
+      p.flags |= pg_pool_t::FLAG_HASHPSPOOL;
+      ss << "set";
+    } else if (val == "false") {
+      // clear the bit; XOR would turn the flag *on* if it was off
+      p.flags &= ~pg_pool_t::FLAG_HASHPSPOOL;
+      ss << "unset";
+    } else {
+      ss << "expecting value true or false";
+      return -EINVAL;
+    }
+    ss << " pool " << pool << " flag hashpspool";
+  } else {
+    ss << "unrecognized variable '" << var << "'";
+    return -EINVAL;
+  }
+
+  p.last_change = pending_inc.epoch;
+  pending_inc.new_pools[pool] = p;
+  return 0;
+}
+
bool OSDMonitor::prepare_command(MMonCommand *m)
{
bool ret = false;
@@ -2701,9 +2854,15 @@ bool OSDMonitor::prepare_command(MMonCommand *m)
err = -EINVAL;
goto reply;
}
- int bucketno = newcrush.add_bucket(0, CRUSH_BUCKET_STRAW,
+ int bucketno;
+ err = newcrush.add_bucket(0, CRUSH_BUCKET_STRAW,
CRUSH_HASH_DEFAULT, type, 0, NULL,
- NULL);
+ NULL, &bucketno);
+ if (err < 0) {
+ char buf[128];
+ ss << "add_bucket error: '" << strerror_r(-err, buf, sizeof(buf)) << "'";
+ goto reply;
+ }
err = newcrush.set_item_name(bucketno, name);
if (err < 0) {
ss << "error setting bucket name to '" << name << "'";
@@ -2827,6 +2986,7 @@ bool OSDMonitor::prepare_command(MMonCommand *m)
string args;
vector<string> argvec;
+ cmd_getval(g_ceph_context, cmdmap, "name", name);
cmd_getval(g_ceph_context, cmdmap, "args", argvec);
map<string,string> loc;
parse_loc_map(argvec, &loc);
@@ -2981,7 +3141,7 @@ bool OSDMonitor::prepare_command(MMonCommand *m)
cmd_getval(g_ceph_context, cmdmap, "weight", w);
err = newcrush.adjust_item_weightf(g_ceph_context, id, w);
- if (err == 0) {
+ if (err >= 0) {
pending_inc.crush.clear();
newcrush.encode(pending_inc.crush);
ss << "reweighted item id " << id << " name '" << name << "' to " << w
@@ -3383,7 +3543,11 @@ done:
string snapname;
cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
const pg_pool_t *p = osdmap.get_pg_pool(pool);
- if (p->snap_exists(snapname.c_str())) {
+ if (p->is_unmanaged_snaps_mode()) {
+ ss << "pool " << poolstr << " is in unmanaged snaps mode";
+ err = -EINVAL;
+ goto reply;
+ } else if (p->snap_exists(snapname.c_str())) {
ss << "pool " << poolstr << " snap " << snapname << " already exists";
err = 0;
goto reply;
@@ -3417,7 +3581,11 @@ done:
string snapname;
cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
const pg_pool_t *p = osdmap.get_pg_pool(pool);
- if (!p->snap_exists(snapname.c_str())) {
+ if (p->is_unmanaged_snaps_mode()) {
+ ss << "pool " << poolstr << " is in unmanaged snaps mode";
+ err = -EINVAL;
+ goto reply;
+ } else if (!p->snap_exists(snapname.c_str())) {
ss << "pool " << poolstr << " snap " << snapname << " does not exist";
err = 0;
goto reply;
@@ -3468,9 +3636,13 @@ done:
goto reply;
}
+ vector<string> properties;
+ cmd_getval(g_ceph_context, cmdmap, "properties", properties);
+
err = prepare_new_pool(poolstr, 0, // auid=0 for admin created pool
-1, // default crush rule
- pg_num, pgp_num);
+ pg_num, pgp_num,
+ properties);
if (err < 0 && err != -EEXIST) {
goto reply;
}
@@ -3533,71 +3705,188 @@ done:
return true;
}
} else if (prefix == "osd pool set") {
- // set a pool variable to a positive int
+ err = prepare_command_pool_set(cmdmap, ss);
+ if (err < 0)
+ goto reply;
+
+ getline(ss, rs);
+ wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, get_last_committed()));
+ return true;
+ } else if (prefix == "osd tier add") {
string poolstr;
cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
- int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
- if (pool < 0) {
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
ss << "unrecognized pool '" << poolstr << "'";
err = -ENOENT;
- } else {
- const pg_pool_t *p = osdmap.get_pg_pool(pool);
- int64_t n;
- cmd_getval(g_ceph_context, cmdmap, "val", n);
- string var;
- cmd_getval(g_ceph_context, cmdmap, "var", var);
- if (pending_inc.new_pools.count(pool) == 0)
- pending_inc.new_pools[pool] = *p;
- if (var == "size") {
- if (n == 0 || n > 10) {
- ss << "pool size must be between 1 and 10";
- err = -EINVAL;
- goto reply;
- }
- pending_inc.new_pools[pool].size = n;
- if (n < p->min_size)
- pending_inc.new_pools[pool].min_size = n;
- ss << "set pool " << pool << " size to " << n;
- } else if (var == "min_size") {
- pending_inc.new_pools[pool].min_size = n;
- ss << "set pool " << pool << " min_size to " << n;
- } else if (var == "crash_replay_interval") {
- pending_inc.new_pools[pool].crash_replay_interval = n;
- ss << "set pool " << pool << " to crash_replay_interval to " << n;
- } else if (var == "pg_num") {
- if (n <= p->get_pg_num()) {
- ss << "specified pg_num " << n << " <= current " << p->get_pg_num();
- } else if (!mon->pgmon()->pg_map.creating_pgs.empty()) {
- ss << "currently creating pgs, wait";
- err = -EAGAIN;
- } else {
- pending_inc.new_pools[pool].set_pg_num(n);
- ss << "set pool " << pool << " pg_num to " << n;
- }
- } else if (var == "pgp_num") {
- if (n > p->get_pg_num()) {
- ss << "specified pgp_num " << n << " > pg_num " << p->get_pg_num();
- } else if (!mon->pgmon()->pg_map.creating_pgs.empty()) {
- ss << "still creating pgs, wait";
- err = -EAGAIN;
- } else {
- pending_inc.new_pools[pool].set_pgp_num(n);
- ss << "set pool " << pool << " pgp_num to " << n;
- }
- } else if (var == "crush_ruleset") {
- if (osdmap.crush->rule_exists(n)) {
- pending_inc.new_pools[pool].crush_ruleset = n;
- ss << "set pool " << pool << " crush_ruleset to " << n;
- } else {
- ss << "crush ruleset " << n << " does not exist";
- err = -ENOENT;
- }
- }
- pending_inc.new_pools[pool].last_change = pending_inc.epoch;
- getline(ss, rs);
- wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, get_last_committed()));
- return true;
+ goto reply;
+ }
+ string tierpoolstr;
+ cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
+ int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
+ if (tierpool_id < 0) {
+ ss << "unrecognized pool '" << tierpoolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ assert(p);
+ const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
+ assert(tp);
+ if (p->tiers.count(tierpool_id)) {
+ assert(tp->tier_of == pool_id);
+ err = 0;
+ ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
+ goto reply;
+ }
+ if (tp->is_tier()) {
+ ss << "tier pool '" << tierpoolstr << "' is already a tier of '"
+ << osdmap.get_pool_name(tp->tier_of) << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+    // go
+    // get_new_pool() is given the committed pool to start from, so each
+    // id must be paired with its own pool: p for the base pool, tp for
+    // the tier pool. Passing p for the tier pool would start the tier
+    // pool's pending entry from the base pool's settings.
+    pending_inc.get_new_pool(pool_id, p)->tiers.insert(tierpool_id);
+    pending_inc.get_new_pool(tierpool_id, tp)->tier_of = pool_id;
+    ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
+    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(), get_last_committed()));
+    return true;
+ } else if (prefix == "osd tier remove") {
+ string poolstr;
+ cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ string tierpoolstr;
+ cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
+ int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
+ if (tierpool_id < 0) {
+ ss << "unrecognized pool '" << tierpoolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ assert(p);
+ const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
+ assert(tp);
+ if (p->tiers.count(tierpool_id) == 0) {
+ ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
+ err = 0;
+ goto reply;
+ }
+ if (tp->tier_of != pool_id) {
+ ss << "tier pool '" << tierpoolstr << "' is a tier of '" << tp->tier_of << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ if (p->read_tier == tierpool_id) {
+ ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
+ err = -EBUSY;
+ goto reply;
+ }
+ // go
+ pending_inc.get_new_pool(pool_id, p)->tiers.erase(tierpool_id);
+ pending_inc.get_new_pool(tierpool_id, tp)->clear_tier();
+ ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
+ wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(), get_last_committed()));
+ return true;
+ } else if (prefix == "osd tier set-overlay") {
+ string poolstr;
+ cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ string overlaypoolstr;
+ cmd_getval(g_ceph_context, cmdmap, "overlaypool", overlaypoolstr);
+ int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
+ if (overlaypool_id < 0) {
+ ss << "unrecognized pool '" << overlaypoolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ assert(p);
+ if (p->tiers.count(overlaypool_id) == 0) {
+ ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ if (p->read_tier == overlaypool_id) {
+ err = 0;
+ ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
+ goto reply;
+ }
+ if (p->has_read_tier()) {
+ ss << "pool '" << poolstr << "' has overlay '"
+ << osdmap.get_pool_name(p->read_tier)
+ << "'; please remove-overlay first";
+ err = -EINVAL;
+ goto reply;
+ }
+ // go
+ pending_inc.get_new_pool(pool_id, p)->read_tier = overlaypool_id;
+ pending_inc.get_new_pool(pool_id, p)->write_tier = overlaypool_id;
+ ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
+ wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(), get_last_committed()));
+ return true;
+ } else if (prefix == "osd tier remove-overlay") {
+ string poolstr;
+ cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ assert(p);
+ if (!p->has_read_tier()) {
+ err = 0;
+ ss << "there is now (or already was) no overlay for '" << poolstr << "'";
+ goto reply;
+ }
+ // go
+ pending_inc.get_new_pool(pool_id, p)->clear_read_tier();
+ pending_inc.get_new_pool(pool_id, p)->clear_write_tier();
+ ss << "there is now (or already was) no overlay for '" << poolstr << "'";
+ wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(), get_last_committed()));
+ return true;
+ } else if (prefix == "osd tier cache-mode") {
+ string poolstr;
+ cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ assert(p);
+ if (!p->is_tier()) {
+ ss << "pool '" << poolstr << "' is not a tier";
+ err = -EINVAL;
+ goto reply;
+ }
+ string modestr;
+ cmd_getval(g_ceph_context, cmdmap, "mode", modestr);
+ pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
+ if (mode < 0) {
+ ss << "'" << modestr << "' is not a valid cache mode";
+ err = -EINVAL;
+ goto reply;
}
+ // go
+ pending_inc.get_new_pool(pool_id, p)->cache_mode = mode;
+ ss << "set cache-mode for pool '" << poolstr
+ << "' to " << pg_pool_t::get_cache_mode_name(mode);
+ wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(), get_last_committed()));
+ return true;
} else if (prefix == "osd pool set-quota") {
string poolstr;
cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
@@ -3627,13 +3916,11 @@ done:
goto reply;
}
- if (pending_inc.new_pools.count(pool_id) == 0)
- pending_inc.new_pools[pool_id] = *osdmap.get_pg_pool(pool_id);
-
+ pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
if (field == "max_objects") {
- pending_inc.new_pools[pool_id].quota_max_objects = value;
+ pi->quota_max_objects = value;
} else if (field == "max_bytes") {
- pending_inc.new_pools[pool_id].quota_max_bytes = value;
+ pi->quota_max_bytes = value;
} else {
assert(0 == "unrecognized option");
}
@@ -3658,7 +3945,6 @@ done:
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, get_last_committed()));
return true;
}
-
} else if (prefix == "osd thrash") {
int64_t num_epochs;
cmd_getval(g_ceph_context, cmdmap, "num_epochs", num_epochs, int64_t(0));
@@ -3694,7 +3980,7 @@ bool OSDMonitor::preprocess_pool_op(MPoolOp *m)
_pool_op_reply(m, 0, osdmap.get_epoch());
return true;
}
-
+
// check if the snap and snapname exists
bool snap_exists = false;
const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
@@ -3790,6 +4076,38 @@ bool OSDMonitor::prepare_pool_op(MPoolOp *m)
int ret = 0;
bool changed = false;
+ if (!osdmap.have_pg_pool(m->pool)) {
+ _pool_op_reply(m, -ENOENT, osdmap.get_epoch());
+ return false;
+ }
+
+ const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);
+
+ switch (m->op) {
+ case POOL_OP_CREATE_SNAP:
+ case POOL_OP_DELETE_SNAP:
+ if (!pool->is_unmanaged_snaps_mode()) {
+ bool snap_exists = pool->snap_exists(m->name.c_str());
+ if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
+ || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
+ ret = 0;
+ } else {
+ break;
+ }
+ } else {
+ ret = -EINVAL;
+ }
+ _pool_op_reply(m, ret, osdmap.get_epoch());
+ return false;
+
+ case POOL_OP_CREATE_UNMANAGED_SNAP:
+ case POOL_OP_DELETE_UNMANAGED_SNAP:
+ if (pool->is_pool_snaps_mode()) {
+ _pool_op_reply(m, -EINVAL, osdmap.get_epoch());
+ return false;
+ }
+ }
+
// projected pool info
pg_pool_t pp;
if (pending_inc.new_pools.count(m->pool))
@@ -3816,7 +4134,7 @@ bool OSDMonitor::prepare_pool_op(MPoolOp *m)
goto out;
}
}
-
+
switch (m->op) {
case POOL_OP_CREATE_SNAP:
if (!pp.snap_exists(m->name.c_str())) {
@@ -3871,8 +4189,7 @@ bool OSDMonitor::prepare_pool_op(MPoolOp *m)
out:
wait_for_finished_proposal(new OSDMonitor::C_PoolOp(this, m, ret, pending_inc.epoch, &reply_data));
- propose_pending();
- return false;
+ return true;
}
bool OSDMonitor::prepare_pool_op_create(MPoolOp *m)
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h
index 04f7cf5b196..439c8435055 100644
--- a/src/mon/OSDMonitor.h
+++ b/src/mon/OSDMonitor.h
@@ -235,7 +235,8 @@ private:
bool prepare_pool_op_create (MPoolOp *m);
bool prepare_pool_op_delete(MPoolOp *m);
int prepare_new_pool(string& name, uint64_t auid, int crush_rule,
- unsigned pg_num, unsigned pgp_num);
+ unsigned pg_num, unsigned pgp_num,
+ const vector<string> &properties);
int prepare_new_pool(MPoolOp *m);
void update_pool_flags(int64_t pool_id, uint64_t flags);
@@ -323,6 +324,9 @@ private:
bool preprocess_command(MMonCommand *m);
bool prepare_command(MMonCommand *m);
+ int prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
+ stringstream& ss);
+
void handle_osd_timeouts(const utime_t &now,
std::map<int,utime_t> &last_osd_report);
void mark_all_down();
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc
index e9a35c6b8ab..ea70bbd61c3 100644
--- a/src/mon/PGMap.cc
+++ b/src/mon/PGMap.cc
@@ -30,7 +30,7 @@ void PGMap::Incremental::encode(bufferlist &bl, uint64_t features) const
return;
}
- ENCODE_START(6, 5, bl);
+ ENCODE_START(7, 5, bl);
::encode(version, bl);
::encode(pg_stat_updates, bl);
::encode(osd_stat_updates, bl);
@@ -41,6 +41,7 @@ void PGMap::Incremental::encode(bufferlist &bl, uint64_t features) const
::encode(nearfull_ratio, bl);
::encode(pg_remove, bl);
::encode(stamp, bl);
+ ::encode(osd_epochs, bl);
ENCODE_FINISH(bl);
}
@@ -89,6 +90,17 @@ void PGMap::Incremental::decode(bufferlist::iterator &bl)
}
if (struct_v >= 6)
::decode(stamp, bl);
+ if (struct_v >= 7) {
+ ::decode(osd_epochs, bl);
+ } else {
+ for (map<int32_t, osd_stat_t>::iterator i = osd_stat_updates.begin();
+ i != osd_stat_updates.end();
+ ++i) {
+ // This isn't accurate, but will cause trimming to behave like
+ // previously.
+ osd_epochs.insert(make_pair(i->first, osdmap_epoch));
+ }
+ }
DECODE_FINISH(bl);
}
@@ -140,6 +152,7 @@ void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
o.back()->version = 2;
o.back()->pg_stat_updates[pg_t(1,2,3)] = pg_stat_t();
o.back()->osd_stat_updates[5] = osd_stat_t();
+ o.back()->osd_epochs[5] = 12;
o.push_back(new Incremental);
o.back()->version = 3;
o.back()->osdmap_epoch = 1;
@@ -148,6 +161,7 @@ void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
o.back()->nearfull_ratio = .3;
o.back()->pg_stat_updates[pg_t(4,5,6)] = pg_stat_t();
o.back()->osd_stat_updates[6] = osd_stat_t();
+ o.back()->osd_epochs[6] = 12;
o.back()->pg_remove.insert(pg_t(1,2,3));
o.back()->osd_stat_rm.insert(5);
}
@@ -195,8 +209,10 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
}
stat_pg_add(update_pg, update_stat);
}
- for (map<int32_t,osd_stat_t>::const_iterator p = inc.osd_stat_updates.begin();
- p != inc.osd_stat_updates.end();
+ assert(osd_stat.size() == osd_epochs.size());
+ for (map<int32_t,osd_stat_t>::const_iterator p =
+ inc.get_osd_stat_updates().begin();
+ p != inc.get_osd_stat_updates().end();
++p) {
int osd = p->first;
const osd_stat_t &new_stats(p->second);
@@ -209,6 +225,8 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
stat_osd_sub(t->second);
t->second = new_stats;
}
+ assert(inc.get_osd_epochs().find(osd) != inc.get_osd_epochs().end());
+ osd_epochs[osd] = inc.get_osd_epochs().find(osd)->second; // assign rather than insert(): insert() would keep an already-stored (stale) epoch
stat_osd_add(new_stats);
@@ -226,8 +244,8 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
}
}
- for (set<int>::iterator p = inc.osd_stat_rm.begin();
- p != inc.osd_stat_rm.end();
+ for (set<int>::iterator p = inc.get_osd_stat_rm().begin();
+ p != inc.get_osd_stat_rm().end();
++p) {
hash_map<int32_t,osd_stat_t>::iterator t = osd_stat.find(*p);
if (t != osd_stat.end()) {
@@ -416,6 +434,14 @@ epoch_t PGMap::calc_min_last_epoch_clean() const
if (lec < min)
min = lec;
}
+ // also scan osd epochs
+ // don't trim past the oldest reported osd epoch
+ for (hash_map<int32_t, epoch_t>::const_iterator i = osd_epochs.begin();
+ i != osd_epochs.end();
+ ++i) {
+ if (i->second < min)
+ min = i->second;
+ }
return min;
}
@@ -434,7 +460,7 @@ void PGMap::encode(bufferlist &bl, uint64_t features) const
return;
}
- ENCODE_START(5, 4, bl);
+ ENCODE_START(6, 4, bl);
::encode(version, bl);
::encode(pg_stat, bl);
::encode(osd_stat, bl);
@@ -443,6 +469,7 @@ void PGMap::encode(bufferlist &bl, uint64_t features) const
::encode(full_ratio, bl);
::encode(nearfull_ratio, bl);
::encode(stamp, bl);
+ ::encode(osd_epochs, bl);
ENCODE_FINISH(bl);
}
@@ -472,6 +499,17 @@ void PGMap::decode(bufferlist::iterator &bl)
}
if (struct_v >= 5)
::decode(stamp, bl);
+ if (struct_v >= 6) {
+ ::decode(osd_epochs, bl);
+ } else {
+ for (hash_map<int32_t, osd_stat_t>::iterator i = osd_stat.begin();
+ i != osd_stat.end();
+ ++i) {
+ // This isn't accurate, but will cause trimming to behave like
+ // previously.
+ osd_epochs.insert(make_pair(i->first, last_osdmap_epoch));
+ }
+ }
DECODE_FINISH(bl);
calc_stats();
@@ -488,7 +526,10 @@ void PGMap::dirty_all(Incremental& inc)
inc.pg_stat_updates[p->first] = p->second;
}
for (hash_map<int32_t, osd_stat_t>::const_iterator p = osd_stat.begin(); p != osd_stat.end(); ++p) {
- inc.osd_stat_updates[p->first] = p->second;
+ assert(osd_epochs.count(p->first)); // the epoch comes from this PGMap, not from the incremental being filled
+ inc.update_stat(p->first,
+ osd_epochs.find(p->first)->second,
+ p->second);
}
}
@@ -701,7 +742,8 @@ void PGMap::dump_stuck_plain(ostream& ss, PGMap::StuckPG type, utime_t cutoff) c
{
hash_map<pg_t, pg_stat_t> stuck_pg_stats;
get_stuck_stats(type, cutoff, stuck_pg_stats);
- dump_pg_stats_plain(ss, stuck_pg_stats);
+ if (!stuck_pg_stats.empty())
+ dump_pg_stats_plain(ss, stuck_pg_stats);
}
void PGMap::dump_osd_perf_stats(Formatter *f) const
diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h
index 84d89f87517..7a202fc0006 100644
--- a/src/mon/PGMap.h
+++ b/src/mon/PGMap.h
@@ -43,12 +43,13 @@ public:
float full_ratio;
float nearfull_ratio;
+ // mapping of osd to most recently reported osdmap epoch
+ hash_map<int32_t,epoch_t> osd_epochs;
+
class Incremental {
public:
version_t version;
map<pg_t,pg_stat_t> pg_stat_updates;
- map<int32_t,osd_stat_t> osd_stat_updates;
- set<int32_t> osd_stat_rm;
epoch_t osdmap_epoch;
epoch_t pg_scan; // osdmap epoch
set<pg_t> pg_remove;
@@ -56,6 +57,38 @@ public:
float nearfull_ratio;
utime_t stamp;
+ private:
+ map<int32_t,osd_stat_t> osd_stat_updates;
+ set<int32_t> osd_stat_rm;
+
+ // mapping of osd to most recently reported osdmap epoch
+ map<int32_t,epoch_t> osd_epochs;
+ public:
+
+ const map<int32_t, osd_stat_t> &get_osd_stat_updates() const {
+ return osd_stat_updates;
+ }
+ const set<int32_t> &get_osd_stat_rm() const {
+ return osd_stat_rm;
+ }
+ const map<int32_t, epoch_t> &get_osd_epochs() const {
+ return osd_epochs;
+ }
+
+ void update_stat(int32_t osd, epoch_t epoch, const osd_stat_t &stat) {
+ osd_stat_updates[osd] = stat;
+ osd_epochs[osd] = epoch;
+ assert(osd_epochs.size() == osd_stat_updates.size());
+ }
+ void stat_osd_out(int32_t osd) {
+ // 0 the stats for the osd. NOTE(review): unlike update_stat(), no osd_epochs entry is recorded here -- confirm the size/lookup asserts elsewhere tolerate that
+ osd_stat_updates[osd] = osd_stat_t();
+ }
+ void rm_stat(int32_t osd) {
+ osd_stat_rm.insert(osd);
+ osd_epochs.erase(osd);
+ osd_stat_updates.erase(osd);
+ }
void encode(bufferlist &bl, uint64_t features=-1) const;
void decode(bufferlist::iterator &bl);
void dump(Formatter *f) const;
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index ff2d1fe4947..0644922ddb4 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -494,15 +494,19 @@ void PGMonitor::encode_pending(MonitorDBStore::Transaction *t)
{
bufferlist dirty;
string prefix = pgmap_osd_prefix;
- for (map<int32_t,osd_stat_t>::const_iterator p = pending_inc.osd_stat_updates.begin();
- p != pending_inc.osd_stat_updates.end();
+ for (map<int32_t,osd_stat_t>::const_iterator p =
+ pending_inc.get_osd_stat_updates().begin();
+ p != pending_inc.get_osd_stat_updates().end();
++p) {
::encode(p->first, dirty);
bufferlist bl;
::encode(p->second, bl, features);
t->put(prefix, stringify(p->first), bl);
}
- for (set<int32_t>::const_iterator p = pending_inc.osd_stat_rm.begin(); p != pending_inc.osd_stat_rm.end(); ++p) {
+ for (set<int32_t>::const_iterator p =
+ pending_inc.get_osd_stat_rm().begin();
+ p != pending_inc.get_osd_stat_rm().end();
+ ++p) {
::encode(*p, dirty);
t->erase(prefix, stringify(*p));
}
@@ -725,7 +729,11 @@ bool PGMonitor::prepare_pg_stats(MPGStats *stats)
}
// osd stat
- pending_inc.osd_stat_updates[from] = stats->osd_stat;
+ if (mon->osdmon()->osdmap.is_in(from)) {
+ pending_inc.update_stat(from, stats->epoch, stats->osd_stat);
+ } else {
+ pending_inc.update_stat(from, stats->epoch, osd_stat_t());
+ }
if (pg_map.osd_stat.count(from))
dout(10) << " got osd." << from << " " << stats->osd_stat << " (was " << pg_map.osd_stat[from] << ")" << dendl;
@@ -842,11 +850,7 @@ void PGMonitor::check_osd_map(epoch_t epoch)
++p)
if (p->second == CEPH_OSD_OUT) {
dout(10) << "check_osd_map osd." << p->first << " went OUT" << dendl;
- pending_inc.osd_stat_rm.insert(p->first);
- } else {
- dout(10) << "check_osd_map osd." << p->first << " is IN" << dendl;
- pending_inc.osd_stat_rm.erase(p->first);
- pending_inc.osd_stat_updates[p->first];
+ pending_inc.stat_osd_out(p->first);
}
// this is conservative: we want to know if any osds (maybe) got marked down.
@@ -867,7 +871,7 @@ void PGMonitor::check_osd_map(epoch_t epoch)
// whether it was created *or* destroyed, we can safely drop
// it's osd_stat_t record.
dout(10) << "check_osd_map osd." << p->first << " created or destroyed" << dendl;
- pending_inc.osd_stat_rm.insert(p->first);
+ pending_inc.rm_stat(p->first);
// and adjust full, nearfull set
pg_map.nearfull_osds.erase(p->first);
@@ -1428,7 +1432,6 @@ bool PGMonitor::preprocess_command(MMonCommand *m)
cmd_getval(g_ceph_context, cmdmap, "threshold", threshold,
int64_t(g_conf->mon_pg_stuck_threshold));
- boost::scoped_ptr<Formatter> f(new_formatter("json"));
r = dump_stuck_pg_stats(ds, f.get(), (int)threshold, stuckop_vec);
ss << "ok";
r = 0;
@@ -1848,6 +1851,54 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
detail->push_back(make_pair(HEALTH_ERR, ss.str()));
}
}
+
+ // pg skew
+ int num_in = mon->osdmon()->osdmap.get_num_in_osds();
+ if (num_in && g_conf->mon_pg_warn_min_per_osd > 0) {
+ int per = pg_map.pg_stat.size() / num_in;
+ if (per < g_conf->mon_pg_warn_min_per_osd) {
+ ostringstream ss;
+ ss << "too few pgs per osd (" << per << " < min " << g_conf->mon_pg_warn_min_per_osd << ")";
+ summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+ if (detail)
+ detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+ }
+ }
+ if (!pg_map.pg_stat.empty()) {
+ for (hash_map<int,pool_stat_t>::const_iterator p = pg_map.pg_pool_sum.begin();
+ p != pg_map.pg_pool_sum.end();
+ ++p) {
+ const pg_pool_t *pi = mon->osdmon()->osdmap.get_pg_pool(p->first);
+ if (!pi)
+ continue; // in case osdmap changes haven't propagated to PGMap yet
+ if (pi->get_pg_num() > pi->get_pgp_num()) {
+ ostringstream ss;
+ ss << "pool " << mon->osdmon()->osdmap.get_pool_name(p->first) << " pg_num "
+ << pi->get_pg_num() << " > pgp_num " << pi->get_pgp_num();
+ summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+ if (detail)
+ detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+ }
+ int average_objects_per_pg = pg_map.pg_sum.stats.sum.num_objects / pg_map.pg_stat.size();
+ if (average_objects_per_pg > 0) {
+ int objects_per_pg = p->second.stats.sum.num_objects / pi->get_pg_num();
+ float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
+ if (g_conf->mon_pg_warn_max_object_skew > 0 &&
+ ratio > g_conf->mon_pg_warn_max_object_skew) {
+ ostringstream ss;
+ ss << "pool " << mon->osdmon()->osdmap.get_pool_name(p->first) << " has too few pgs";
+ summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+ if (detail) {
+ ostringstream ss;
+ ss << "pool " << mon->osdmon()->osdmap.get_pool_name(p->first) << " objects per pg ("
+ << objects_per_pg << ") is more than " << ratio << " times cluster average ("
+ << average_objects_per_pg << ")";
+ detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+ }
+ }
+ }
+ }
+ }
}
void PGMonitor::check_full_osd_health(list<pair<health_status_t,string> >& summary,
diff --git a/src/mon/PGMonitor.h b/src/mon/PGMonitor.h
index 44015395e94..d29f47c1c43 100644
--- a/src/mon/PGMonitor.h
+++ b/src/mon/PGMonitor.h
@@ -28,6 +28,7 @@ using namespace std;
#include "PaxosService.h"
#include "include/types.h"
#include "include/utime.h"
+#include "include/histogram.h"
#include "msg/Messenger.h"
#include "common/config.h"
#include "mon/MonitorDBStore.h"
diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc
index 016686aae06..495268ff9ee 100644
--- a/src/mon/Paxos.cc
+++ b/src/mon/Paxos.cc
@@ -328,6 +328,15 @@ bool Paxos::store_state(MMonPaxos *m)
// apply.
decode_append_transaction(t, it->second);
}
+
+ // discard obsolete uncommitted value?
+ if (uncommitted_v && uncommitted_v <= last_committed) {
+ dout(10) << " forgetting obsolete uncommitted value " << uncommitted_v
+ << " pn " << uncommitted_pn << dendl;
+ uncommitted_v = 0;
+ uncommitted_pn = 0;
+ uncommitted_value.clear();
+ }
}
if (!t.empty()) {
dout(30) << __func__ << " transaction dump:\n";
@@ -425,7 +434,7 @@ void Paxos::handle_last(MMonPaxos *last)
// did this person send back an accepted but uncommitted value?
if (last->uncommitted_pn) {
- if (last->uncommitted_pn > uncommitted_pn &&
+ if (last->uncommitted_pn >= uncommitted_pn &&
last->last_committed >= last_committed &&
last->last_committed + 1 >= uncommitted_v) {
uncommitted_v = last->last_committed+1;
diff --git a/src/mon/Paxos.h b/src/mon/Paxos.h
index 1bd3a3c1f51..764c4fee404 100644
--- a/src/mon/Paxos.h
+++ b/src/mon/Paxos.h
@@ -131,7 +131,7 @@ class Paxos;
* This libary is based on the Paxos algorithm, but varies in a few key ways:
* 1- Only a single new value is generated at a time, simplifying the recovery logic.
* 2- Nodes track "committed" values, and share them generously (and trustingly)
- * 3- A 'leasing' mechism is built-in, allowing nodes to determine when it is
+ * 3- A 'leasing' mechanism is built-in, allowing nodes to determine when it is
* safe to "read" their copy of the last committed value.
*
* This provides a simple replication substrate that services can be built on top of.
@@ -325,8 +325,7 @@ private:
*
* Instead of performing a full commit each time a read is requested, we
* keep leases. Each lease will have an expiration date, which may or may
- * not be extended. This member variable will keep when is the lease
- * expiring.
+ * not be extended.
*/
utime_t lease_expire;
/**
@@ -1091,7 +1090,7 @@ public:
* onto paxos-related keys), and then we will decode those same bufferlists
* we just wrote and apply the transactions they hold. We will also update
* our first and last committed values to point to the new values, if need
- * be. All all this is done tightly wrapped in a transaction to ensure we
+ * be. All this is done tightly wrapped in a transaction to ensure we
* enjoy the atomicity guarantees given by our awesome k/v store.
*
* @param m A message
diff --git a/src/mon/mon_types.h b/src/mon/mon_types.h
index 0eae3b172bf..0ae1aaf8d5e 100644
--- a/src/mon/mon_types.h
+++ b/src/mon/mon_types.h
@@ -40,6 +40,52 @@ inline const char *get_paxos_name(int p) {
#define CEPH_MON_ONDISK_MAGIC "ceph mon volume v012"
+/**
+ * leveldb store stats
+ *
+ * If we ever decide to support multiple backends for the monitor store,
+ * we should then create an abstract class 'MonitorStoreStats' of sorts
+ * and inherit it on LevelDBStoreStats. I'm sure you'll figure something
+ * out.
+ */
+struct LevelDBStoreStats {
+ uint64_t bytes_total;
+ uint64_t bytes_sst;
+ uint64_t bytes_log;
+ uint64_t bytes_misc;
+ utime_t last_update;
+ LevelDBStoreStats() : bytes_total(0), bytes_sst(0), bytes_log(0), bytes_misc(0) {} // zero the counters so an instance that was never decoded does not dump indeterminate values
+ void dump(Formatter *f) const { // write every field to the formatter; f must not be NULL
+ assert(f != NULL);
+ f->dump_int("bytes_total", bytes_total);
+ f->dump_int("bytes_sst", bytes_sst);
+ f->dump_int("bytes_log", bytes_log);
+ f->dump_int("bytes_misc", bytes_misc);
+ f->dump_stream("last_updated") << last_update;
+ }
+
+ void encode(bufferlist &bl) const { // encoding version 1: fields in declaration order
+ ENCODE_START(1, 1, bl);
+ ::encode(bytes_total, bl);
+ ::encode(bytes_sst, bl);
+ ::encode(bytes_log, bl);
+ ::encode(bytes_misc, bl);
+ ::encode(last_update, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::iterator &p) { // reads the fields in the same order encode() writes them
+ DECODE_START(1, p);
+ ::decode(bytes_total, p);
+ ::decode(bytes_sst, p);
+ ::decode(bytes_log, p);
+ ::decode(bytes_misc, p);
+ ::decode(last_update, p);
+ DECODE_FINISH(p);
+ }
+};
+WRITE_CLASS_ENCODER(LevelDBStoreStats);
+
// data stats
struct DataStats {
@@ -50,13 +96,29 @@ struct DataStats {
int latest_avail_percent;
utime_t last_update;
+ LevelDBStoreStats store_stats;
+
+ void dump(Formatter *f) const {
+ assert(f != NULL);
+ f->dump_int("kb_total", kb_total);
+ f->dump_int("kb_used", kb_used);
+ f->dump_int("kb_avail", kb_avail);
+ f->dump_int("avail_percent", latest_avail_percent);
+ f->dump_stream("last_updated") << last_update;
+
+ f->open_object_section("store_stats");
+ store_stats.dump(f);
+ f->close_section();
+ }
+
void encode(bufferlist &bl) const {
- ENCODE_START(1, 1, bl);
+ ENCODE_START(2, 1, bl);
::encode(kb_total, bl);
::encode(kb_used, bl);
::encode(kb_avail, bl);
::encode(latest_avail_percent, bl);
::encode(last_update, bl);
+ ::encode(store_stats, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator &p) {
@@ -66,10 +128,12 @@ struct DataStats {
::decode(kb_avail, p);
::decode(latest_avail_percent, p);
::decode(last_update, p);
+ if (struct_v > 1)
+ ::decode(store_stats, p);
+
DECODE_FINISH(p);
}
};
-
WRITE_CLASS_ENCODER(DataStats);
struct ScrubResult {
diff --git a/src/msg/Makefile.am b/src/msg/Makefile.am
new file mode 100644
index 00000000000..a849a1ca26a
--- /dev/null
+++ b/src/msg/Makefile.am
@@ -0,0 +1,20 @@
+libmsg_la_SOURCES = \
+ msg/Accepter.cc \
+ msg/DispatchQueue.cc \
+ msg/Message.cc \
+ msg/Messenger.cc \
+ msg/Pipe.cc \
+ msg/SimpleMessenger.cc \
+ msg/msg_types.cc
+
+noinst_HEADERS += \
+ msg/Accepter.h \
+ msg/DispatchQueue.h \
+ msg/Dispatcher.h \
+ msg/Message.h \
+ msg/Messenger.h \
+ msg/Pipe.h \
+ msg/SimpleMessenger.h \
+ msg/msg_types.h
+
+noinst_LTLIBRARIES += libmsg.la
diff --git a/src/msg/Pipe.cc b/src/msg/Pipe.cc
index 50656fee53b..66b64d0097a 100644
--- a/src/msg/Pipe.cc
+++ b/src/msg/Pipe.cc
@@ -1136,6 +1136,19 @@ void Pipe::unregister_pipe()
}
}
+void Pipe::join() // join this pipe's writer, reader and (if present) delay threads
+{
+ ldout(msgr->cct, 20) << "join" << dendl;
+ if (writer_thread.is_started()) // only join a thread that was actually started
+ writer_thread.join();
+ if (reader_thread.is_started())
+ reader_thread.join();
+ if (delay_thread) { // delay_thread may be null; when set it is stopped first, then joined
+ ldout(msgr->cct, 20) << "joining delay_thread" << dendl;
+ delay_thread->stop();
+ delay_thread->join();
+ }
+}
void Pipe::requeue_sent()
{
diff --git a/src/msg/Pipe.h b/src/msg/Pipe.h
index 5f94305350c..6c91395a352 100644
--- a/src/msg/Pipe.h
+++ b/src/msg/Pipe.h
@@ -234,16 +234,7 @@ class DispatchQueue;
void register_pipe();
void unregister_pipe();
- void join() {
- if (writer_thread.is_started())
- writer_thread.join();
- if (reader_thread.is_started())
- reader_thread.join();
- if (delay_thread) {
- delay_thread->stop();
- delay_thread->join();
- }
- }
+ void join();
void stop();
void _send(Message *m) {
diff --git a/src/msg/msg_types.cc b/src/msg/msg_types.cc
index 38416abd4f2..b02db768bfb 100644
--- a/src/msg/msg_types.cc
+++ b/src/msg/msg_types.cc
@@ -135,7 +135,7 @@ bool entity_addr_t::parse(const char *s, const char **end)
ostream& operator<<(ostream& out, const sockaddr_storage &ss)
{
char buf[NI_MAXHOST] = { 0 };
- char serv[20] = { 0 };
+ char serv[NI_MAXSERV] = { 0 };
size_t hostlen;
if (ss.ss_family == AF_INET)
diff --git a/src/objclass/class_api.cc b/src/objclass/class_api.cc
index 6e8de53467f..bb26c752f9b 100644
--- a/src/objclass/class_api.cc
+++ b/src/objclass/class_api.cc
@@ -177,7 +177,7 @@ int cls_read(cls_method_context_t hctx, int ofs, int len,
int cls_get_request_origin(cls_method_context_t hctx, entity_inst_t *origin)
{
ReplicatedPG::OpContext **pctx = static_cast<ReplicatedPG::OpContext **>(hctx);
- *origin = (*pctx)->op->request->get_orig_source_inst();
+ *origin = (*pctx)->op->get_req()->get_orig_source_inst();
return 0;
}
@@ -582,7 +582,7 @@ uint64_t cls_current_version(cls_method_context_t hctx)
{
ReplicatedPG::OpContext *ctx = *(ReplicatedPG::OpContext **)hctx;
- return ctx->at_version.version;
+ return ctx->pg->info.last_user_version;
}
diff --git a/src/objsync/boto_del.py b/src/objsync/boto_del.py
index 14e790544ec..ba512e1ca33 100755
--- a/src/objsync/boto_del.py
+++ b/src/objsync/boto_del.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
#
# Ceph - scalable distributed file system
diff --git a/src/os/BtrfsFileStoreBackend.cc b/src/os/BtrfsFileStoreBackend.cc
index 9fa96babab7..bb11a45e10b 100644
--- a/src/os/BtrfsFileStoreBackend.cc
+++ b/src/os/BtrfsFileStoreBackend.cc
@@ -12,7 +12,9 @@
*
*/
-#include <inttypes.h>
+#include "include/int_types.h"
+#include "include/types.h"
+
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
@@ -22,7 +24,6 @@
#include <sys/ioctl.h>
#include "include/compat.h"
#include "include/linux_fiemap.h"
-#include "include/types.h"
#include "include/color.h"
#include "include/buffer.h"
#include "include/assert.h"
@@ -320,8 +321,9 @@ int BtrfsFileStoreBackend::list_checkpoints(list<string>& ls)
list<string> snaps;
char path[PATH_MAX];
- struct dirent buf, *de;
- while (::readdir_r(dir, &buf, &de) == 0) {
+ char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
+ struct dirent *de;
+ while (::readdir_r(dir, (struct dirent *)&buf, &de) == 0) {
if (!de)
break;
diff --git a/src/os/CollectionIndex.h b/src/os/CollectionIndex.h
index 9b1ceae8c46..89b7b862632 100644
--- a/src/os/CollectionIndex.h
+++ b/src/os/CollectionIndex.h
@@ -23,7 +23,7 @@
#include "include/object.h"
/**
- * CollectionIndex provides an interface for manipulating indexed colelctions
+ * CollectionIndex provides an interface for manipulating indexed collections
*/
class CollectionIndex {
protected:
@@ -127,26 +127,26 @@ protected:
* @return Error Code, 0 for success
*/
virtual int created(
- const hobject_t &hoid, ///< [in] Created object.
+ const ghobject_t &oid, ///< [in] Created object.
const char *path ///< [in] Path to created object.
) = 0;
/**
- * Removes hoid from the collection
+ * Removes oid from the collection
*
* @return Error Code, 0 for success
*/
virtual int unlink(
- const hobject_t &hoid ///< [in] Object to remove
+ const ghobject_t &oid ///< [in] Object to remove
) = 0;
/**
- * Gets the IndexedPath for hoid.
+ * Gets the IndexedPath for oid.
*
* @return Error Code, 0 for success
*/
virtual int lookup(
- const hobject_t &hoid, ///< [in] Object to lookup
+ const ghobject_t &oid, ///< [in] Object to lookup
IndexedPath *path, ///< [out] Path to object
int *exist ///< [out] True if the object exists, else false
) = 0;
@@ -167,17 +167,17 @@ protected:
/// List contents of collection by hash
virtual int collection_list_partial(
- const hobject_t &start, ///< [in] object at which to start
+ const ghobject_t &start, ///< [in] object at which to start
int min_count, ///< [in] get at least min_count objects
int max_count, ///< [in] return at most max_count objects
snapid_t seq, ///< [in] list only objects with snap >= seq
- vector<hobject_t> *ls, ///< [out] Listed objects
- hobject_t *next ///< [out] Next object to list
+ vector<ghobject_t> *ls, ///< [out] Listed objects
+ ghobject_t *next ///< [out] Next object to list
) = 0;
/// List contents of collection.
virtual int collection_list(
- vector<hobject_t> *ls ///< [out] Listed Objects
+ vector<ghobject_t> *ls ///< [out] Listed Objects
) = 0;
/// Call prior to removing directory
diff --git a/src/os/DBObjectMap.cc b/src/os/DBObjectMap.cc
index 5142f4d7420..635870b0db5 100644
--- a/src/os/DBObjectMap.cc
+++ b/src/os/DBObjectMap.cc
@@ -1,13 +1,13 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-#include <iostream>
-#include <inttypes.h>
+#include "include/int_types.h"
#include "include/buffer.h"
+
+#include <iostream>
#include <set>
#include <map>
#include <string>
#include <tr1/memory>
-
#include <vector>
#include "ObjectMap.h"
@@ -130,61 +130,68 @@ bool DBObjectMap::check(std::ostream &out)
return retval;
}
-string DBObjectMap::hobject_key(const hobject_t &hoid)
+string DBObjectMap::ghobject_key(const ghobject_t &oid)
{
string out;
- append_escaped(hoid.oid.name, &out);
+ append_escaped(oid.hobj.oid.name, &out);
out.push_back('.');
- append_escaped(hoid.get_key(), &out);
+ append_escaped(oid.hobj.get_key(), &out);
out.push_back('.');
- append_escaped(hoid.nspace, &out);
+ append_escaped(oid.hobj.nspace, &out);
out.push_back('.');
char snap_with_hash[1000];
char *t = snap_with_hash;
char *end = t + sizeof(snap_with_hash);
- if (hoid.snap == CEPH_NOSNAP)
+ if (oid.hobj.snap == CEPH_NOSNAP)
t += snprintf(t, end - t, "head");
- else if (hoid.snap == CEPH_SNAPDIR)
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
t += snprintf(t, end - t, "snapdir");
else
- t += snprintf(t, end - t, "%llx", (long long unsigned)hoid.snap);
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap);
- if (hoid.pool == -1)
+ if (oid.hobj.pool == -1)
t += snprintf(t, end - t, ".none");
else
- t += snprintf(t, end - t, ".%llx", (long long unsigned)hoid.pool);
- snprintf(t, end - t, ".%.*X", (int)(sizeof(hoid.hash)*2), hoid.hash);
+ t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.hobj.pool);
+ snprintf(t, end - t, ".%.*X", (int)(sizeof(oid.hobj.hash)*2), oid.hobj.hash);
+
+ if (oid.generation != ghobject_t::NO_GEN) {
+ assert(oid.shard_id != ghobject_t::NO_SHARD);
+
+ t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.generation);
+ t += snprintf(t, end - t, ".%x", (int)oid.shard_id);
+ }
out += string(snap_with_hash);
return out;
}
-string DBObjectMap::hobject_key_v0(coll_t c, const hobject_t &hoid)
+string DBObjectMap::ghobject_key_v0(coll_t c, const ghobject_t &oid)
{
string out;
append_escaped(c.to_str(), &out);
out.push_back('.');
- append_escaped(hoid.oid.name, &out);
+ append_escaped(oid.hobj.oid.name, &out);
out.push_back('.');
- append_escaped(hoid.get_key(), &out);
+ append_escaped(oid.hobj.get_key(), &out);
out.push_back('.');
char snap_with_hash[1000];
char *t = snap_with_hash;
char *end = t + sizeof(snap_with_hash);
- if (hoid.snap == CEPH_NOSNAP)
+ if (oid.hobj.snap == CEPH_NOSNAP)
t += snprintf(t, end - t, ".head");
- else if (hoid.snap == CEPH_SNAPDIR)
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
t += snprintf(t, end - t, ".snapdir");
else
- t += snprintf(t, end - t, ".%llx", (long long unsigned)hoid.snap);
- snprintf(t, end - t, ".%.*X", (int)(sizeof(hoid.hash)*2), hoid.hash);
+ t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.hobj.snap);
+ snprintf(t, end - t, ".%.*X", (int)(sizeof(oid.hobj.hash)*2), oid.hobj.hash);
out += string(snap_with_hash);
return out;
}
-bool DBObjectMap::parse_hobject_key_v0(const string &in, coll_t *c,
- hobject_t *hoid)
+bool DBObjectMap::parse_ghobject_key_v0(const string &in, coll_t *c,
+ ghobject_t *oid)
{
string coll;
string name;
@@ -244,13 +251,13 @@ bool DBObjectMap::parse_hobject_key_v0(const string &in, coll_t *c,
pg_t pg;
if (c->is_pg_prefix(pg))
pool = (int64_t)pg.pool();
- (*hoid) = hobject_t(name, key, snap, hash, pool, "");
+ (*oid) = ghobject_t(hobject_t(name, key, snap, hash, pool, ""));
return true;
}
-string DBObjectMap::map_header_key(const hobject_t &hoid)
+string DBObjectMap::map_header_key(const ghobject_t &oid)
{
- return hobject_key(hoid);
+ return ghobject_key(oid);
}
string DBObjectMap::header_key(uint64_t seq)
@@ -311,9 +318,9 @@ int DBObjectMap::DBObjectMapIteratorImpl::init()
}
ObjectMap::ObjectMapIterator DBObjectMap::get_iterator(
- const hobject_t &hoid)
+ const ghobject_t &oid)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return ObjectMapIterator(new EmptyIteratorImpl());
return _get_iterator(header);
@@ -496,15 +503,15 @@ int DBObjectMap::DBObjectMapIteratorImpl::status()
return r;
}
-int DBObjectMap::set_keys(const hobject_t &hoid,
+int DBObjectMap::set_keys(const ghobject_t &oid,
const map<string, bufferlist> &set,
const SequencerPosition *spos)
{
KeyValueDB::Transaction t = db->get_transaction();
- Header header = lookup_create_map_header(hoid, t);
+ Header header = lookup_create_map_header(oid, t);
if (!header)
return -EINVAL;
- if (check_spos(hoid, header, spos))
+ if (check_spos(oid, header, spos))
return 0;
t->set(user_prefix(header), set);
@@ -512,15 +519,15 @@ int DBObjectMap::set_keys(const hobject_t &hoid,
return db->submit_transaction(t);
}
-int DBObjectMap::set_header(const hobject_t &hoid,
+int DBObjectMap::set_header(const ghobject_t &oid,
const bufferlist &bl,
const SequencerPosition *spos)
{
KeyValueDB::Transaction t = db->get_transaction();
- Header header = lookup_create_map_header(hoid, t);
+ Header header = lookup_create_map_header(oid, t);
if (!header)
return -EINVAL;
- if (check_spos(hoid, header, spos))
+ if (check_spos(oid, header, spos))
return 0;
_set_header(header, bl, t);
return db->submit_transaction(t);
@@ -534,10 +541,10 @@ void DBObjectMap::_set_header(Header header, const bufferlist &bl,
t->set(sys_prefix(header), to_set);
}
-int DBObjectMap::get_header(const hobject_t &hoid,
+int DBObjectMap::get_header(const ghobject_t &oid,
bufferlist *bl)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header) {
return 0;
}
@@ -568,16 +575,16 @@ int DBObjectMap::_get_header(Header header,
return 0;
}
-int DBObjectMap::clear(const hobject_t &hoid,
+int DBObjectMap::clear(const ghobject_t &oid,
const SequencerPosition *spos)
{
KeyValueDB::Transaction t = db->get_transaction();
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
- if (check_spos(hoid, header, spos))
+ if (check_spos(oid, header, spos))
return 0;
- remove_map_header(hoid, header, t);
+ remove_map_header(oid, header, t);
assert(header->num_children > 0);
header->num_children--;
int r = _clear(header, t);
@@ -688,15 +695,15 @@ int DBObjectMap::need_parent(DBObjectMapIterator iter)
return 1;
}
-int DBObjectMap::rm_keys(const hobject_t &hoid,
+int DBObjectMap::rm_keys(const ghobject_t &oid,
const set<string> &to_clear,
const SequencerPosition *spos)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
KeyValueDB::Transaction t = db->get_transaction();
- if (check_spos(hoid, header, spos))
+ if (check_spos(oid, header, spos))
return 0;
t->rmkeys(user_prefix(header), to_clear);
if (!header->parent) {
@@ -756,17 +763,17 @@ int DBObjectMap::rm_keys(const hobject_t &hoid,
parent->num_children--;
_clear(parent, t);
header->parent = 0;
- set_map_header(hoid, *header, t);
+ set_map_header(oid, *header, t);
t->rmkeys_by_prefix(complete_prefix(header));
}
return db->submit_transaction(t);
}
-int DBObjectMap::get(const hobject_t &hoid,
+int DBObjectMap::get(const ghobject_t &oid,
bufferlist *_header,
map<string, bufferlist> *out)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
_get_header(header, _header);
@@ -779,13 +786,13 @@ int DBObjectMap::get(const hobject_t &hoid,
return 0;
}
-int DBObjectMap::get_keys(const hobject_t &hoid,
+int DBObjectMap::get_keys(const ghobject_t &oid,
set<string> *keys)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
- ObjectMapIterator iter = get_iterator(hoid);
+ ObjectMapIterator iter = get_iterator(oid);
for (; iter->valid(); iter->next()) {
if (iter->status())
return iter->status();
@@ -816,40 +823,40 @@ int DBObjectMap::scan(Header header,
return 0;
}
-int DBObjectMap::get_values(const hobject_t &hoid,
+int DBObjectMap::get_values(const ghobject_t &oid,
const set<string> &keys,
map<string, bufferlist> *out)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
return scan(header, keys, 0, out);
}
-int DBObjectMap::check_keys(const hobject_t &hoid,
+int DBObjectMap::check_keys(const ghobject_t &oid,
const set<string> &keys,
set<string> *out)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
return scan(header, keys, out, 0);
}
-int DBObjectMap::get_xattrs(const hobject_t &hoid,
+int DBObjectMap::get_xattrs(const ghobject_t &oid,
const set<string> &to_get,
map<string, bufferlist> *out)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
return db->get(xattr_prefix(header), to_get, out);
}
-int DBObjectMap::get_all_xattrs(const hobject_t &hoid,
+int DBObjectMap::get_all_xattrs(const ghobject_t &oid,
set<string> *out)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
KeyValueDB::Iterator iter = db->get_iterator(xattr_prefix(header));
@@ -860,39 +867,39 @@ int DBObjectMap::get_all_xattrs(const hobject_t &hoid,
return iter->status();
}
-int DBObjectMap::set_xattrs(const hobject_t &hoid,
+int DBObjectMap::set_xattrs(const ghobject_t &oid,
const map<string, bufferlist> &to_set,
const SequencerPosition *spos)
{
KeyValueDB::Transaction t = db->get_transaction();
- Header header = lookup_create_map_header(hoid, t);
+ Header header = lookup_create_map_header(oid, t);
if (!header)
return -EINVAL;
- if (check_spos(hoid, header, spos))
+ if (check_spos(oid, header, spos))
return 0;
t->set(xattr_prefix(header), to_set);
return db->submit_transaction(t);
}
-int DBObjectMap::remove_xattrs(const hobject_t &hoid,
+int DBObjectMap::remove_xattrs(const ghobject_t &oid,
const set<string> &to_remove,
const SequencerPosition *spos)
{
KeyValueDB::Transaction t = db->get_transaction();
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
- if (check_spos(hoid, header, spos))
+ if (check_spos(oid, header, spos))
return 0;
t->rmkeys(xattr_prefix(header), to_remove);
return db->submit_transaction(t);
}
-int DBObjectMap::clone(const hobject_t &hoid,
- const hobject_t &target,
+int DBObjectMap::clone(const ghobject_t &oid,
+ const ghobject_t &target,
const SequencerPosition *spos)
{
- if (hoid == target)
+ if (oid == target)
return 0;
KeyValueDB::Transaction t = db->get_transaction();
@@ -907,18 +914,18 @@ int DBObjectMap::clone(const hobject_t &hoid,
}
}
- Header parent = lookup_map_header(hoid);
+ Header parent = lookup_map_header(oid);
if (!parent)
return db->submit_transaction(t);
- Header source = generate_new_header(hoid, parent);
+ Header source = generate_new_header(oid, parent);
Header destination = generate_new_header(target, parent);
if (spos)
destination->spos = *spos;
parent->num_children = 2;
set_header(parent, t);
- set_map_header(hoid, *source, t);
+ set_map_header(oid, *source, t);
set_map_header(target, *destination, t);
map<string, bufferlist> to_set;
@@ -973,9 +980,9 @@ int DBObjectMap::upgrade()
to_get);
coll_t coll;
- hobject_t hoid;
- assert(parse_hobject_key_v0(iter->key(), &coll, &hoid));
- new_map_headers[hobject_key(hoid)] = got.begin()->second;
+ ghobject_t oid;
+ assert(parse_ghobject_key_v0(iter->key(), &coll, &oid));
+ new_map_headers[ghobject_key(oid)] = got.begin()->second;
}
t->rmkeys(LEAF_PREFIX, legacy_to_remove);
@@ -1038,18 +1045,18 @@ int DBObjectMap::init(bool do_upgrade)
return 0;
}
-int DBObjectMap::sync(const hobject_t *hoid,
+int DBObjectMap::sync(const ghobject_t *oid,
const SequencerPosition *spos) {
KeyValueDB::Transaction t = db->get_transaction();
write_state(t);
- if (hoid) {
+ if (oid) {
assert(spos);
- Header header = lookup_map_header(*hoid);
+ Header header = lookup_map_header(*oid);
if (header) {
- dout(10) << "hoid: " << *hoid << " setting spos to "
+ dout(10) << "oid: " << *oid << " setting spos to "
<< *spos << dendl;
header->spos = *spos;
- set_map_header(*hoid, *header, t);
+ set_map_header(*oid, *header, t);
}
}
return db->submit_transaction_sync(t);
@@ -1067,27 +1074,27 @@ int DBObjectMap::write_state(KeyValueDB::Transaction _t) {
}
-DBObjectMap::Header DBObjectMap::_lookup_map_header(const hobject_t &hoid)
+DBObjectMap::Header DBObjectMap::_lookup_map_header(const ghobject_t &oid)
{
- while (map_header_in_use.count(hoid))
+ while (map_header_in_use.count(oid))
header_cond.Wait(header_lock);
map<string, bufferlist> out;
set<string> to_get;
- to_get.insert(map_header_key(hoid));
+ to_get.insert(map_header_key(oid));
int r = db->get(HOBJECT_TO_SEQ, to_get, &out);
if (r < 0)
return Header();
if (out.empty())
return Header();
- Header ret(new _Header(), RemoveMapHeaderOnDelete(this, hoid));
+ Header ret(new _Header(), RemoveMapHeaderOnDelete(this, oid));
bufferlist::iterator iter = out.begin()->second.begin();
ret->decode(iter);
return ret;
}
-DBObjectMap::Header DBObjectMap::_generate_new_header(const hobject_t &hoid,
+DBObjectMap::Header DBObjectMap::_generate_new_header(const ghobject_t &oid,
Header parent)
{
Header header = Header(new _Header(), RemoveOnDelete(this));
@@ -1097,7 +1104,7 @@ DBObjectMap::Header DBObjectMap::_generate_new_header(const hobject_t &hoid,
header->spos = parent->spos;
}
header->num_children = 1;
- header->hoid = hoid;
+ header->oid = oid;
assert(!in_use.count(header->seq));
in_use.insert(header->seq);
@@ -1137,14 +1144,14 @@ DBObjectMap::Header DBObjectMap::lookup_parent(Header input)
}
DBObjectMap::Header DBObjectMap::lookup_create_map_header(
- const hobject_t &hoid,
+ const ghobject_t &oid,
KeyValueDB::Transaction t)
{
Mutex::Locker l(header_lock);
- Header header = _lookup_map_header(hoid);
+ Header header = _lookup_map_header(oid);
if (!header) {
- header = _generate_new_header(hoid, Header());
- set_map_header(hoid, *header, t);
+ header = _generate_new_header(oid, Header());
+ set_map_header(oid, *header, t);
}
return header;
}
@@ -1169,50 +1176,50 @@ void DBObjectMap::set_header(Header header, KeyValueDB::Transaction t)
t->set(sys_prefix(header), to_write);
}
-void DBObjectMap::remove_map_header(const hobject_t &hoid,
+void DBObjectMap::remove_map_header(const ghobject_t &oid,
Header header,
KeyValueDB::Transaction t)
{
dout(20) << "remove_map_header: removing " << header->seq
- << " hoid " << hoid << dendl;
+ << " oid " << oid << dendl;
set<string> to_remove;
- to_remove.insert(map_header_key(hoid));
+ to_remove.insert(map_header_key(oid));
t->rmkeys(HOBJECT_TO_SEQ, to_remove);
}
-void DBObjectMap::set_map_header(const hobject_t &hoid, _Header header,
+void DBObjectMap::set_map_header(const ghobject_t &oid, _Header header,
KeyValueDB::Transaction t)
{
dout(20) << "set_map_header: setting " << header.seq
- << " hoid " << hoid << " parent seq "
+ << " oid " << oid << " parent seq "
<< header.parent << dendl;
map<string, bufferlist> to_set;
- header.encode(to_set[map_header_key(hoid)]);
+ header.encode(to_set[map_header_key(oid)]);
t->set(HOBJECT_TO_SEQ, to_set);
}
-bool DBObjectMap::check_spos(const hobject_t &hoid,
+bool DBObjectMap::check_spos(const ghobject_t &oid,
Header header,
const SequencerPosition *spos)
{
if (!spos || *spos > header->spos) {
stringstream out;
if (spos)
- dout(10) << "hoid: " << hoid << " not skipping op, *spos "
+ dout(10) << "oid: " << oid << " not skipping op, *spos "
<< *spos << dendl;
else
- dout(10) << "hoid: " << hoid << " not skipping op, *spos "
+ dout(10) << "oid: " << oid << " not skipping op, *spos "
<< "empty" << dendl;
dout(10) << " > header.spos " << header->spos << dendl;
return false;
} else {
- dout(10) << "hoid: " << hoid << " skipping op, *spos " << *spos
+ dout(10) << "oid: " << oid << " skipping op, *spos " << *spos
<< " <= header.spos " << header->spos << dendl;
return true;
}
}
-int DBObjectMap::list_objects(vector<hobject_t> *out)
+int DBObjectMap::list_objects(vector<ghobject_t> *out)
{
KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
for (iter->seek_to_first(); iter->valid(); iter->next()) {
@@ -1220,7 +1227,7 @@ int DBObjectMap::list_objects(vector<hobject_t> *out)
bufferlist::iterator bliter = bl.begin();
_Header header;
header.decode(bliter);
- out->push_back(header.hoid);
+ out->push_back(header.oid);
}
return 0;
}
diff --git a/src/os/DBObjectMap.h b/src/os/DBObjectMap.h
index ba05dff6c6f..459447f9c97 100644
--- a/src/os/DBObjectMap.h
+++ b/src/os/DBObjectMap.h
@@ -26,7 +26,7 @@
* @see user_prefix
* @see sys_prefix
*
- * - HOBJECT_TO_SEQ: Contains leaf mapping from hobject_t->seq and
+ * - GHOBJECT_TO_SEQ: Contains leaf mapping from ghobject_t->hobj.seq and
* corresponding omap header
* - SYS_PREFIX: GLOBAL_STATE_KEY - contains next seq number
* @see State
@@ -66,89 +66,89 @@ public:
* Set of headers currently in use
*/
set<uint64_t> in_use;
- set<hobject_t> map_header_in_use;
+ set<ghobject_t> map_header_in_use;
DBObjectMap(KeyValueDB *db) : db(db),
header_lock("DBOBjectMap")
{}
int set_keys(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const map<string, bufferlist> &set,
const SequencerPosition *spos=0
);
int set_header(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const bufferlist &bl,
const SequencerPosition *spos=0
);
int get_header(
- const hobject_t &hoid,
+ const ghobject_t &oid,
bufferlist *bl
);
int clear(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const SequencerPosition *spos=0
);
int rm_keys(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const set<string> &to_clear,
const SequencerPosition *spos=0
);
int get(
- const hobject_t &hoid,
+ const ghobject_t &oid,
bufferlist *header,
map<string, bufferlist> *out
);
int get_keys(
- const hobject_t &hoid,
+ const ghobject_t &oid,
set<string> *keys
);
int get_values(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const set<string> &keys,
map<string, bufferlist> *out
);
int check_keys(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const set<string> &keys,
set<string> *out
);
int get_xattrs(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const set<string> &to_get,
map<string, bufferlist> *out
);
int get_all_xattrs(
- const hobject_t &hoid,
+ const ghobject_t &oid,
set<string> *out
);
int set_xattrs(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const map<string, bufferlist> &to_set,
const SequencerPosition *spos=0
);
int remove_xattrs(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const set<string> &to_remove,
const SequencerPosition *spos=0
);
int clone(
- const hobject_t &hoid,
- const hobject_t &target,
+ const ghobject_t &oid,
+ const ghobject_t &target,
const SequencerPosition *spos=0
);
@@ -162,13 +162,13 @@ public:
bool check(std::ostream &out);
/// Ensure that all previous operations are durable
- int sync(const hobject_t *hoid=0, const SequencerPosition *spos=0);
+ int sync(const ghobject_t *oid=0, const SequencerPosition *spos=0);
/// Util, list all objects, there must be no other concurrent access
- int list_objects(vector<hobject_t> *objs ///< [out] objects
+ int list_objects(vector<ghobject_t> *objs ///< [out] objects
);
- ObjectMapIterator get_iterator(const hobject_t &hoid);
+ ObjectMapIterator get_iterator(const ghobject_t &oid);
static const string USER_PREFIX;
static const string XATTR_PREFIX;
@@ -223,7 +223,7 @@ public:
uint64_t num_children;
coll_t c;
- hobject_t hoid;
+ ghobject_t oid;
SequencerPosition spos;
@@ -233,7 +233,7 @@ public:
::encode(parent, bl);
::encode(num_children, bl);
::encode(c, bl);
- ::encode(hoid, bl);
+ ::encode(oid, bl);
::encode(spos, bl);
ENCODE_FINISH(bl);
}
@@ -244,7 +244,7 @@ public:
::decode(parent, bl);
::decode(num_children, bl);
::decode(c, bl);
- ::decode(hoid, bl);
+ ::decode(oid, bl);
if (struct_v >= 2)
::decode(spos, bl);
DECODE_FINISH(bl);
@@ -255,7 +255,7 @@ public:
f->dump_unsigned("parent", parent);
f->dump_unsigned("num_children", num_children);
f->dump_stream("coll") << c;
- f->dump_stream("oid") << hoid;
+ f->dump_stream("oid") << oid;
}
static void generate_test_instances(list<_Header*> &o) {
@@ -269,15 +269,15 @@ public:
};
/// String munging (public for testing)
- static string hobject_key(const hobject_t &hoid);
- static string hobject_key_v0(coll_t c, const hobject_t &hoid);
- static bool parse_hobject_key_v0(const string &in,
- coll_t *c, hobject_t *hoid);
+ static string ghobject_key(const ghobject_t &oid);
+ static string ghobject_key_v0(coll_t c, const ghobject_t &oid);
+ static bool parse_ghobject_key_v0(const string &in,
+ coll_t *c, ghobject_t *oid);
private:
/// Implicit lock on Header->seq
typedef std::tr1::shared_ptr<_Header> Header;
- string map_header_key(const hobject_t &hoid);
+ string map_header_key(const ghobject_t &oid);
string header_key(uint64_t seq);
string complete_prefix(Header header);
string user_prefix(Header header);
@@ -368,40 +368,40 @@ private:
/// Set node containing input to new contents
void set_header(Header input, KeyValueDB::Transaction t);
- /// Remove leaf node corresponding to hoid in c
- void remove_map_header(const hobject_t &hoid,
+ /// Remove leaf node corresponding to oid in c
+ void remove_map_header(const ghobject_t &oid,
Header header,
KeyValueDB::Transaction t);
- /// Set leaf node for c and hoid to the value of header
- void set_map_header(const hobject_t &hoid, _Header header,
+ /// Set leaf node for c and oid to the value of header
+ void set_map_header(const ghobject_t &oid, _Header header,
KeyValueDB::Transaction t);
- /// Set leaf node for c and hoid to the value of header
- bool check_spos(const hobject_t &hoid,
+ /// Set leaf node for c and oid to the value of header
+ bool check_spos(const ghobject_t &oid,
Header header,
const SequencerPosition *spos);
- /// Lookup or create header for c hoid
- Header lookup_create_map_header(const hobject_t &hoid,
+ /// Lookup or create header for c oid
+ Header lookup_create_map_header(const ghobject_t &oid,
KeyValueDB::Transaction t);
/**
- * Generate new header for c hoid with new seq number
+ * Generate new header for c oid with new seq number
*
* Has the side effect of syncronously saving the new DBObjectMap state
*/
- Header _generate_new_header(const hobject_t &hoid, Header parent);
- Header generate_new_header(const hobject_t &hoid, Header parent) {
+ Header _generate_new_header(const ghobject_t &oid, Header parent);
+ Header generate_new_header(const ghobject_t &oid, Header parent) {
Mutex::Locker l(header_lock);
- return _generate_new_header(hoid, parent);
+ return _generate_new_header(oid, parent);
}
- /// Lookup leaf header for c hoid
- Header _lookup_map_header(const hobject_t &hoid);
- Header lookup_map_header(const hobject_t &hoid) {
+ /// Lookup leaf header for c oid
+ Header _lookup_map_header(const ghobject_t &oid);
+ Header lookup_map_header(const ghobject_t &oid) {
Mutex::Locker l(header_lock);
- return _lookup_map_header(hoid);
+ return _lookup_map_header(oid);
}
/// Lookup header node for input
@@ -448,12 +448,12 @@ private:
class RemoveMapHeaderOnDelete {
public:
DBObjectMap *db;
- hobject_t obj;
- RemoveMapHeaderOnDelete(DBObjectMap *db, const hobject_t &obj) :
- db(db), obj(obj) {}
+ ghobject_t oid;
+ RemoveMapHeaderOnDelete(DBObjectMap *db, const ghobject_t &oid) :
+ db(db), oid(oid) {}
void operator() (_Header *header) {
Mutex::Locker l(db->header_lock);
- db->map_header_in_use.erase(obj);
+ db->map_header_in_use.erase(oid);
db->map_header_cond.Signal();
delete header;
}
diff --git a/src/os/FDCache.h b/src/os/FDCache.h
index f0f40e7bbf4..93557d43c47 100644
--- a/src/os/FDCache.h
+++ b/src/os/FDCache.h
@@ -18,7 +18,7 @@
#include <memory>
#include <errno.h>
#include <cstdio>
-#include "hobject.h"
+#include "common/hobject.h"
#include "common/Mutex.h"
#include "common/Cond.h"
#include "common/shared_cache.hpp"
@@ -49,7 +49,7 @@ public:
};
private:
- SharedLRU<hobject_t, FD> registry;
+ SharedLRU<ghobject_t, FD> registry;
CephContext *cct;
public:
@@ -63,16 +63,16 @@ public:
}
typedef std::tr1::shared_ptr<FD> FDRef;
- FDRef lookup(const hobject_t &hoid) {
+ FDRef lookup(const ghobject_t &hoid) {
return registry.lookup(hoid);
}
- FDRef add(const hobject_t &hoid, int fd) {
+ FDRef add(const ghobject_t &hoid, int fd) {
return registry.add(hoid, new FD(fd));
}
/// clear cached fd for hoid, subsequent lookups will get an empty FD
- void clear(const hobject_t &hoid) {
+ void clear(const ghobject_t &hoid) {
registry.clear(hoid);
assert(!registry.lookup(hoid));
}
diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc
index 80561056daa..6940dff1405 100644
--- a/src/os/FileStore.cc
+++ b/src/os/FileStore.cc
@@ -11,8 +11,8 @@
* Foundation. See file COPYING.
*
*/
+#include "include/int_types.h"
-#include <inttypes.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
@@ -30,10 +30,6 @@
#include <iostream>
#include <map>
-#if defined(__FreeBSD__)
-#include "include/inttypes.h"
-#endif
-
#include "include/compat.h"
#include "include/linux_fiemap.h"
@@ -90,6 +86,23 @@ using ceph::crypto::SHA1;
#define REPLAY_GUARD_XATTR "user.cephos.seq"
#define GLOBAL_REPLAY_GUARD_XATTR "user.cephos.gseq"
+//Initial features in new superblock.
+static CompatSet get_fs_initial_compat_set() {
+ CompatSet::FeatureSet ceph_osd_feature_compat;
+ CompatSet::FeatureSet ceph_osd_feature_ro_compat;
+ CompatSet::FeatureSet ceph_osd_feature_incompat;
+ return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
+ ceph_osd_feature_incompat);
+}
+
+//Features are added here that this FileStore supports.
+static CompatSet get_fs_supported_compat_set() {
+ CompatSet compat = get_fs_initial_compat_set();
+ //Any features here can be set in code, but not in initial superblock
+ compat.incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
+ return compat;
+}
+
void FileStore::FSPerfTracker::update_from_perfcounters(
PerfCounters &logger)
@@ -128,12 +141,12 @@ int FileStore::init_index(coll_t cid)
{
char path[PATH_MAX];
get_cdir(cid, path, sizeof(path));
- int r = index_manager.init_index(cid, path, on_disk_version);
+ int r = index_manager.init_index(cid, path, target_version);
assert(!m_filestore_fail_eio || r != -EIO);
return r;
}
-int FileStore::lfn_find(coll_t cid, const hobject_t& oid, IndexedPath *path)
+int FileStore::lfn_find(coll_t cid, const ghobject_t& oid, IndexedPath *path)
{
Index index;
int r, exist;
@@ -151,20 +164,25 @@ int FileStore::lfn_find(coll_t cid, const hobject_t& oid, IndexedPath *path)
return 0;
}
-int FileStore::lfn_truncate(coll_t cid, const hobject_t& oid, off_t length)
+int FileStore::lfn_truncate(coll_t cid, const ghobject_t& oid, off_t length)
{
IndexedPath path;
- int r = lfn_find(cid, oid, &path);
+ FDRef fd;
+ int r = lfn_open(cid, oid, false, &fd, &path);
if (r < 0)
return r;
- r = ::truncate(path->path(), length);
+ r = ::ftruncate(**fd, length);
if (r < 0)
r = -errno;
+ if (r >= 0 && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_truncate(**fd, length);
+ assert(rc >= 0);
+ }
assert(!m_filestore_fail_eio || r != -EIO);
return r;
}
-int FileStore::lfn_stat(coll_t cid, const hobject_t& oid, struct stat *buf)
+int FileStore::lfn_stat(coll_t cid, const ghobject_t& oid, struct stat *buf)
{
IndexedPath path;
int r = lfn_find(cid, oid, &path);
@@ -177,12 +195,15 @@ int FileStore::lfn_stat(coll_t cid, const hobject_t& oid, struct stat *buf)
}
int FileStore::lfn_open(coll_t cid,
- const hobject_t& oid,
+ const ghobject_t& oid,
bool create,
FDRef *outfd,
IndexedPath *path,
Index *index)
{
+ assert(get_allow_sharded_objects() ||
+ ( oid.shard_id == ghobject_t::NO_SHARD &&
+ oid.generation == ghobject_t::NO_GEN ));
assert(outfd);
int flags = O_RDWR;
if (create)
@@ -250,14 +271,14 @@ void FileStore::lfn_close(FDRef fd)
{
}
-int FileStore::lfn_link(coll_t c, coll_t cid, const hobject_t& o)
+int FileStore::lfn_link(coll_t c, coll_t newcid, const ghobject_t& o, const ghobject_t& newoid)
{
Index index_new, index_old;
IndexedPath path_new, path_old;
int exist;
int r;
- if (c < cid) {
- r = get_index(cid, &index_new);
+ if (c < newcid) {
+ r = get_index(newcid, &index_new);
if (r < 0)
return r;
r = get_index(c, &index_old);
@@ -267,7 +288,7 @@ int FileStore::lfn_link(coll_t c, coll_t cid, const hobject_t& o)
r = get_index(c, &index_old);
if (r < 0)
return r;
- r = get_index(cid, &index_new);
+ r = get_index(newcid, &index_new);
if (r < 0)
return r;
}
@@ -280,7 +301,7 @@ int FileStore::lfn_link(coll_t c, coll_t cid, const hobject_t& o)
if (!exist)
return -ENOENT;
- r = index_new->lookup(o, &path_new, &exist);
+ r = index_new->lookup(newoid, &path_new, &exist);
if (r < 0) {
assert(!m_filestore_fail_eio || r != -EIO);
return r;
@@ -294,7 +315,7 @@ int FileStore::lfn_link(coll_t c, coll_t cid, const hobject_t& o)
if (r < 0)
return -errno;
- r = index_new->created(o, path_new->path());
+ r = index_new->created(newoid, path_new->path());
if (r < 0) {
assert(!m_filestore_fail_eio || r != -EIO);
return r;
@@ -302,8 +323,9 @@ int FileStore::lfn_link(coll_t c, coll_t cid, const hobject_t& o)
return 0;
}
-int FileStore::lfn_unlink(coll_t cid, const hobject_t& o,
- const SequencerPosition &spos)
+int FileStore::lfn_unlink(coll_t cid, const ghobject_t& o,
+ const SequencerPosition &spos,
+ bool force_clear_omap)
{
Index index;
int r = get_index(cid, &index);
@@ -319,14 +341,18 @@ int FileStore::lfn_unlink(coll_t cid, const hobject_t& o,
return r;
}
- struct stat st;
- r = ::stat(path->path(), &st);
- if (r < 0) {
- r = -errno;
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
+ if (!force_clear_omap) {
+ struct stat st;
+ r = ::stat(path->path(), &st);
+ if (r < 0) {
+ r = -errno;
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+ }
+ if (st.st_nlink == 1)
+ force_clear_omap = true;
}
- if (st.st_nlink == 1) {
+ if (force_clear_omap) {
dout(20) << __func__ << ": clearing omap on " << o
<< " in cid " << cid << dendl;
r = object_map->clear(o, &spos);
@@ -396,7 +422,12 @@ FileStore::FileStore(const std::string &base, const std::string &jdev, const cha
m_filestore_queue_committing_max_ops(g_conf->filestore_queue_committing_max_ops),
m_filestore_queue_committing_max_bytes(g_conf->filestore_queue_committing_max_bytes),
m_filestore_do_dump(false),
- m_filestore_dump_fmt(true)
+ m_filestore_dump_fmt(true),
+ m_filestore_sloppy_crc(g_conf->filestore_sloppy_crc),
+ m_filestore_sloppy_crc_block_size(g_conf->filestore_sloppy_crc_block_size),
+ m_fs_type(FS_TYPE_NONE),
+ m_filestore_max_inline_xattr_size(0),
+ m_filestore_max_inline_xattrs(0)
{
m_filestore_kill_at.set(g_conf->filestore_kill_at);
@@ -446,6 +477,8 @@ FileStore::FileStore(const std::string &base, const std::string &jdev, const cha
generic_backend = new GenericFileStoreBackend(this);
backend = generic_backend;
+
+ superblock.compat_features = get_fs_initial_compat_set();
}
FileStore::~FileStore()
@@ -591,6 +624,13 @@ int FileStore::mkfs()
goto close_fsid_fd;
}
+ ret = write_superblock();
+ if (ret < 0) {
+ derr << "mkfs: write_superblock() failed: "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+
struct statfs basefs;
ret = ::fstatfs(basedir_fd, &basefs);
if (ret < 0) {
@@ -790,12 +830,14 @@ int FileStore::_detect_fs()
blk_size = st.f_bsize;
+ m_fs_type = FS_TYPE_OTHER;
#if defined(__linux__)
if (st.f_type == BTRFS_SUPER_MAGIC) {
dout(0) << "mount detected btrfs" << dendl;
backend = new BtrfsFileStoreBackend(this);
wbthrottle.set_fs(WBThrottle::BTRFS);
+ m_fs_type = FS_TYPE_BTRFS;
} else if (st.f_type == XFS_SUPER_MAGIC) {
dout(1) << "mount detected xfs" << dendl;
if (m_filestore_replica_fadvise) {
@@ -803,15 +845,19 @@ int FileStore::_detect_fs()
g_conf->set_val("filestore_replica_fadvise", "false");
g_conf->apply_changes(NULL);
assert(m_filestore_replica_fadvise == false);
+ m_fs_type = FS_TYPE_XFS;
}
}
#endif
#ifdef HAVE_LIBZFS
if (st.f_type == ZFS_SUPER_MAGIC) {
backend = new ZFSFileStoreBackend(this);
+ m_fs_type = FS_TYPE_ZFS;
}
#endif
+ set_xattr_limits_via_conf();
+
r = backend->detect_features();
if (r < 0) {
derr << "_detect_fs: detect_features error: " << cpp_strerror(r) << dendl;
@@ -852,14 +898,7 @@ int FileStore::_detect_fs()
chain_fsetxattr(tmpfd, "user.test4", &buf, sizeof(buf));
ret = chain_fsetxattr(tmpfd, "user.test5", &buf, sizeof(buf));
if (ret == -ENOSPC) {
- if (!g_conf->filestore_xattr_use_omap) {
- dout(0) << "limited size xattrs -- automatically enabling filestore_xattr_use_omap" << dendl;
- g_conf->set_val("filestore_xattr_use_omap", "true");
- g_conf->apply_changes(NULL);
- assert(g_conf->filestore_xattr_use_omap == true);
- } else {
- dout(0) << "limited size xattrs -- filestore_xattr_use_omap already enabled" << dendl;
- }
+ dout(0) << "limited size xattrs" << dendl;
}
chain_fremovexattr(tmpfd, "user.test");
chain_fremovexattr(tmpfd, "user.test2");
@@ -916,6 +955,49 @@ int FileStore::_sanity_check_fs()
return 0;
}
+/**
+ * Encode the in-memory superblock and write it to the "superblock" file
+ * under basedir via safe_write_file().
+ *
+ * @return the result of safe_write_file(); callers treat < 0 as failure
+ */
+int FileStore::write_superblock()
+{
+  bufferlist encoded;
+  ::encode(superblock, encoded);
+
+  const char *data = encoded.c_str();
+  const unsigned len = encoded.length();
+  return safe_write_file(basedir.c_str(), "superblock", data, len);
+}
+
+/**
+ * Load the superblock from the "superblock" file under basedir.
+ *
+ * If the file does not exist (-ENOENT), the current in-memory superblock
+ * (the initial CompatSet) is written out instead and that result returned.
+ *
+ * @return 0 on success; the negative error from safe_read_file() or
+ *         write_superblock(); -EINVAL if the contents cannot be decoded
+ */
+int FileStore::read_superblock()
+{
+  bufferptr bp(PATH_MAX);
+  int ret = safe_read_file(basedir.c_str(), "superblock",
+                           bp.c_str(), bp.length());
+  if (ret < 0) {
+    if (ret == -ENOENT) {
+      // If the file doesn't exist write initial CompatSet
+      return write_superblock();
+    }
+    return ret;
+  }
+
+  bufferlist bl;
+  bl.push_back(bp);
+  bufferlist::iterator i = bl.begin();
+  try {
+    ::decode(superblock, i);
+  } catch (buffer::error& e) {
+    // A corrupt superblock must be reported as an error to the caller
+    // (mount) rather than escaping as an uncaught exception.
+    derr << "read_superblock: failed to decode superblock: "
+         << e.what() << dendl;
+    return -EINVAL;
+  }
+  return 0;
+}
+
+/**
+ * Enable sharded objects for this store: add
+ * CEPH_FS_FEATURE_INCOMPAT_SHARDS to the superblock's incompat feature set
+ * and persist the superblock. No-op when the feature is already present.
+ */
+void FileStore::set_allow_sharded_objects()
+{
+  if (get_allow_sharded_objects())
+    return;  // feature already recorded, nothing to write
+
+  superblock.compat_features.incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
+  int ret = write_superblock();
+  assert(ret == 0); //Should we return error and make caller handle it?
+}
+
+/**
+ * @return true if the superblock's incompat feature set contains
+ *         CEPH_FS_FEATURE_INCOMPAT_SHARDS
+ */
+bool FileStore::get_allow_sharded_objects()
+{
+  const bool sharded =
+    superblock.compat_features.incompat.contains(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
+  return sharded;
+}
+
int FileStore::update_version_stamp()
{
return write_version_stamp();
@@ -923,25 +1005,19 @@ int FileStore::update_version_stamp()
int FileStore::version_stamp_is_valid(uint32_t *version)
{
- char fn[PATH_MAX];
- snprintf(fn, sizeof(fn), "%s/store_version", basedir.c_str());
- int fd = ::open(fn, O_RDONLY, 0644);
- if (fd < 0) {
- if (errno == ENOENT)
+ bufferptr bp(PATH_MAX);
+ int ret = safe_read_file(basedir.c_str(), "store_version",
+ bp.c_str(), bp.length());
+ if (ret < 0) {
+ if (ret == -ENOENT)
return 0;
- else
- return -errno;
+ return ret;
}
- bufferptr bp(PATH_MAX);
- int ret = safe_read(fd, bp.c_str(), bp.length());
- TEMP_FAILURE_RETRY(::close(fd));
- if (ret < 0)
- return -errno;
bufferlist bl;
bl.push_back(bp);
bufferlist::iterator i = bl.begin();
::decode(*version, i);
- if (*version == on_disk_version)
+ if (*version == target_version)
return 1;
else
return 0;
@@ -949,19 +1025,11 @@ int FileStore::version_stamp_is_valid(uint32_t *version)
+// Encode target_version and write it to the "store_version" file under
+// basedir. Returns the result of safe_write_file().
int FileStore::write_version_stamp()
{
- char fn[PATH_MAX];
- snprintf(fn, sizeof(fn), "%s/store_version", basedir.c_str());
- int fd = ::open(fn, O_WRONLY|O_CREAT|O_TRUNC, 0644);
- if (fd < 0)
- return -errno;
bufferlist bl;
- ::encode(on_disk_version, bl);
-
- int ret = safe_write(fd, bl.c_str(), bl.length());
- TEMP_FAILURE_RETRY(::close(fd));
- if (ret < 0)
- return -errno;
- return 0;
+ ::encode(target_version, bl);
+
+ return safe_write_file(basedir.c_str(), "store_version",
+ bl.c_str(), bl.length());
}
int FileStore::read_op_seq(uint64_t *seq)
@@ -1003,6 +1071,7 @@ int FileStore::mount()
char buf[PATH_MAX];
uint64_t initial_op_seq;
set<string> cluster_snaps;
+ CompatSet supported_compat_set = get_fs_supported_compat_set();
dout(5) << "basedir " << basedir << " journal " << journalpath << dendl;
@@ -1057,12 +1126,26 @@ int FileStore::mount()
ret = -EINVAL;
derr << "FileStore::mount : stale version stamp " << version_stamp
<< ". Please run the FileStore update script before starting the "
- << "OSD, or set filestore_update_to to " << on_disk_version
+ << "OSD, or set filestore_update_to to " << target_version
<< dendl;
goto close_fsid_fd;
}
}
+ ret = read_superblock();
+ if (ret < 0) {
+ ret = -EINVAL;
+ goto close_fsid_fd;
+ }
+
+ // Check if this FileStore supports all the necessary features to mount
+ if (supported_compat_set.compare(superblock.compat_features) == -1) {
+ derr << "FileStore::mount : Incompatible features set "
+ << superblock.compat_features << dendl;
+ ret = -EINVAL;
+ goto close_fsid_fd;
+ }
+
// open some dir handles
basedir_fd = ::open(basedir.c_str(), O_RDONLY);
if (basedir_fd < 0) {
@@ -1710,7 +1793,7 @@ int FileStore::_do_transactions(
for (list<Transaction*>::iterator p = tls.begin();
p != tls.end();
++p, trans_num++) {
- r = _do_transaction(**p, op_seq, trans_num);
+ r = _do_transaction(**p, op_seq, trans_num, handle);
if (r < 0)
break;
if (handle)
@@ -1812,7 +1895,7 @@ void FileStore::_set_replay_guard(coll_t cid,
void FileStore::_set_replay_guard(int fd,
const SequencerPosition& spos,
- const hobject_t *hoid,
+ const ghobject_t *hoid,
bool in_progress)
{
if (backend->can_checkpoint())
@@ -1893,7 +1976,7 @@ void FileStore::_close_replay_guard(int fd, const SequencerPosition& spos)
dout(10) << "_close_replay_guard " << spos << " done" << dendl;
}
-int FileStore::_check_replay_guard(coll_t cid, hobject_t oid, const SequencerPosition& spos)
+int FileStore::_check_replay_guard(coll_t cid, ghobject_t oid, const SequencerPosition& spos)
{
if (!replaying || backend->can_checkpoint())
return 1;
@@ -1972,7 +2055,9 @@ int FileStore::_check_replay_guard(int fd, const SequencerPosition& spos)
}
}
-unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_num)
+unsigned FileStore::_do_transaction(
+ Transaction& t, uint64_t op_seq, int trans_num,
+ ThreadPool::TPHandle *handle)
{
dout(10) << "_do_transaction on " << &t << dendl;
@@ -1980,6 +2065,9 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
SequencerPosition spos(op_seq, trans_num, 0);
while (i.have_op()) {
+ if (handle)
+ handle->reset_tp_timeout();
+
int op = i.get_op();
int r = 0;
@@ -1991,7 +2079,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_TOUCH:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
if (_check_replay_guard(cid, oid, spos) > 0)
r = _touch(cid, oid);
}
@@ -2000,7 +2088,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_WRITE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
uint64_t off = i.get_length();
uint64_t len = i.get_length();
bool replica = i.get_replica();
@@ -2014,7 +2102,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_ZERO:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
uint64_t off = i.get_length();
uint64_t len = i.get_length();
if (_check_replay_guard(cid, oid, spos) > 0)
@@ -2035,7 +2123,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_TRUNCATE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
uint64_t off = i.get_length();
if (_check_replay_guard(cid, oid, spos) > 0)
r = _truncate(cid, oid, off);
@@ -2045,7 +2133,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_REMOVE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
if (_check_replay_guard(cid, oid, spos) > 0)
r = _remove(cid, oid, spos);
}
@@ -2054,7 +2142,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_SETATTR:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
string name = i.get_attrname();
bufferlist bl;
i.get_bl(bl);
@@ -2072,7 +2160,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_SETATTRS:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
map<string, bufferptr> aset;
i.get_attrset(aset);
if (_check_replay_guard(cid, oid, spos) > 0)
@@ -2085,7 +2173,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_RMATTR:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
string name = i.get_attrname();
if (_check_replay_guard(cid, oid, spos) > 0)
r = _rmattr(cid, oid, name.c_str(), spos);
@@ -2095,7 +2183,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_RMATTRS:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
if (_check_replay_guard(cid, oid, spos) > 0)
r = _rmattrs(cid, oid, spos);
}
@@ -2104,8 +2192,8 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_CLONE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
- hobject_t noid = i.get_oid();
+ ghobject_t oid = i.get_oid();
+ ghobject_t noid = i.get_oid();
r = _clone(cid, oid, noid, spos);
}
break;
@@ -2113,8 +2201,8 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_CLONERANGE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
- hobject_t noid = i.get_oid();
+ ghobject_t oid = i.get_oid();
+ ghobject_t noid = i.get_oid();
uint64_t off = i.get_length();
uint64_t len = i.get_length();
r = _clone_range(cid, oid, noid, off, len, off, spos);
@@ -2124,8 +2212,8 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_CLONERANGE2:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
- hobject_t noid = i.get_oid();
+ ghobject_t oid = i.get_oid();
+ ghobject_t noid = i.get_oid();
uint64_t srcoff = i.get_length();
uint64_t len = i.get_length();
uint64_t dstoff = i.get_length();
@@ -2153,7 +2241,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
{
coll_t ncid = i.get_cid();
coll_t ocid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
r = _collection_add(ncid, ocid, oid, spos);
}
break;
@@ -2161,7 +2249,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_COLL_REMOVE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
if (_check_replay_guard(cid, oid, spos) > 0)
r = _remove(cid, oid, spos);
}
@@ -2172,7 +2260,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
// WARNING: this is deprecated and buggy; only here to replay old journals.
coll_t ocid = i.get_cid();
coll_t ncid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
r = _collection_add(ocid, ncid, oid, spos);
if (r == 0 &&
(_check_replay_guard(ocid, oid, spos) > 0))
@@ -2180,6 +2268,16 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
}
break;
+ case Transaction::OP_COLL_MOVE_RENAME:
+ {
+ coll_t oldcid = i.get_cid();
+ ghobject_t oldoid = i.get_oid();
+ coll_t newcid = i.get_cid();
+ ghobject_t newoid = i.get_oid();
+ r = _collection_move_rename(oldcid, oldoid, newcid, newoid, spos);
+ }
+ break;
+
case Transaction::OP_COLL_SETATTR:
{
coll_t cid = i.get_cid();
@@ -2215,14 +2313,14 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_OMAP_CLEAR:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
r = _omap_clear(cid, oid, spos);
}
break;
case Transaction::OP_OMAP_SETKEYS:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
map<string, bufferlist> aset;
i.get_attrset(aset);
r = _omap_setkeys(cid, oid, aset, spos);
@@ -2231,7 +2329,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_OMAP_RMKEYS:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
set<string> keys;
i.get_keyset(keys);
r = _omap_rmkeys(cid, oid, keys, spos);
@@ -2240,7 +2338,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_OMAP_RMKEYRANGE:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
string first, last;
first = i.get_key();
last = i.get_key();
@@ -2250,7 +2348,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_OMAP_SETHEADER:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
bufferlist bl;
i.get_bl(bl);
r = _omap_setheader(cid, oid, bl, spos);
@@ -2370,7 +2468,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
// --------------------
// objects
-bool FileStore::exists(coll_t cid, const hobject_t& oid)
+bool FileStore::exists(coll_t cid, const ghobject_t& oid)
{
struct stat st;
if (stat(cid, oid, &st) == 0)
@@ -2380,7 +2478,7 @@ bool FileStore::exists(coll_t cid, const hobject_t& oid)
}
int FileStore::stat(
- coll_t cid, const hobject_t& oid, struct stat *st, bool allow_eio)
+ coll_t cid, const ghobject_t& oid, struct stat *st, bool allow_eio)
{
int r = lfn_stat(cid, oid, st);
assert(allow_eio || !m_filestore_fail_eio || r != -EIO);
@@ -2402,7 +2500,7 @@ int FileStore::stat(
int FileStore::read(
coll_t cid,
- const hobject_t& oid,
+ const ghobject_t& oid,
uint64_t offset,
size_t len,
bufferlist& bl,
@@ -2438,6 +2536,17 @@ int FileStore::read(
}
bptr.set_length(got); // properly size the buffer
bl.push_back(bptr); // put it in the target bufferlist
+
+ if (m_filestore_sloppy_crc && (!replaying || backend->can_checkpoint())) {
+ ostringstream ss;
+ int errors = backend->_crc_verify_read(**fd, offset, got, bl, &ss);
+ if (errors > 0) {
+ dout(0) << "FileStore::read " << cid << "/" << oid << " " << offset << "~"
+ << got << " ... BAD CRC:\n" << ss.str() << dendl;
+ assert(0 == "bad crc on read");
+ }
+ }
+
lfn_close(fd);
dout(10) << "FileStore::read " << cid << "/" << oid << " " << offset << "~"
@@ -2450,7 +2559,7 @@ int FileStore::read(
}
}
-int FileStore::fiemap(coll_t cid, const hobject_t& oid,
+int FileStore::fiemap(coll_t cid, const ghobject_t& oid,
uint64_t offset, size_t len,
bufferlist& bl)
{
@@ -2478,8 +2587,10 @@ int FileStore::fiemap(coll_t cid, const hobject_t& oid,
if (r < 0)
goto done;
- if (fiemap->fm_mapped_extents == 0)
+ if (fiemap->fm_mapped_extents == 0) {
+ free(fiemap);
goto done;
+ }
struct fiemap_extent *extent = &fiemap->fm_extents[0];
@@ -2513,6 +2624,7 @@ int FileStore::fiemap(coll_t cid, const hobject_t& oid,
i++;
extent++;
}
+ free(fiemap);
}
done:
@@ -2522,13 +2634,12 @@ done:
}
dout(10) << "fiemap " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << " num_extents=" << exomap.size() << " " << exomap << dendl;
- free(fiemap);
assert(!m_filestore_fail_eio || r != -EIO);
return r;
}
-int FileStore::_remove(coll_t cid, const hobject_t& oid,
+int FileStore::_remove(coll_t cid, const ghobject_t& oid,
const SequencerPosition &spos)
{
dout(15) << "remove " << cid << "/" << oid << dendl;
@@ -2537,7 +2648,7 @@ int FileStore::_remove(coll_t cid, const hobject_t& oid,
return r;
}
-int FileStore::_truncate(coll_t cid, const hobject_t& oid, uint64_t size)
+int FileStore::_truncate(coll_t cid, const ghobject_t& oid, uint64_t size)
{
dout(15) << "truncate " << cid << "/" << oid << " size " << size << dendl;
int r = lfn_truncate(cid, oid, size);
@@ -2546,7 +2657,7 @@ int FileStore::_truncate(coll_t cid, const hobject_t& oid, uint64_t size)
}
-int FileStore::_touch(coll_t cid, const hobject_t& oid)
+int FileStore::_touch(coll_t cid, const ghobject_t& oid)
{
dout(15) << "touch " << cid << "/" << oid << dendl;
@@ -2561,7 +2672,7 @@ int FileStore::_touch(coll_t cid, const hobject_t& oid)
return r;
}
-int FileStore::_write(coll_t cid, const hobject_t& oid,
+int FileStore::_write(coll_t cid, const ghobject_t& oid,
uint64_t offset, size_t len,
const bufferlist& bl, bool replica)
{
@@ -2599,8 +2710,14 @@ int FileStore::_write(coll_t cid, const hobject_t& oid,
if (r == 0)
r = bl.length();
+ if (r >= 0 && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_write(**fd, offset, len, bl);
+ assert(rc >= 0);
+ }
+
// flush?
- if (!replaying)
+ if (!replaying &&
+ g_conf->filestore_wbthrottle_enable)
wbthrottle.queue_wb(fd, oid, offset, len, replica);
lfn_close(fd);
@@ -2609,7 +2726,7 @@ int FileStore::_write(coll_t cid, const hobject_t& oid,
return r;
}
-int FileStore::_zero(coll_t cid, const hobject_t& oid, uint64_t offset, size_t len)
+int FileStore::_zero(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len)
{
dout(15) << "zero " << cid << "/" << oid << " " << offset << "~" << len << dendl;
int ret = 0;
@@ -2629,6 +2746,11 @@ int FileStore::_zero(coll_t cid, const hobject_t& oid, uint64_t offset, size_t l
ret = -errno;
lfn_close(fd);
+ if (ret >= 0 && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_zero(**fd, offset, len);
+ assert(rc >= 0);
+ }
+
if (ret == 0)
goto out; // yay!
if (ret != -EOPNOTSUPP)
@@ -2652,7 +2774,7 @@ int FileStore::_zero(coll_t cid, const hobject_t& oid, uint64_t offset, size_t l
return ret;
}
-int FileStore::_clone(coll_t cid, const hobject_t& oldoid, const hobject_t& newoid,
+int FileStore::_clone(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid,
const SequencerPosition& spos)
{
dout(15) << "clone " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << dendl;
@@ -2782,11 +2904,15 @@ int FileStore::_do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, u
break;
pos += r;
}
+ if (r >= 0 && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff);
+ assert(rc >= 0);
+ }
dout(20) << "_do_copy_range " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl;
return r;
}
-int FileStore::_clone_range(coll_t cid, const hobject_t& oldoid, const hobject_t& newoid,
+int FileStore::_clone_range(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid,
uint64_t srcoff, uint64_t len, uint64_t dstoff,
const SequencerPosition& spos)
{
@@ -3227,23 +3353,23 @@ int FileStore::_fsetattrs(int fd, map<string, bufferptr> &aset)
}
// debug EIO injection
-void FileStore::inject_data_error(const hobject_t &oid) {
+void FileStore::inject_data_error(const ghobject_t &oid) {
+ // Debug hook: while holding read_error_lock, add oid to data_error_set
+ // (the set debug_data_eio() checks).
Mutex::Locker l(read_error_lock);
dout(10) << __func__ << ": init error on " << oid << dendl;
data_error_set.insert(oid);
}
-void FileStore::inject_mdata_error(const hobject_t &oid) {
+void FileStore::inject_mdata_error(const ghobject_t &oid) {
+ // Debug hook: while holding read_error_lock, add oid to mdata_error_set
+ // (the set debug_mdata_eio() checks).
Mutex::Locker l(read_error_lock);
dout(10) << __func__ << ": init error on " << oid << dendl;
mdata_error_set.insert(oid);
}
-void FileStore::debug_obj_on_delete(const hobject_t &oid) {
+void FileStore::debug_obj_on_delete(const ghobject_t &oid) {
+ // Debug hook: while holding read_error_lock, drop oid from both the data
+ // and metadata error-injection sets.
Mutex::Locker l(read_error_lock);
dout(10) << __func__ << ": clear error on " << oid << dendl;
data_error_set.erase(oid);
mdata_error_set.erase(oid);
}
-bool FileStore::debug_data_eio(const hobject_t &oid) {
+bool FileStore::debug_data_eio(const ghobject_t &oid) {
Mutex::Locker l(read_error_lock);
if (data_error_set.count(oid)) {
dout(10) << __func__ << ": inject error on " << oid << dendl;
@@ -3252,7 +3378,7 @@ bool FileStore::debug_data_eio(const hobject_t &oid) {
return false;
}
}
-bool FileStore::debug_mdata_eio(const hobject_t &oid) {
+bool FileStore::debug_mdata_eio(const ghobject_t &oid) {
Mutex::Locker l(read_error_lock);
if (mdata_error_set.count(oid)) {
dout(10) << __func__ << ": inject error on " << oid << dendl;
@@ -3265,7 +3391,7 @@ bool FileStore::debug_mdata_eio(const hobject_t &oid) {
// objects
-int FileStore::getattr(coll_t cid, const hobject_t& oid, const char *name, bufferptr &bp)
+int FileStore::getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferptr &bp)
{
dout(15) << "getattr " << cid << "/" << oid << " '" << name << "'" << dendl;
FDRef fd;
@@ -3277,7 +3403,7 @@ int FileStore::getattr(coll_t cid, const hobject_t& oid, const char *name, buffe
get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN);
r = _fgetattr(**fd, n, bp);
lfn_close(fd);
- if (r == -ENODATA && g_conf->filestore_xattr_use_omap) {
+ if (r == -ENODATA) {
map<string, bufferlist> got;
set<string> to_get;
to_get.insert(string(name));
@@ -3311,8 +3437,11 @@ int FileStore::getattr(coll_t cid, const hobject_t& oid, const char *name, buffe
}
}
-int FileStore::getattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>& aset, bool user_only)
+int FileStore::getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset, bool user_only)
{
+ set<string> omap_attrs;
+ map<string, bufferlist> omap_aset;
+ Index index;
dout(15) << "getattrs " << cid << "/" << oid << dendl;
FDRef fd;
int r = lfn_open(cid, oid, false, &fd);
@@ -3320,43 +3449,43 @@ int FileStore::getattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>&
goto out;
}
r = _fgetattrs(**fd, aset, user_only);
+ if (r < 0) {
+ goto out;
+ }
lfn_close(fd);
- if (g_conf->filestore_xattr_use_omap) {
- set<string> omap_attrs;
- map<string, bufferlist> omap_aset;
- Index index;
- int r = get_index(cid, &index);
- if (r < 0) {
- dout(10) << __func__ << " could not get index r = " << r << dendl;
- goto out;
- }
- r = object_map->get_all_xattrs(oid, &omap_attrs);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
- goto out;
- }
- r = object_map->get_xattrs(oid, omap_attrs, &omap_aset);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
- goto out;
- }
- assert(omap_attrs.size() == omap_aset.size());
- for (map<string, bufferlist>::iterator i = omap_aset.begin();
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __func__ << " could not get index r = " << r << dendl;
+ goto out;
+ }
+ r = object_map->get_all_xattrs(oid, &omap_attrs);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
+ goto out;
+ }
+ r = object_map->get_xattrs(oid, omap_attrs, &omap_aset);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
+ goto out;
+ }
+ if (r == -ENOENT)
+ r = 0;
+ assert(omap_attrs.size() == omap_aset.size());
+ for (map<string, bufferlist>::iterator i = omap_aset.begin();
i != omap_aset.end();
++i) {
- string key;
- if (user_only) {
+ string key;
+ if (user_only) {
if (i->first[0] != '_')
continue;
if (i->first == "_")
continue;
key = i->first.substr(1, i->first.size());
- } else {
+ } else {
key = i->first;
- }
- aset.insert(make_pair(key,
- bufferptr(i->second.c_str(), i->second.length())));
}
+ aset.insert(make_pair(key,
+ bufferptr(i->second.c_str(), i->second.length())));
}
out:
dout(10) << "getattrs " << cid << "/" << oid << " = " << r << dendl;
@@ -3370,7 +3499,7 @@ int FileStore::getattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>&
}
}
-int FileStore::_setattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>& aset,
+int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset,
const SequencerPosition &spos)
{
map<string, bufferlist> omap_set;
@@ -3382,10 +3511,8 @@ int FileStore::_setattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>
if (r < 0) {
goto out;
}
- if (g_conf->filestore_xattr_use_omap) {
- r = _fgetattrs(**fd, inline_set, false);
- assert(!m_filestore_fail_eio || r != -EIO);
- }
+ r = _fgetattrs(**fd, inline_set, false);
+ assert(!m_filestore_fail_eio || r != -EIO);
dout(15) << "setattrs " << cid << "/" << oid << dendl;
r = 0;
for (map<string,bufferptr>::iterator p = aset.begin();
@@ -3393,8 +3520,8 @@ int FileStore::_setattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>
++p) {
char n[CHAIN_XATTR_MAX_NAME_LEN];
get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN);
- if (g_conf->filestore_xattr_use_omap) {
- if (p->second.length() > g_conf->filestore_max_inline_xattr_size) {
+
+ if (p->second.length() > m_filestore_max_inline_xattr_size) {
if (inline_set.count(p->first)) {
inline_set.erase(p->first);
r = chain_fremovexattr(**fd, n);
@@ -3403,10 +3530,10 @@ int FileStore::_setattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>
}
omap_set[p->first].push_back(p->second);
continue;
- }
+ }
- if (!inline_set.count(p->first) &&
- inline_set.size() >= g_conf->filestore_max_inline_xattrs) {
+ if (!inline_set.count(p->first) &&
+ inline_set.size() >= m_filestore_max_inline_xattrs) {
if (inline_set.count(p->first)) {
inline_set.erase(p->first);
r = chain_fremovexattr(**fd, n);
@@ -3415,10 +3542,9 @@ int FileStore::_setattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>
}
omap_set[p->first].push_back(p->second);
continue;
- }
- omap_remove.insert(p->first);
- inline_set.insert(*p);
}
+ omap_remove.insert(p->first);
+ inline_set.insert(*p);
inline_to_set.insert(*p);
@@ -3429,17 +3555,17 @@ int FileStore::_setattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>
goto out_close;
if (!omap_remove.empty()) {
- assert(g_conf->filestore_xattr_use_omap);
r = object_map->remove_xattrs(oid, omap_remove, &spos);
if (r < 0 && r != -ENOENT) {
dout(10) << __func__ << " could not remove_xattrs r = " << r << dendl;
assert(!m_filestore_fail_eio || r != -EIO);
goto out_close;
+ } else {
+ r = 0; // don't confuse the debug output
}
}
if (!omap_set.empty()) {
- assert(g_conf->filestore_xattr_use_omap);
r = object_map->set_xattrs(oid, omap_set, &spos);
if (r < 0) {
dout(10) << __func__ << " could not set_xattrs r = " << r << dendl;
@@ -3455,7 +3581,7 @@ int FileStore::_setattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>
}
-int FileStore::_rmattr(coll_t cid, const hobject_t& oid, const char *name,
+int FileStore::_rmattr(coll_t cid, const ghobject_t& oid, const char *name,
const SequencerPosition &spos)
{
dout(15) << "rmattr " << cid << "/" << oid << " '" << name << "'" << dendl;
@@ -3467,7 +3593,7 @@ int FileStore::_rmattr(coll_t cid, const hobject_t& oid, const char *name,
char n[CHAIN_XATTR_MAX_NAME_LEN];
get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN);
r = chain_fremovexattr(**fd, n);
- if (r == -ENODATA && g_conf->filestore_xattr_use_omap) {
+ if (r == -ENODATA) {
Index index;
r = get_index(cid, &index);
if (r < 0) {
@@ -3490,13 +3616,15 @@ int FileStore::_rmattr(coll_t cid, const hobject_t& oid, const char *name,
return r;
}
-int FileStore::_rmattrs(coll_t cid, const hobject_t& oid,
+int FileStore::_rmattrs(coll_t cid, const ghobject_t& oid,
const SequencerPosition &spos)
{
dout(15) << "rmattrs " << cid << "/" << oid << dendl;
map<string,bufferptr> aset;
FDRef fd;
+ set<string> omap_attrs;
+ Index index;
int r = lfn_open(cid, oid, false, &fd);
if (r < 0) {
goto out;
@@ -3513,26 +3641,24 @@ int FileStore::_rmattrs(coll_t cid, const hobject_t& oid,
}
lfn_close(fd);
- if (g_conf->filestore_xattr_use_omap) {
- set<string> omap_attrs;
- Index index;
- r = get_index(cid, &index);
- if (r < 0) {
- dout(10) << __func__ << " could not get index r = " << r << dendl;
- return r;
- }
- r = object_map->get_all_xattrs(oid, &omap_attrs);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
- }
- r = object_map->remove_xattrs(oid, omap_attrs, &spos);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not remove omap_attrs r = " << r << dendl;
- return r;
- }
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __func__ << " could not get index r = " << r << dendl;
+ return r;
+ }
+ r = object_map->get_all_xattrs(oid, &omap_attrs);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
}
+ r = object_map->remove_xattrs(oid, omap_attrs, &spos);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not remove omap_attrs r = " << r << dendl;
+ return r;
+ }
+ if (r == -ENOENT)
+ r = 0;
out:
dout(10) << "rmattrs " << cid << "/" << oid << " = " << r << dendl;
return r;
@@ -3686,14 +3812,14 @@ int FileStore::_collection_remove_recursive(const coll_t &cid,
return r;
}
- vector<hobject_t> objects;
- hobject_t max;
+ vector<ghobject_t> objects;
+ ghobject_t max;
r = 0;
while (!max.is_max()) {
r = collection_list_partial(cid, max, 200, 300, 0, &objects, &max);
if (r < 0)
return r;
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
assert(_check_replay_guard(cid, *i, spos));
@@ -3765,7 +3891,7 @@ int FileStore::collection_version_current(coll_t c, uint32_t *version)
if (r < 0)
return r;
*version = index->collection_version();
- if (*version == on_disk_version)
+ if (*version == target_version)
return 1;
else
return 0;
@@ -3787,8 +3913,9 @@ int FileStore::list_collections(vector<coll_t>& ls)
return r;
}
- struct dirent sde, *de;
- while ((r = ::readdir_r(dir, &sde, &de)) == 0) {
+ char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
+ struct dirent *de;
+ while ((r = ::readdir_r(dir, (struct dirent *)&buf, &de)) == 0) {
if (!de)
break;
if (de->d_type == DT_UNKNOWN) {
@@ -3857,9 +3984,9 @@ bool FileStore::collection_empty(coll_t c)
int r = get_index(c, &index);
if (r < 0)
return false;
- vector<hobject_t> ls;
+ vector<ghobject_t> ls;
collection_list_handle_t handle;
- r = index->collection_list_partial(hobject_t(), 1, 1, 0, &ls, NULL);
+ r = index->collection_list_partial(ghobject_t(), 1, 1, 0, &ls, NULL);
if (r < 0) {
assert(!m_filestore_fail_eio || r != -EIO);
return false;
@@ -3867,14 +3994,14 @@ bool FileStore::collection_empty(coll_t c)
return ls.empty();
}
-int FileStore::collection_list_range(coll_t c, hobject_t start, hobject_t end,
- snapid_t seq, vector<hobject_t> *ls)
+int FileStore::collection_list_range(coll_t c, ghobject_t start, ghobject_t end,
+ snapid_t seq, vector<ghobject_t> *ls)
{
bool done = false;
- hobject_t next = start;
+ ghobject_t next = start;
while (!done) {
- vector<hobject_t> next_objects;
+ vector<ghobject_t> next_objects;
int r = collection_list_partial(c, next,
get_ideal_list_min(), get_ideal_list_max(),
seq, &next_objects, &next);
@@ -3901,10 +4028,11 @@ int FileStore::collection_list_range(coll_t c, hobject_t start, hobject_t end,
return 0;
}
-int FileStore::collection_list_partial(coll_t c, hobject_t start,
+int FileStore::collection_list_partial(coll_t c, ghobject_t start,
int min, int max, snapid_t seq,
- vector<hobject_t> *ls, hobject_t *next)
+ vector<ghobject_t> *ls, ghobject_t *next)
{
+ dout(10) << "collection_list_partial: " << c << dendl;
Index index;
int r = get_index(c, &index);
if (r < 0)
@@ -3916,10 +4044,12 @@ int FileStore::collection_list_partial(coll_t c, hobject_t start,
assert(!m_filestore_fail_eio || r != -EIO);
return r;
}
+ if (ls)
+ dout(20) << "objects: " << *ls << dendl;
return 0;
}
-int FileStore::collection_list(coll_t c, vector<hobject_t>& ls)
+int FileStore::collection_list(coll_t c, vector<ghobject_t>& ls)
{
Index index;
int r = get_index(c, &index);
@@ -3930,7 +4060,7 @@ int FileStore::collection_list(coll_t c, vector<hobject_t>& ls)
return r;
}
-int FileStore::omap_get(coll_t c, const hobject_t &hoid,
+int FileStore::omap_get(coll_t c, const ghobject_t &hoid,
bufferlist *header,
map<string, bufferlist> *out)
{
@@ -3949,7 +4079,7 @@ int FileStore::omap_get(coll_t c, const hobject_t &hoid,
int FileStore::omap_get_header(
coll_t c,
- const hobject_t &hoid,
+ const ghobject_t &hoid,
bufferlist *bl,
bool allow_eio)
{
@@ -3966,7 +4096,7 @@ int FileStore::omap_get_header(
return 0;
}
-int FileStore::omap_get_keys(coll_t c, const hobject_t &hoid, set<string> *keys)
+int FileStore::omap_get_keys(coll_t c, const ghobject_t &hoid, set<string> *keys)
{
dout(15) << __func__ << " " << c << "/" << hoid << dendl;
IndexedPath path;
@@ -3981,7 +4111,7 @@ int FileStore::omap_get_keys(coll_t c, const hobject_t &hoid, set<string> *keys)
return 0;
}
-int FileStore::omap_get_values(coll_t c, const hobject_t &hoid,
+int FileStore::omap_get_values(coll_t c, const ghobject_t &hoid,
const set<string> &keys,
map<string, bufferlist> *out)
{
@@ -3998,7 +4128,7 @@ int FileStore::omap_get_values(coll_t c, const hobject_t &hoid,
return 0;
}
-int FileStore::omap_check_keys(coll_t c, const hobject_t &hoid,
+int FileStore::omap_check_keys(coll_t c, const ghobject_t &hoid,
const set<string> &keys,
set<string> *out)
{
@@ -4016,7 +4146,7 @@ int FileStore::omap_check_keys(coll_t c, const hobject_t &hoid,
}
ObjectMap::ObjectMapIterator FileStore::get_omap_iterator(coll_t c,
- const hobject_t &hoid)
+ const ghobject_t &hoid)
{
dout(15) << __func__ << " " << c << "/" << hoid << dendl;
IndexedPath path;
@@ -4087,8 +4217,8 @@ int FileStore::_destroy_collection(coll_t c)
}
-int FileStore::_collection_add(coll_t c, coll_t oldcid, const hobject_t& o,
- const SequencerPosition& spos)
+int FileStore::_collection_add(coll_t c, coll_t oldcid, const ghobject_t& o,
+ const SequencerPosition& spos)
{
dout(15) << "collection_add " << c << "/" << o << " from " << oldcid << "/" << o << dendl;
@@ -4118,7 +4248,7 @@ int FileStore::_collection_add(coll_t c, coll_t oldcid, const hobject_t& o,
_set_replay_guard(**fd, spos, &o, true);
}
- r = lfn_link(oldcid, c, o);
+ r = lfn_link(oldcid, c, o, o);
if (replaying && !backend->can_checkpoint() &&
r == -EEXIST) // crashed between link() and set_replay_guard()
r = 0;
@@ -4135,6 +4265,73 @@ int FileStore::_collection_add(coll_t c, coll_t oldcid, const hobject_t& o,
return r;
}
+/**
+ * Move an object to a (possibly different) collection under a new name.
+ *
+ * Links oldcid/oldoid to c/o, clones the omap content from oldoid to o and
+ * finally unlinks the source.  Replay guards on both the destination and the
+ * source are consulted first so a journal replay neither repeats the move nor
+ * clobbers a newer guard.
+ *
+ * @param oldcid  source collection
+ * @param oldoid  source object
+ * @param c       destination collection
+ * @param o       destination object
+ * @param spos    sequencer position of this operation (used for the guards)
+ * @return 0 on success or when the guards say there is nothing to do,
+ *         otherwise the first negative error from link, omap clone or unlink
+ */
+int FileStore::_collection_move_rename(coll_t oldcid, const ghobject_t& oldoid,
+                                       coll_t c, const ghobject_t& o,
+                                       const SequencerPosition& spos)
+{
+  dout(15) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" << oldoid << dendl;
+  int r = 0;
+  int dstcmp, srccmp;
+
+  dstcmp = _check_replay_guard(c, o, spos);
+  if (dstcmp < 0)
+    goto out_rm_src;
+
+  // check the src name too; it might have a newer guard, and we don't
+  // want to clobber it
+  srccmp = _check_replay_guard(oldcid, oldoid, spos);
+  if (srccmp < 0)
+    return 0;
+
+  {
+    // open guard on object so we don't replay any previous operations on
+    // the new name that will modify the source inode.
+    FDRef fd;
+    r = lfn_open(oldcid, oldoid, 0, &fd);
+    if (r < 0) {
+      // the source collection/object does not exist. If we are replaying, we
+      // should be safe, so just return 0 and move on.
+      assert(replaying);
+      dout(10) << __func__ << " " << c << "/" << o << " from "
+               << oldcid << "/" << oldoid << " (dne, continue replay) " << dendl;
+      return 0;
+    }
+    if (dstcmp > 0) {      // if dstcmp == 0 the guard already says "in-progress"
+      _set_replay_guard(**fd, spos, &o, true);
+    }
+
+    r = lfn_link(oldcid, c, oldoid, o);
+    if (replaying && !backend->can_checkpoint() &&
+        r == -EEXIST)    // crashed between link() and set_replay_guard()
+      r = 0;
+
+    _inject_failure();
+
+    // Only touch the omap when the link succeeded; otherwise the link error
+    // would be overwritten by the clone result and silently lost.
+    if (r == 0) {
+      // the name changed; link the omap content
+      r = object_map->clone(oldoid, o, &spos);
+      if (r == -ENOENT)
+        r = 0;
+    }
+
+    _inject_failure();
+
+    // close guard on object so we don't do this again
+    if (r == 0) {
+      _close_replay_guard(**fd, spos);
+    }
+    lfn_close(fd);
+  }
+
+ out_rm_src:
+  // remove source.  r is still 0 when we arrive here via the goto; after a
+  // failed link/clone we must neither drop the source nor mask the error.
+  if (r == 0 && _check_replay_guard(oldcid, oldoid, spos) > 0) {
+    r = lfn_unlink(oldcid, oldoid, spos, true);
+  }
+
+  dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" << oldoid
+           << " = " << r << dendl;
+  return r;
+}
+
void FileStore::_inject_failure()
{
if (m_filestore_kill_at.read()) {
@@ -4148,7 +4345,7 @@ void FileStore::_inject_failure()
}
}
-int FileStore::_omap_clear(coll_t cid, const hobject_t &hoid,
+int FileStore::_omap_clear(coll_t cid, const ghobject_t &hoid,
const SequencerPosition &spos) {
dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
IndexedPath path;
@@ -4161,7 +4358,7 @@ int FileStore::_omap_clear(coll_t cid, const hobject_t &hoid,
return 0;
}
-int FileStore::_omap_setkeys(coll_t cid, const hobject_t &hoid,
+int FileStore::_omap_setkeys(coll_t cid, const ghobject_t &hoid,
const map<string, bufferlist> &aset,
const SequencerPosition &spos) {
dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
@@ -4172,7 +4369,7 @@ int FileStore::_omap_setkeys(coll_t cid, const hobject_t &hoid,
return object_map->set_keys(hoid, aset, &spos);
}
-int FileStore::_omap_rmkeys(coll_t cid, const hobject_t &hoid,
+int FileStore::_omap_rmkeys(coll_t cid, const ghobject_t &hoid,
const set<string> &keys,
const SequencerPosition &spos) {
dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
@@ -4186,7 +4383,7 @@ int FileStore::_omap_rmkeys(coll_t cid, const hobject_t &hoid,
return 0;
}
-int FileStore::_omap_rmkeyrange(coll_t cid, const hobject_t &hoid,
+int FileStore::_omap_rmkeyrange(coll_t cid, const ghobject_t &hoid,
const string& first, const string& last,
const SequencerPosition &spos) {
dout(15) << __func__ << " " << cid << "/" << hoid << " [" << first << "," << last << "]" << dendl;
@@ -4203,7 +4400,7 @@ int FileStore::_omap_rmkeyrange(coll_t cid, const hobject_t &hoid,
return _omap_rmkeys(cid, hoid, keys, spos);
}
-int FileStore::_omap_setheader(coll_t cid, const hobject_t &hoid,
+int FileStore::_omap_setheader(coll_t cid, const ghobject_t &hoid,
const bufferlist &bl,
const SequencerPosition &spos)
{
@@ -4263,8 +4460,8 @@ int FileStore::_split_collection(coll_t cid,
_close_replay_guard(dest, spos);
}
if (g_conf->filestore_debug_verify_split) {
- vector<hobject_t> objects;
- hobject_t next;
+ vector<ghobject_t> objects;
+ ghobject_t next;
while (1) {
collection_list_partial(
cid,
@@ -4274,7 +4471,7 @@ int FileStore::_split_collection(coll_t cid,
&next);
if (objects.empty())
break;
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
dout(20) << __func__ << ": " << *i << " still in source "
@@ -4283,7 +4480,7 @@ int FileStore::_split_collection(coll_t cid,
}
objects.clear();
}
- next = hobject_t();
+ next = ghobject_t();
while (1) {
collection_list_partial(
dest,
@@ -4293,7 +4490,7 @@ int FileStore::_split_collection(coll_t cid,
&next);
if (objects.empty())
break;
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
dout(20) << __func__ << ": " << *i << " now in dest "
@@ -4359,6 +4556,8 @@ const char** FileStore::get_tracked_conf_keys() const
"filestore_kill_at",
"filestore_fail_eio",
"filestore_replica_fadvise",
+ "filestore_sloppy_crc",
+ "filestore_sloppy_crc_block_size",
NULL
};
return KEYS;
@@ -4367,6 +4566,17 @@ const char** FileStore::get_tracked_conf_keys() const
void FileStore::handle_conf_change(const struct md_config_t *conf,
const std::set <std::string> &changed)
{
+ if (changed.count("filestore_max_inline_xattr_size") ||
+ changed.count("filestore_max_inline_xattr_size_xfs") ||
+ changed.count("filestore_max_inline_xattr_size_btrfs") ||
+ changed.count("filestore_max_inline_xattr_size_other") ||
+ changed.count("filestore_max_inline_xattrs") ||
+ changed.count("filestore_max_inline_xattrs_xfs") ||
+ changed.count("filestore_max_inline_xattrs_btrfs") ||
+ changed.count("filestore_max_inline_xattrs_other")) {
+ Mutex::Locker l(lock);
+ set_xattr_limits_via_conf();
+ }
if (changed.count("filestore_min_sync_interval") ||
changed.count("filestore_max_sync_interval") ||
changed.count("filestore_queue_max_ops") ||
@@ -4375,6 +4585,8 @@ void FileStore::handle_conf_change(const struct md_config_t *conf,
changed.count("filestore_queue_committing_max_bytes") ||
changed.count("filestore_kill_at") ||
changed.count("filestore_fail_eio") ||
+ changed.count("filestore_sloppy_crc") ||
+ changed.count("filestore_sloppy_crc_block_size") ||
changed.count("filestore_replica_fadvise")) {
Mutex::Locker l(lock);
m_filestore_min_sync_interval = conf->filestore_min_sync_interval;
@@ -4386,6 +4598,8 @@ void FileStore::handle_conf_change(const struct md_config_t *conf,
m_filestore_kill_at.set(conf->filestore_kill_at);
m_filestore_fail_eio = conf->filestore_fail_eio;
m_filestore_replica_fadvise = conf->filestore_replica_fadvise;
+ m_filestore_sloppy_crc = conf->filestore_sloppy_crc;
+ m_filestore_sloppy_crc_block_size = conf->filestore_sloppy_crc_block_size;
}
if (changed.count("filestore_commit_timeout")) {
Mutex::Locker l(sync_entry_timeo_lock);
@@ -4441,3 +4655,77 @@ void FileStore::dump_transactions(list<ObjectStore::Transaction*>& ls, uint64_t
m_filestore_dump_fmt.flush(m_filestore_dump);
m_filestore_dump.flush();
}
+
+/**
+ * Recompute m_filestore_max_inline_xattr_size and
+ * m_filestore_max_inline_xattrs from the current configuration.
+ *
+ * The per-filesystem options (_xfs, _btrfs, _other) selected by m_fs_type
+ * supply the defaults; a non-zero filestore_max_inline_xattr_size or
+ * filestore_max_inline_xattrs overrides them.  m_fs_type must already be
+ * known (not FS_TYPE_NONE).
+ */
+void FileStore::set_xattr_limits_via_conf()
+{
+  // Zero-initialised so the values are well defined even if the assert in
+  // the default branch below is ever compiled out.
+  uint32_t fs_xattr_size = 0;
+  uint32_t fs_xattrs = 0;
+
+  assert(m_fs_type != FS_TYPE_NONE);
+
+  switch(m_fs_type) {
+  case FS_TYPE_XFS:
+    fs_xattr_size = g_conf->filestore_max_inline_xattr_size_xfs;
+    fs_xattrs = g_conf->filestore_max_inline_xattrs_xfs;
+    break;
+  case FS_TYPE_BTRFS:
+    fs_xattr_size = g_conf->filestore_max_inline_xattr_size_btrfs;
+    fs_xattrs = g_conf->filestore_max_inline_xattrs_btrfs;
+    break;
+  case FS_TYPE_ZFS:   // ZFS shares the "other" limits
+  case FS_TYPE_OTHER:
+    fs_xattr_size = g_conf->filestore_max_inline_xattr_size_other;
+    fs_xattrs = g_conf->filestore_max_inline_xattrs_other;
+    break;
+  default:
+    assert(!"Unknown fs type");
+  }
+
+  //Use override value if set
+  if (g_conf->filestore_max_inline_xattr_size)
+    m_filestore_max_inline_xattr_size = g_conf->filestore_max_inline_xattr_size;
+  else
+    m_filestore_max_inline_xattr_size = fs_xattr_size;
+
+  //Use override value if set
+  if (g_conf->filestore_max_inline_xattrs)
+    m_filestore_max_inline_xattrs = g_conf->filestore_max_inline_xattrs;
+  else
+    m_filestore_max_inline_xattrs = fs_xattrs;
+}
+
+// -- FSSuperblock --
+
+// Append the encoded superblock to bl.  The only payload is compat_features,
+// wrapped between ENCODE_START(1, 1, ...) and ENCODE_FINISH (presumably the
+// usual versioned envelope: version 1, compat version 1 -- confirm against
+// the encoding macros).
+void FSSuperblock::encode(bufferlist &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ compat_features.encode(bl);
+ ENCODE_FINISH(bl);
+}
+
+// Counterpart of encode(): read compat_features back from the iterator bl,
+// inside a DECODE_START(1, ...) / DECODE_FINISH pair.
+void FSSuperblock::decode(bufferlist::iterator &bl)
+{
+ DECODE_START(1, bl);
+ compat_features.decode(bl);
+ DECODE_FINISH(bl);
+}
+
+// Dump the superblock through the Formatter f: compat_features is emitted
+// inside an object section named "compat".
+void FSSuperblock::dump(Formatter *f) const
+{
+ f->open_object_section("compat");
+ compat_features.dump(f);
+ f->close_section();
+}
+
+// Append two heap-allocated sample superblocks to o: a default-constructed
+// one, followed by one whose incompat feature set contains
+// CEPH_FS_FEATURE_INCOMPAT_SHARDS.  The list receives raw pointers.
+void FSSuperblock::generate_test_instances(list<FSSuperblock*>& o)
+{
+  FSSuperblock sb;
+
+  // first instance: untouched default superblock
+  o.push_back(new FSSuperblock(sb));
+
+  // second instance: empty compat / ro_compat sets, SHARDS marked incompat
+  CompatSet::FeatureSet compat;
+  CompatSet::FeatureSet ro_compat;
+  CompatSet::FeatureSet incompat;
+  incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
+  sb.compat_features = CompatSet(compat, ro_compat, incompat);
+  o.push_back(new FSSuperblock(sb));
+}
diff --git a/src/os/FileStore.h b/src/os/FileStore.h
index c603949b399..c489fdd5796 100644
--- a/src/os/FileStore.h
+++ b/src/os/FileStore.h
@@ -64,8 +64,36 @@ static const __SWORD_TYPE XFS_SUPER_MAGIC(0x58465342);
static const __SWORD_TYPE ZFS_SUPER_MAGIC(0x2fc12fc1);
#endif
+// Kind of filesystem backing the FileStore (stored in FileStore::m_fs_type).
+// set_xattr_limits_via_conf() uses it to choose the per-filesystem inline
+// xattr limits and asserts that the value is not FS_TYPE_NONE.
+enum fs_types {
+ FS_TYPE_NONE = 0,
+ FS_TYPE_XFS,
+ FS_TYPE_BTRFS,
+ FS_TYPE_ZFS,
+ FS_TYPE_OTHER
+};
+
class FileStoreBackend;
+#define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects")
+
+// Superblock of a FileStore.  Its only state is the CompatSet of features
+// (e.g. CEPH_FS_FEATURE_INCOMPAT_SHARDS); FileStore keeps one as its
+// `superblock` member and declares write_superblock()/read_superblock()
+// to persist and reload it.
+class FSSuperblock {
+public:
+ CompatSet compat_features;
+
+ FSSuperblock() { }
+
+ // encode/decode: (de)serialize compat_features to/from a bufferlist
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::iterator &bl);
+ // dump: emit compat_features through a Formatter
+ void dump(Formatter *f) const;
+ // append sample instances to o
+ static void generate_test_instances(list<FSSuperblock*>& o);
+};
+WRITE_CLASS_ENCODER(FSSuperblock)
+
+// Stream a superblock as "sb(<compat_features>)".
+inline ostream& operator<<(ostream& out, const FSSuperblock& sb)
+{
+  out << "sb(" << sb.compat_features << ")";
+  return out;
+}
+
class FileStore : public JournalingObjectStore,
public md_config_obs_t
{
@@ -89,7 +117,7 @@ public:
return perf_tracker.get_cur_stats();
}
- static const uint32_t on_disk_version = 3;
+ static const uint32_t target_version = 3;
private:
string internal_name; ///< internal name, used to name the perfcounter instance
string basedir, journalpath;
@@ -281,25 +309,27 @@ private:
void op_queue_release_throttle(Op *o);
void _journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk);
friend struct C_JournaledAhead;
+ int write_version_stamp();
int open_journal();
PerfCounters *logger;
public:
- int lfn_find(coll_t cid, const hobject_t& oid, IndexedPath *path);
- int lfn_truncate(coll_t cid, const hobject_t& oid, off_t length);
- int lfn_stat(coll_t cid, const hobject_t& oid, struct stat *buf);
+ int lfn_find(coll_t cid, const ghobject_t& oid, IndexedPath *path);
+ int lfn_truncate(coll_t cid, const ghobject_t& oid, off_t length);
+ int lfn_stat(coll_t cid, const ghobject_t& oid, struct stat *buf);
int lfn_open(
coll_t cid,
- const hobject_t& oid,
+ const ghobject_t& oid,
bool create,
FDRef *outfd,
IndexedPath *path = 0,
Index *index = 0);
void lfn_close(FDRef fd);
- int lfn_link(coll_t c, coll_t cid, const hobject_t& o) ;
- int lfn_unlink(coll_t cid, const hobject_t& o, const SequencerPosition &spos);
+ int lfn_link(coll_t c, coll_t newcid, const ghobject_t& o, const ghobject_t& newoid) ;
+ int lfn_unlink(coll_t cid, const ghobject_t& o, const SequencerPosition &spos,
+ bool force_clear_omap=false);
public:
FileStore(const std::string &base, const std::string &jdev, const char *internal_name = "filestore", bool update_to=false);
@@ -309,7 +339,6 @@ public:
int _sanity_check_fs();
bool test_mount_in_use();
- int write_version_stamp();
int version_stamp_is_valid(uint32_t *version);
int update_version_stamp();
int read_op_seq(uint64_t *seq);
@@ -320,6 +349,22 @@ public:
int mkfs();
int mkjournal();
+ /**
+ * set_allow_sharded_objects()
+ *
+ * Before sharded ghobject_t can be specified this function must be called
+ *
+ * Once this function is called the FileStore is not mountable by prior releases
+ */
+ void set_allow_sharded_objects();
+
+ /**
+ * get_allow_sharded_objects()
+ *
+ * return value: true if set_allow_sharded_objects() called, otherwise false
+ */
+ bool get_allow_sharded_objects();
+
int statfs(struct statfs *buf);
int _do_transactions(
@@ -328,7 +373,9 @@ public:
int do_transactions(list<Transaction*> &tls, uint64_t op_seq) {
return _do_transactions(tls, op_seq, 0);
}
- unsigned _do_transaction(Transaction& t, uint64_t op_seq, int trans_num);
+ unsigned _do_transaction(
+ Transaction& t, uint64_t op_seq, int trans_num,
+ ThreadPool::TPHandle *handle);
int queue_transactions(Sequencer *osr, list<Transaction*>& tls,
TrackedOpRef op = TrackedOpRef());
@@ -344,7 +391,7 @@ public:
*/
void _set_replay_guard(int fd,
const SequencerPosition& spos,
- const hobject_t *hoid=0,
+ const ghobject_t *oid=0,
bool in_progress=false);
void _set_replay_guard(coll_t cid,
const SequencerPosition& spos,
@@ -374,42 +421,42 @@ public:
*/
int _check_replay_guard(int fd, const SequencerPosition& spos);
int _check_replay_guard(coll_t cid, const SequencerPosition& spos);
- int _check_replay_guard(coll_t cid, hobject_t oid, const SequencerPosition& pos);
+ int _check_replay_guard(coll_t cid, ghobject_t oid, const SequencerPosition& pos);
int _check_global_replay_guard(coll_t cid, const SequencerPosition& spos);
// ------------------
// objects
- int pick_object_revision_lt(hobject_t& oid) {
+ int pick_object_revision_lt(ghobject_t& oid) {
return 0;
}
- bool exists(coll_t cid, const hobject_t& oid);
+ bool exists(coll_t cid, const ghobject_t& oid);
int stat(
coll_t cid,
- const hobject_t& oid,
+ const ghobject_t& oid,
struct stat *st,
bool allow_eio = false);
int read(
coll_t cid,
- const hobject_t& oid,
+ const ghobject_t& oid,
uint64_t offset,
size_t len,
bufferlist& bl,
bool allow_eio = false);
- int fiemap(coll_t cid, const hobject_t& oid, uint64_t offset, size_t len, bufferlist& bl);
+ int fiemap(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl);
- int _touch(coll_t cid, const hobject_t& oid);
- int _write(coll_t cid, const hobject_t& oid, uint64_t offset, size_t len, const bufferlist& bl,
+ int _touch(coll_t cid, const ghobject_t& oid);
+ int _write(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, const bufferlist& bl,
bool replica = false);
- int _zero(coll_t cid, const hobject_t& oid, uint64_t offset, size_t len);
- int _truncate(coll_t cid, const hobject_t& oid, uint64_t size);
- int _clone(coll_t cid, const hobject_t& oldoid, const hobject_t& newoid,
+ int _zero(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len);
+ int _truncate(coll_t cid, const ghobject_t& oid, uint64_t size);
+ int _clone(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid,
const SequencerPosition& spos);
- int _clone_range(coll_t cid, const hobject_t& oldoid, const hobject_t& newoid,
+ int _clone_range(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid,
uint64_t srcoff, uint64_t len, uint64_t dstoff,
const SequencerPosition& spos);
int _do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
int _do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
- int _remove(coll_t cid, const hobject_t& oid, const SequencerPosition &spos);
+ int _remove(coll_t cid, const ghobject_t& oid, const SequencerPosition &spos);
int _fgetattr(int fd, const char *name, bufferptr& bp);
int _fgetattrs(int fd, map<string,bufferptr>& aset, bool user_only);
@@ -433,25 +480,25 @@ public:
// DEBUG read error injection, an object is removed from both on delete()
Mutex read_error_lock;
- set<hobject_t> data_error_set; // read() will return -EIO
- set<hobject_t> mdata_error_set; // getattr(),stat() will return -EIO
- void inject_data_error(const hobject_t &oid);
- void inject_mdata_error(const hobject_t &oid);
- void debug_obj_on_delete(const hobject_t &oid);
- bool debug_data_eio(const hobject_t &oid);
- bool debug_mdata_eio(const hobject_t &oid);
+ set<ghobject_t> data_error_set; // read() will return -EIO
+ set<ghobject_t> mdata_error_set; // getattr(),stat() will return -EIO
+ void inject_data_error(const ghobject_t &oid);
+ void inject_mdata_error(const ghobject_t &oid);
+ void debug_obj_on_delete(const ghobject_t &oid);
+ bool debug_data_eio(const ghobject_t &oid);
+ bool debug_mdata_eio(const ghobject_t &oid);
int snapshot(const string& name);
// attrs
- int getattr(coll_t cid, const hobject_t& oid, const char *name, bufferptr &bp);
- int getattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>& aset, bool user_only = false);
+ int getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferptr &bp);
+ int getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset, bool user_only = false);
- int _setattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>& aset,
+ int _setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset,
const SequencerPosition &spos);
- int _rmattr(coll_t cid, const hobject_t& oid, const char *name,
+ int _rmattr(coll_t cid, const ghobject_t& oid, const char *name,
const SequencerPosition &spos);
- int _rmattrs(coll_t cid, const hobject_t& oid,
+ int _rmattrs(coll_t cid, const ghobject_t& oid,
const SequencerPosition &spos);
int collection_getattr(coll_t c, const char *name, void *value, size_t size);
@@ -472,33 +519,36 @@ public:
int collection_stat(coll_t c, struct stat *st);
bool collection_exists(coll_t c);
bool collection_empty(coll_t c);
- int collection_list(coll_t c, vector<hobject_t>& o);
- int collection_list_partial(coll_t c, hobject_t start,
+ int collection_list(coll_t c, vector<ghobject_t>& oid);
+ int collection_list_partial(coll_t c, ghobject_t start,
int min, int max, snapid_t snap,
- vector<hobject_t> *ls, hobject_t *next);
- int collection_list_range(coll_t c, hobject_t start, hobject_t end,
- snapid_t seq, vector<hobject_t> *ls);
+ vector<ghobject_t> *ls, ghobject_t *next);
+ int collection_list_range(coll_t c, ghobject_t start, ghobject_t end,
+ snapid_t seq, vector<ghobject_t> *ls);
// omap (see ObjectStore.h for documentation)
- int omap_get(coll_t c, const hobject_t &hoid, bufferlist *header,
+ int omap_get(coll_t c, const ghobject_t &oid, bufferlist *header,
map<string, bufferlist> *out);
int omap_get_header(
coll_t c,
- const hobject_t &hoid,
+ const ghobject_t &oid,
bufferlist *out,
bool allow_eio = false);
- int omap_get_keys(coll_t c, const hobject_t &hoid, set<string> *keys);
- int omap_get_values(coll_t c, const hobject_t &hoid, const set<string> &keys,
+ int omap_get_keys(coll_t c, const ghobject_t &oid, set<string> *keys);
+ int omap_get_values(coll_t c, const ghobject_t &oid, const set<string> &keys,
map<string, bufferlist> *out);
- int omap_check_keys(coll_t c, const hobject_t &hoid, const set<string> &keys,
+ int omap_check_keys(coll_t c, const ghobject_t &oid, const set<string> &keys,
set<string> *out);
- ObjectMap::ObjectMapIterator get_omap_iterator(coll_t c, const hobject_t &hoid);
+ ObjectMap::ObjectMapIterator get_omap_iterator(coll_t c, const ghobject_t &oid);
int _create_collection(coll_t c);
int _create_collection(coll_t c, const SequencerPosition &spos);
int _destroy_collection(coll_t c);
- int _collection_add(coll_t c, coll_t ocid, const hobject_t& o,
+ int _collection_add(coll_t c, coll_t ocid, const ghobject_t& oid,
const SequencerPosition& spos);
+ int _collection_move_rename(coll_t oldcid, const ghobject_t& oldoid,
+ coll_t c, const ghobject_t& o,
+ const SequencerPosition& spos);
void dump_start(const std::string& file);
void dump_stop();
void dump_transactions(list<ObjectStore::Transaction*>& ls, uint64_t seq, OpSequencer *osr);
@@ -507,17 +557,17 @@ private:
void _inject_failure();
// omap
- int _omap_clear(coll_t cid, const hobject_t &hoid,
+ int _omap_clear(coll_t cid, const ghobject_t &oid,
const SequencerPosition &spos);
- int _omap_setkeys(coll_t cid, const hobject_t &hoid,
+ int _omap_setkeys(coll_t cid, const ghobject_t &oid,
const map<string, bufferlist> &aset,
const SequencerPosition &spos);
- int _omap_rmkeys(coll_t cid, const hobject_t &hoid, const set<string> &keys,
+ int _omap_rmkeys(coll_t cid, const ghobject_t &oid, const set<string> &keys,
const SequencerPosition &spos);
- int _omap_rmkeyrange(coll_t cid, const hobject_t &hoid,
+ int _omap_rmkeyrange(coll_t cid, const ghobject_t &oid,
const string& first, const string& last,
const SequencerPosition &spos);
- int _omap_setheader(coll_t cid, const hobject_t &hoid, const bufferlist &bl,
+ int _omap_setheader(coll_t cid, const ghobject_t &oid, const bufferlist &bl,
const SequencerPosition &spos);
int _split_collection(coll_t cid, uint32_t bits, uint32_t rem, coll_t dest,
const SequencerPosition &spos);
@@ -549,6 +599,34 @@ private:
std::ofstream m_filestore_dump;
JSONFormatter m_filestore_dump_fmt;
atomic_t m_filestore_kill_at;
+ bool m_filestore_sloppy_crc;
+ int m_filestore_sloppy_crc_block_size;
+ enum fs_types m_fs_type;
+
+ //Determined xattr handling based on fs type
+ void set_xattr_limits_via_conf();
+ uint32_t m_filestore_max_inline_xattr_size;
+ uint32_t m_filestore_max_inline_xattrs;
+
+ FSSuperblock superblock;
+
+ /**
+ * write_superblock()
+ *
+ * Write superblock to persistent storage
+ *
+ * return value: 0 on success, otherwise negative errno
+ */
+ int write_superblock();
+
+ /**
+ * read_superblock()
+ *
+ * Fill in FileStore::superblock by reading persistent storage
+ *
+ * return value: 0 on success, otherwise negative errno
+ */
+ int read_superblock();
friend class FileStoreBackend;
};
@@ -582,6 +660,9 @@ protected:
int _copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) {
return filestore->_do_copy_range(from, to, srcoff, len, dstoff);
}
+ int get_crc_block_size() {
+ return filestore->m_filestore_sloppy_crc_block_size;
+ }
public:
FileStoreBackend(FileStore *fs) : filestore(fs) {}
virtual ~FileStoreBackend() {};
@@ -597,6 +678,15 @@ public:
virtual bool has_fiemap() = 0;
virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) = 0;
virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) = 0;
+
+ // hooks for (sloppy) crc tracking
+ virtual int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) = 0;
+ virtual int _crc_update_truncate(int fd, loff_t off) = 0;
+ virtual int _crc_update_zero(int fd, loff_t off, size_t len) = 0;
+ virtual int _crc_update_clone_range(int srcfd, int destfd,
+ loff_t srcoff, size_t len, loff_t dstoff) = 0;
+ virtual int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
+ ostream *out) = 0;
};
#endif
diff --git a/src/os/FlatIndex.cc b/src/os/FlatIndex.cc
index f4a5ce3ab7d..d4644abc627 100644
--- a/src/os/FlatIndex.cc
+++ b/src/os/FlatIndex.cc
@@ -134,18 +134,18 @@ static void lfn_translate(const char *path, const char *name, char *new_name, in
return;
}
-static int append_oname(const hobject_t &oid, char *s, int len)
+static int append_oname(const ghobject_t &oid, char *s, int len)
{
//assert(sizeof(oid) == 28);
char *end = s + len;
char *t = s + strlen(s);
- const char *i = oid.oid.name.c_str();
+ const char *i = oid.hobj.oid.name.c_str();
while (*i && t < end) {
if (*i == '\\') {
*t++ = '\\';
*t++ = '\\';
- } else if (*i == '.' && i == oid.oid.name.c_str()) { // only escape leading .
+ } else if (*i == '.' && i == oid.hobj.oid.name.c_str()) { // only escape leading .
*t++ = '\\';
*t++ = '.';
} else if (*i == '/') {
@@ -158,17 +158,17 @@ static int append_oname(const hobject_t &oid, char *s, int len)
int size = t - s;
- if (oid.snap == CEPH_NOSNAP)
+ if (oid.hobj.snap == CEPH_NOSNAP)
size += snprintf(t, end - t, "_head");
- else if (oid.snap == CEPH_SNAPDIR)
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
size += snprintf(t, end - t, "_snapdir");
else
- size += snprintf(t, end - t, "_%llx", (long long unsigned)oid.snap);
+ size += snprintf(t, end - t, "_%llx", (long long unsigned)oid.hobj.snap);
return size;
}
-static bool parse_object(char *s, hobject_t& oid)
+static bool parse_object(char *s, ghobject_t& oid)
{
sobject_t o;
char *bar = s + strlen(s) - 1;
@@ -201,13 +201,13 @@ static bool parse_object(char *s, hobject_t& oid)
o.snap = CEPH_SNAPDIR;
else
o.snap = strtoull(bar+1, &s, 16);
- oid = hobject_t(o);
+ oid = ghobject_t(hobject_t(o));
return true;
}
return false;
}
-static int lfn_get(const char *coll_path, const hobject_t& oid, char *pathname, int len, char *lfn, int lfn_len, int *exist, int *is_lfn)
+static int lfn_get(const char *coll_path, const ghobject_t& oid, char *pathname, int len, char *lfn, int lfn_len, int *exist, int *is_lfn)
{
int i = 0;
strncpy(pathname, coll_path, len);
@@ -277,7 +277,7 @@ int FlatIndex::init() {
return 0;
}
-int FlatIndex::created(const hobject_t &hoid, const char *path) {
+int FlatIndex::created(const ghobject_t &hoid, const char *path) {
char long_name[PATH_MAX];
long_name[0] = '\0';
int actual_len = append_oname(hoid, long_name, sizeof(long_name));
@@ -292,7 +292,7 @@ int FlatIndex::created(const hobject_t &hoid, const char *path) {
return 0;
}
-int FlatIndex::unlink(const hobject_t &o) {
+int FlatIndex::unlink(const ghobject_t &o) {
char long_fn[PATH_MAX];
char short_fn[PATH_MAX];
char short_fn2[PATH_MAX];
@@ -346,7 +346,7 @@ int FlatIndex::unlink(const hobject_t &o) {
return 0;
}
-int FlatIndex::lookup(const hobject_t &hoid, IndexedPath *path, int *exist) {
+int FlatIndex::lookup(const ghobject_t &hoid, IndexedPath *path, int *exist) {
char long_fn[PATH_MAX];
char short_fn[PATH_MAX];
int r;
@@ -361,7 +361,7 @@ int FlatIndex::lookup(const hobject_t &hoid, IndexedPath *path, int *exist) {
}
static int get_hobject_from_oinfo(const char *dir, const char *file,
- hobject_t *o) {
+ ghobject_t *o) {
char path[PATH_MAX];
bufferptr bp(PATH_MAX);
snprintf(path, sizeof(path), "%s/%s", dir, file);
@@ -376,18 +376,19 @@ static int get_hobject_from_oinfo(const char *dir, const char *file,
return 0;
}
-int FlatIndex::collection_list_partial(const hobject_t &start,
+int FlatIndex::collection_list_partial(const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next) {
+ vector<ghobject_t> *ls,
+ ghobject_t *next) {
assert(0); // Should not be called
return 0;
}
-int FlatIndex::collection_list(vector<hobject_t> *ls) {
- char dir_name[PATH_MAX], buf[PATH_MAX], new_name[PATH_MAX];
+int FlatIndex::collection_list(vector<ghobject_t> *ls) {
+ char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
+ char dir_name[PATH_MAX], new_name[PATH_MAX];
strncpy(dir_name, base_path.c_str(), sizeof(dir_name));
dir_name[sizeof(dir_name)-1]='\0';
@@ -396,21 +397,21 @@ int FlatIndex::collection_list(vector<hobject_t> *ls) {
return -errno;
// first, build (ino, object) list
- vector< pair<ino_t,hobject_t> > inolist;
+ vector< pair<ino_t,ghobject_t> > inolist;
struct dirent *de;
- while (::readdir_r(dir, (struct dirent*)buf, &de) == 0) {
+ while (::readdir_r(dir, (struct dirent *)buf, &de) == 0) {
if (!de)
break;
// parse
if (de->d_name[0] == '.')
continue;
//cout << " got object " << de->d_name << std::endl;
- hobject_t o;
+ ghobject_t o;
lfn_translate(dir_name, de->d_name, new_name, sizeof(new_name));
if (parse_object(new_name, o)) {
get_hobject_from_oinfo(dir_name, de->d_name, &o);
- inolist.push_back(pair<ino_t,hobject_t>(de->d_ino, o));
+ inolist.push_back(pair<ino_t,ghobject_t>(de->d_ino, o));
ls->push_back(o);
}
}
@@ -421,7 +422,7 @@ int FlatIndex::collection_list(vector<hobject_t> *ls) {
// build final list
ls->resize(inolist.size());
int i = 0;
- for (vector< pair<ino_t,hobject_t> >::iterator p = inolist.begin(); p != inolist.end(); ++p)
+ for (vector< pair<ino_t,ghobject_t> >::iterator p = inolist.begin(); p != inolist.end(); ++p)
(*ls)[i++].swap(p->second);
::closedir(dir);
diff --git a/src/os/FlatIndex.h b/src/os/FlatIndex.h
index 7a10912dc28..657c273468b 100644
--- a/src/os/FlatIndex.h
+++ b/src/os/FlatIndex.h
@@ -52,35 +52,35 @@ public:
/// @see CollectionIndex
int created(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const char *path
);
/// @see CollectionIndex
int unlink(
- const hobject_t &hoid
+ const ghobject_t &oid
);
/// @see CollectionIndex
int lookup(
- const hobject_t &hoid,
+ const ghobject_t &oid,
IndexedPath *path,
int *exist
);
/// @see CollectionIndex
int collection_list(
- vector<hobject_t> *ls
+ vector<ghobject_t> *ls
);
/// @see CollectionIndex
int collection_list_partial(
- const hobject_t &start,
+ const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next
+ vector<ghobject_t> *ls,
+ ghobject_t *next
);
};
diff --git a/src/os/GenericFileStoreBackend.cc b/src/os/GenericFileStoreBackend.cc
index d0b3df5f6f2..f19ba7d7760 100644
--- a/src/os/GenericFileStoreBackend.cc
+++ b/src/os/GenericFileStoreBackend.cc
@@ -12,7 +12,9 @@
*
*/
-#include <inttypes.h>
+#include "include/int_types.h"
+#include "include/types.h"
+
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
@@ -27,7 +29,6 @@
#include "include/compat.h"
#include "include/linux_fiemap.h"
-#include "include/types.h"
#include <iostream>
#include <fstream>
@@ -39,6 +40,12 @@
#include "common/config.h"
#include "common/sync_filesystem.h"
+#include "common/SloppyCRCMap.h"
+#include "os/chain_xattr.h"
+
+#define SLOPPY_CRC_XATTR "user.cephos.scrc"
+
+
#define dout_subsys ceph_subsys_filestore
#undef dout_prefix
#define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") "
@@ -117,12 +124,12 @@ int GenericFileStoreBackend::detect_features()
dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl;
ioctl_fiemap = true;
}
+ free(fiemap);
}
if (!m_filestore_fiemap) {
dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl;
ioctl_fiemap = false;
}
- free(fiemap);
::unlink(fn);
TEMP_FAILURE_RETRY(::close(fd));
@@ -250,3 +257,110 @@ done_err:
free(fiemap);
return ret;
}
+
+
+/**
+ * Load the sloppy CRC map stored in the SLOPPY_CRC_XATTR xattr of fd.
+ *
+ * @param fd  open file descriptor of the object
+ * @param cm  map to decode into; left untouched when the xattr is absent
+ * @return 0 on success or when no xattr exists (-ENODATA on the first read),
+ *         the negative error returned by chain_fgetxattr when reading fails,
+ *         or -EIO when the stored value cannot be decoded
+ */
+int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm)
+{
+  char buf[100];
+  bufferptr bp;
+  int l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, buf, sizeof(buf));
+  if (l == -ENODATA) {
+    // nothing stored yet: the caller keeps its freshly constructed map
+    return 0;
+  }
+  if (l >= 0) {
+    bp = buffer::create(l);
+    memcpy(bp.c_str(), buf, l);
+  } else if (l == -ERANGE) {
+    // value does not fit into the stack buffer: probe with a zero-length
+    // buffer, then fetch again into a buffer of the reported size
+    l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, 0, 0);
+    if (l > 0) {
+      bp = buffer::create(l);
+      l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, bp.c_str(), l);
+    }
+  }
+  if (l < 0) {
+    // report the real xattr error instead of falling through to a decode of
+    // an empty buffer, which would surface as a misleading -EIO
+    derr << __func__ << " got " << cpp_strerror(l) << dendl;
+    return l;
+  }
+
+  int r = 0;
+  bufferlist bl;
+  bl.append(bp);
+  bufferlist::iterator p = bl.begin();
+  try {
+    ::decode(*cm, p);
+  }
+  catch (buffer::error &e) {
+    r = -EIO;
+  }
+  if (r < 0)
+    derr << __func__ << " got " << cpp_strerror(r) << dendl;
+  return r;
+}
+
+// Encode *cm and store it in the SLOPPY_CRC_XATTR xattr of fd.
+// Returns the result of chain_fsetxattr; negative results are logged.
+int GenericFileStoreBackend::_crc_save(int fd, SloppyCRCMap *cm)
+{
+  bufferlist encoded;
+  ::encode(*cm, encoded);
+
+  int ret = chain_fsetxattr(fd, SLOPPY_CRC_XATTR,
+                            encoded.c_str(), encoded.length());
+  if (ret < 0) {
+    derr << __func__ << " got " << cpp_strerror(ret) << dendl;
+  }
+  return ret;
+}
+
+// Record a write of bl at [off, off+len) in the CRC map of fd: load the map,
+// apply the write, log the map's output at debug level 30, save the map.
+// Returns a negative error from the load or the save, otherwise the save
+// result.
+int GenericFileStoreBackend::_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl)
+{
+  SloppyCRCMap crcs(get_crc_block_size());
+
+  int ret = _crc_load_or_init(fd, &crcs);
+  if (ret < 0) {
+    return ret;
+  }
+
+  ostringstream log;
+  crcs.write(off, len, bl, &log);
+  dout(30) << __func__ << "\n" << log.str() << dendl;
+
+  return _crc_save(fd, &crcs);
+}
+
+// Record a truncate to offset off in the CRC map of fd (load, truncate,
+// save).  Returns a negative error from the load, otherwise the save result.
+int GenericFileStoreBackend::_crc_update_truncate(int fd, loff_t off)
+{
+  SloppyCRCMap crcs(get_crc_block_size());
+
+  int ret = _crc_load_or_init(fd, &crcs);
+  if (ret < 0) {
+    return ret;
+  }
+
+  crcs.truncate(off);
+  return _crc_save(fd, &crcs);
+}
+
+// Record zeroing of [off, off+len) in the CRC map of fd (load, zero, save).
+// Returns a negative error from the load, otherwise the save result.
+int GenericFileStoreBackend::_crc_update_zero(int fd, loff_t off, size_t len)
+{
+  SloppyCRCMap crcs(get_crc_block_size());
+
+  int ret = _crc_load_or_init(fd, &crcs);
+  if (ret < 0) {
+    return ret;
+  }
+
+  crcs.zero(off, len);
+  return _crc_save(fd, &crcs);
+}
+
+// Mirror a clone_range from srcfd to destfd in the CRC maps: load both maps,
+// apply clone_range on the destination map using the source map, log the
+// map's output at debug level 30 and save only the destination map.
+// Returns a negative error from either load, otherwise the save result.
+int GenericFileStoreBackend::_crc_update_clone_range(int srcfd, int destfd,
+                                                     loff_t srcoff, size_t len, loff_t dstoff)
+{
+  SloppyCRCMap src_crcs(get_crc_block_size());
+  SloppyCRCMap dst_crcs(get_crc_block_size());
+
+  int ret = _crc_load_or_init(srcfd, &src_crcs);
+  if (ret < 0) {
+    return ret;
+  }
+  ret = _crc_load_or_init(destfd, &dst_crcs);
+  if (ret < 0) {
+    return ret;
+  }
+
+  ostringstream log;
+  dst_crcs.clone_range(srcoff, len, dstoff, src_crcs, &log);
+  dout(30) << __func__ << "\n" << log.str() << dendl;
+
+  return _crc_save(destfd, &dst_crcs);
+}
+
+// Check data bl read from [off, off+len) of fd against the stored CRC map.
+// Returns a negative error if the map cannot be loaded, otherwise whatever
+// SloppyCRCMap::read() returns; details are written to *out by that call.
+int GenericFileStoreBackend::_crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
+                                              ostream *out)
+{
+  SloppyCRCMap crcs(get_crc_block_size());
+
+  int ret = _crc_load_or_init(fd, &crcs);
+  if (ret < 0) {
+    return ret;
+  }
+
+  return crcs.read(off, len, bl, out);
+}
diff --git a/src/os/GenericFileStoreBackend.h b/src/os/GenericFileStoreBackend.h
index 95aca971708..5a09c2497a8 100644
--- a/src/os/GenericFileStoreBackend.h
+++ b/src/os/GenericFileStoreBackend.h
@@ -17,6 +17,8 @@
#include "FileStore.h"
+class SloppyCRCMap;
+
class GenericFileStoreBackend : public FileStoreBackend {
private:
bool ioctl_fiemap;
@@ -25,6 +27,7 @@ private:
public:
GenericFileStoreBackend(FileStore *fs);
virtual ~GenericFileStoreBackend() {};
+
virtual int detect_features();
virtual int create_current();
virtual bool can_checkpoint() { return false; };
@@ -39,5 +42,17 @@ public:
virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) {
return _copy_range(from, to, srcoff, len, dstoff);
}
+
+private:
+ int _crc_load_or_init(int fd, SloppyCRCMap *cm);
+ int _crc_save(int fd, SloppyCRCMap *cm);
+public:
+ virtual int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl);
+ virtual int _crc_update_truncate(int fd, loff_t off);
+ virtual int _crc_update_zero(int fd, loff_t off, size_t len);
+ virtual int _crc_update_clone_range(int srcfd, int destfd,
+ loff_t srcoff, size_t len, loff_t dstoff);
+ virtual int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
+ ostream *out);
};
#endif
diff --git a/src/os/HashIndex.cc b/src/os/HashIndex.cc
index c279bab3a60..ea50cd038ca 100644
--- a/src/os/HashIndex.cc
+++ b/src/os/HashIndex.cc
@@ -66,7 +66,7 @@ int HashIndex::reset_attr(
return r;
if (!exists)
return 0;
- map<string, hobject_t> objects;
+ map<string, ghobject_t> objects;
set<string> subdirs;
r = list_objects(path, 0, 0, &objects);
if (r < 0)
@@ -98,7 +98,7 @@ int HashIndex::col_split_level(
int r = from.list_subdirs(path, &subdirs);
if (r < 0)
return r;
- map<string, hobject_t> objects;
+ map<string, ghobject_t> objects;
r = from.list_objects(path, 0, 0, &objects);
if (r < 0)
return r;
@@ -134,8 +134,8 @@ int HashIndex::col_split_level(
}
/* Then, do the same for each object */
- map<string, hobject_t> objs_to_move;
- for (map<string, hobject_t>::iterator i = objects.begin();
+ map<string, ghobject_t> objs_to_move;
+ for (map<string, ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
if (i->second.match(inbits, match)) {
@@ -199,7 +199,7 @@ int HashIndex::col_split_level(
return r;
}
- for (map<string, hobject_t>::iterator i = objs_to_move.begin();
+ for (map<string, ghobject_t>::iterator i = objs_to_move.begin();
i != objs_to_move.end();
++i) {
from_info.objs--;
@@ -244,7 +244,7 @@ int HashIndex::_init() {
/* LFNIndex virtual method implementations */
int HashIndex::_created(const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
const string &mangled_name) {
subdir_info_s info;
int r;
@@ -267,10 +267,10 @@ int HashIndex::_created(const vector<string> &path,
}
int HashIndex::_remove(const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
const string &mangled_name) {
int r;
- r = remove_object(path, hoid);
+ r = remove_object(path, oid);
if (r < 0)
return r;
subdir_info_s info;
@@ -291,12 +291,12 @@ int HashIndex::_remove(const vector<string> &path,
}
}
-int HashIndex::_lookup(const hobject_t &hoid,
+int HashIndex::_lookup(const ghobject_t &oid,
vector<string> *path,
string *mangled_name,
int *exists_out) {
vector<string> path_comp;
- get_path_components(hoid, &path_comp);
+ get_path_components(oid, &path_comp);
vector<string>::iterator next = path_comp.begin();
int exists;
while (1) {
@@ -313,22 +313,22 @@ int HashIndex::_lookup(const hobject_t &hoid,
break;
path->push_back(*(next++));
}
- return get_mangled_name(*path, hoid, mangled_name, exists_out);
+ return get_mangled_name(*path, oid, mangled_name, exists_out);
}
-int HashIndex::_collection_list(vector<hobject_t> *ls) {
+int HashIndex::_collection_list(vector<ghobject_t> *ls) {
vector<string> path;
return list_by_hash(path, 0, 0, 0, 0, ls);
}
-int HashIndex::_collection_list_partial(const hobject_t &start,
+int HashIndex::_collection_list_partial(const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next) {
+ vector<ghobject_t> *ls,
+ ghobject_t *next) {
vector<string> path;
- hobject_t _next;
+ ghobject_t _next;
if (!next)
next = &_next;
*next = start;
@@ -345,7 +345,7 @@ int HashIndex::recursive_remove(const vector<string> &path) {
int r = list_subdirs(path, &subdirs);
if (r < 0)
return r;
- map<string, hobject_t> objects;
+ map<string, ghobject_t> objects;
r = list_objects(path, 0, 0, &objects);
if (r < 0)
return r;
@@ -475,7 +475,7 @@ int HashIndex::initiate_split(const vector<string> &path, subdir_info_s info) {
int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
int level = info.hash_level;
- map<string, hobject_t> objects;
+ map<string, ghobject_t> objects;
vector<string> dst = path;
int r;
dst.push_back("");
@@ -486,17 +486,17 @@ int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
r = list_subdirs(path, &subdirs);
if (r < 0)
return r;
- map<string, map<string, hobject_t> > mapped;
- map<string, hobject_t> moved;
+ map<string, map<string, ghobject_t> > mapped;
+ map<string, ghobject_t> moved;
int num_moved = 0;
- for (map<string, hobject_t>::iterator i = objects.begin();
+ for (map<string, ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
vector<string> new_path;
get_path_components(i->second, &new_path);
mapped[new_path[level]][i->first] = i->second;
}
- for (map<string, map<string, hobject_t> >::iterator i = mapped.begin();
+ for (map<string, map<string, ghobject_t> >::iterator i = mapped.begin();
i != mapped.end();
) {
dst[level] = i->first;
@@ -505,7 +505,7 @@ int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
subdir_info_s temp;
// subdir has already been fully copied
if (subdirs.count(i->first) && !get_info(dst, &temp)) {
- for (map<string, hobject_t>::iterator j = i->second.begin();
+ for (map<string, ghobject_t>::iterator j = i->second.begin();
j != i->second.end();
++j) {
moved[j->first] = j->second;
@@ -533,7 +533,7 @@ int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
return r;
} // else subdir has been created but only partially copied
- for (map<string, hobject_t>::iterator j = i->second.begin();
+ for (map<string, ghobject_t>::iterator j = i->second.begin();
j != i->second.end();
++j) {
moved[j->first] = j->second;
@@ -574,12 +574,12 @@ int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
return end_split_or_merge(path);
}
-void HashIndex::get_path_components(const hobject_t &hoid,
+void HashIndex::get_path_components(const ghobject_t &oid,
vector<string> *path) {
char buf[MAX_HASH_LEVEL + 1];
- snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)hoid.get_filestore_key());
+ snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_filestore_key());
- // Path components are the hex characters of hoid.hash, least
+ // Path components are the hex characters of oid.hobj.hash, least
// significant first
for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
path->push_back(string(&buf[i], 1));
@@ -596,9 +596,9 @@ string HashIndex::get_hash_str(uint32_t hash) {
return retval;
}
-string HashIndex::get_path_str(const hobject_t &hoid) {
- assert(!hoid.is_max());
- return get_hash_str(hoid.hash);
+string HashIndex::get_path_str(const ghobject_t &oid) {
+ assert(!oid.is_max());
+ return get_hash_str(oid.hobj.hash);
}
uint32_t HashIndex::hash_prefix_to_hash(string prefix) {
@@ -616,12 +616,12 @@ uint32_t HashIndex::hash_prefix_to_hash(string prefix) {
int HashIndex::get_path_contents_by_hash(const vector<string> &path,
const string *lower_bound,
- const hobject_t *next_object,
+ const ghobject_t *next_object,
const snapid_t *seq,
set<string> *hash_prefixes,
- set<pair<string, hobject_t> > *objects) {
+ set<pair<string, ghobject_t> > *objects) {
set<string> subdirs;
- map<string, hobject_t> rev_objects;
+ map<string, ghobject_t> rev_objects;
int r;
string cur_prefix;
for (vector<string>::const_iterator i = path.begin();
@@ -632,7 +632,7 @@ int HashIndex::get_path_contents_by_hash(const vector<string> &path,
r = list_objects(path, 0, 0, &rev_objects);
if (r < 0)
return r;
- for (map<string, hobject_t>::iterator i = rev_objects.begin();
+ for (map<string, ghobject_t>::iterator i = rev_objects.begin();
i != rev_objects.end();
++i) {
string hash_prefix = get_path_str(i->second);
@@ -640,10 +640,10 @@ int HashIndex::get_path_contents_by_hash(const vector<string> &path,
continue;
if (next_object && i->second < *next_object)
continue;
- if (seq && i->second.snap < *seq)
+ if (seq && i->second.hobj.snap < *seq)
continue;
hash_prefixes->insert(hash_prefix);
- objects->insert(pair<string, hobject_t>(hash_prefix, i->second));
+ objects->insert(pair<string, ghobject_t>(hash_prefix, i->second));
}
r = list_subdirs(path, &subdirs);
if (r < 0)
@@ -667,13 +667,13 @@ int HashIndex::list_by_hash(const vector<string> &path,
int min_count,
int max_count,
snapid_t seq,
- hobject_t *next,
- vector<hobject_t> *out) {
+ ghobject_t *next,
+ vector<ghobject_t> *out) {
assert(out);
vector<string> next_path = path;
next_path.push_back("");
set<string> hash_prefixes;
- set<pair<string, hobject_t> > objects;
+ set<pair<string, ghobject_t> > objects;
int r = get_path_contents_by_hash(path,
NULL,
next,
@@ -686,16 +686,16 @@ int HashIndex::list_by_hash(const vector<string> &path,
for (set<string>::iterator i = hash_prefixes.begin();
i != hash_prefixes.end();
++i) {
- set<pair<string, hobject_t> >::iterator j = objects.lower_bound(
- make_pair(*i, hobject_t()));
+ set<pair<string, ghobject_t> >::iterator j = objects.lower_bound(
+ make_pair(*i, ghobject_t()));
if (j == objects.end() || j->first != *i) {
if (min_count > 0 && out->size() > (unsigned)min_count) {
if (next)
- *next = hobject_t("", "", CEPH_NOSNAP, hash_prefix_to_hash(*i), -1, "");
+ *next = ghobject_t(hobject_t("", "", CEPH_NOSNAP, hash_prefix_to_hash(*i), -1, ""));
return 0;
}
*(next_path.rbegin()) = *(i->rbegin());
- hobject_t next_recurse;
+ ghobject_t next_recurse;
if (next)
next_recurse = *next;
r = list_by_hash(next_path,
@@ -727,6 +727,6 @@ int HashIndex::list_by_hash(const vector<string> &path,
}
}
if (next)
- *next = hobject_t::get_max();
+ *next = ghobject_t(hobject_t::get_max());
return 0;
}
diff --git a/src/os/HashIndex.h b/src/os/HashIndex.h
index fcabd9f7198..6f5bca077d4 100644
--- a/src/os/HashIndex.h
+++ b/src/os/HashIndex.h
@@ -39,7 +39,7 @@
* given by the hex characters in the hash beginning with the least
* significant.
*
- * ex: hobject_t("object", CEPH_NO_SNAP, 0xA4CEE0D2)
+ * ex: ghobject_t("object", CEPH_NO_SNAP, 0xA4CEE0D2)
* would be located in (root)/2/D/0/
*
* Subdirectories are created when the number of objects in a directory
@@ -163,30 +163,30 @@ protected:
int _created(
const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
const string &mangled_name
);
int _remove(
const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
const string &mangled_name
);
int _lookup(
- const hobject_t &hoid,
+ const ghobject_t &oid,
vector<string> *path,
string *mangled_name,
int *exists
);
int _collection_list(
- vector<hobject_t> *ls
+ vector<ghobject_t> *ls
);
int _collection_list_partial(
- const hobject_t &start,
+ const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next
+ vector<ghobject_t> *ls,
+ ghobject_t *next
);
private:
/// Recursively remove path and its subdirs
@@ -262,7 +262,7 @@ private:
/// Determine path components from hoid hash
void get_path_components(
- const hobject_t &hoid, ///< [in] Object for which to get path components
+ const ghobject_t &oid, ///< [in] Object for which to get path components
vector<string> *path ///< [out] Path components for hoid.
);
@@ -278,12 +278,12 @@ private:
/**
- * Get string representation of hobject_t/hash
+ * Get string representation of ghobject_t/hash
*
* e.g: 0x01234567 -> "76543210"
*/
static string get_path_str(
- const hobject_t &hoid ///< [in] Object to get hash string for
+ const ghobject_t &oid ///< [in] Object to get hash string for
); ///< @return Hash string for hoid.
/// Get string from hash, @see get_path_str
@@ -319,20 +319,20 @@ private:
int get_path_contents_by_hash(
const vector<string> &path, /// [in] Path to list
const string *lower_bound, /// [in] list > *lower_bound
- const hobject_t *next_object, /// [in] list > *next_object
+ const ghobject_t *next_object, /// [in] list > *next_object
const snapid_t *seq, /// [in] list >= *seq
set<string> *hash_prefixes, /// [out] prefixes in dir
- set<pair<string, hobject_t> > *objects /// [out] objects
+ set<pair<string, ghobject_t> > *objects /// [out] objects
);
- /// List objects in collection in hobject_t order
+ /// List objects in collection in ghobject_t order
int list_by_hash(
const vector<string> &path, /// [in] Path to list
int min_count, /// [in] List at least min_count
int max_count, /// [in] List at most max_count
snapid_t seq, /// [in] list only objects where snap >= seq
- hobject_t *next, /// [in,out] List objects >= *next
- vector<hobject_t> *out /// [out] Listed objects
+ ghobject_t *next, /// [in,out] List objects >= *next
+ vector<ghobject_t> *out /// [out] Listed objects
); ///< @return Error Code, 0 on success
};
diff --git a/src/os/IndexManager.cc b/src/os/IndexManager.cc
index 412721a04c8..83bbfc9703e 100644
--- a/src/os/IndexManager.cc
+++ b/src/os/IndexManager.cc
@@ -75,7 +75,7 @@ int IndexManager::init_index(coll_t c, const char *path, uint32_t version) {
return r;
HashIndex index(c, path, g_conf->filestore_merge_threshold,
g_conf->filestore_split_multiple,
- CollectionIndex::HASH_INDEX_TAG_2,
+ version,
g_conf->filestore_index_retry_probability);
return index.init();
}
diff --git a/src/os/KeyValueDB.h b/src/os/KeyValueDB.h
index f62bca996a5..e98463aa763 100644
--- a/src/os/KeyValueDB.h
+++ b/src/os/KeyValueDB.h
@@ -165,6 +165,8 @@ public:
);
}
+ virtual uint64_t get_estimated_size(map<string,uint64_t> &extra) = 0;
+
virtual ~KeyValueDB() {}
protected:
diff --git a/src/os/LFNIndex.cc b/src/os/LFNIndex.cc
index 09d0f02267f..83e1c144754 100644
--- a/src/os/LFNIndex.cc
+++ b/src/os/LFNIndex.cc
@@ -73,7 +73,7 @@ int LFNIndex::init()
return _init();
}
-int LFNIndex::created(const hobject_t &hoid, const char *path)
+int LFNIndex::created(const ghobject_t &oid, const char *path)
{
WRAP_RETRY(
vector<string> path_comp;
@@ -81,38 +81,39 @@ int LFNIndex::created(const hobject_t &hoid, const char *path)
r = decompose_full_path(path, &path_comp, 0, &short_name);
if (r < 0)
goto out;
- r = lfn_created(path_comp, hoid, short_name);
+ r = lfn_created(path_comp, oid, short_name);
if (r < 0)
goto out;
- r = _created(path_comp, hoid, short_name);
+ r = _created(path_comp, oid, short_name);
if (r < 0)
goto out;
);
}
-int LFNIndex::unlink(const hobject_t &hoid)
+int LFNIndex::unlink(const ghobject_t &oid)
{
WRAP_RETRY(
vector<string> path;
string short_name;
- r = _lookup(hoid, &path, &short_name, NULL);
+ r = _lookup(oid, &path, &short_name, NULL);
if (r < 0) {
goto out;
}
- r = _remove(path, hoid, short_name);
+ r = _remove(path, oid, short_name);
if (r < 0) {
goto out;
}
);
}
-int LFNIndex::lookup(const hobject_t &hoid,
+int LFNIndex::lookup(const ghobject_t &oid,
IndexedPath *out_path,
- int *exist) {
+ int *exist)
+{
WRAP_RETRY(
vector<string> path;
string short_name;
- r = _lookup(hoid, &path, &short_name, exist);
+ r = _lookup(oid, &path, &short_name, exist);
if (r < 0)
goto out;
string full_path = get_full_path(path, short_name);
@@ -135,18 +136,18 @@ int LFNIndex::lookup(const hobject_t &hoid,
);
}
-int LFNIndex::collection_list(vector<hobject_t> *ls)
+int LFNIndex::collection_list(vector<ghobject_t> *ls)
{
return _collection_list(ls);
}
-int LFNIndex::collection_list_partial(const hobject_t &start,
+int LFNIndex::collection_list_partial(const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next)
+ vector<ghobject_t> *ls,
+ ghobject_t *next)
{
return _collection_list_partial(start, min_count, max_count, seq, ls, next);
}
@@ -171,13 +172,14 @@ int LFNIndex::fsync_dir(const vector<string> &path)
int LFNIndex::link_object(const vector<string> &from,
const vector<string> &to,
- const hobject_t &hoid,
- const string &from_short_name) {
+ const ghobject_t &oid,
+ const string &from_short_name)
+{
int r;
string from_path = get_full_path(from, from_short_name);
string to_path;
maybe_inject_failure();
- r = lfn_get_name(to, hoid, 0, &to_path, 0);
+ r = lfn_get_name(to, oid, 0, &to_path, 0);
if (r < 0)
return r;
maybe_inject_failure();
@@ -190,10 +192,11 @@ int LFNIndex::link_object(const vector<string> &from,
}
int LFNIndex::remove_objects(const vector<string> &dir,
- const map<string, hobject_t> &to_remove,
- map<string, hobject_t> *remaining) {
+ const map<string, ghobject_t> &to_remove,
+ map<string, ghobject_t> *remaining)
+{
set<string> clean_chains;
- for (map<string, hobject_t>::const_iterator to_clean = to_remove.begin();
+ for (map<string, ghobject_t>::const_iterator to_clean = to_remove.begin();
to_clean != to_remove.end();
++to_clean) {
if (!lfn_is_hashed_filename(to_clean->first)) {
@@ -207,7 +210,7 @@ int LFNIndex::remove_objects(const vector<string> &dir,
if (clean_chains.count(lfn_get_short_name(to_clean->second, 0)))
continue;
set<int> holes;
- map<int, pair<string, hobject_t> > chain;
+ map<int, pair<string, ghobject_t> > chain;
for (int i = 0; ; ++i) {
string short_name = lfn_get_short_name(to_clean->second, i);
if (remaining->count(short_name)) {
@@ -219,7 +222,7 @@ int LFNIndex::remove_objects(const vector<string> &dir,
}
}
- map<int, pair<string, hobject_t > >::reverse_iterator candidate = chain.rbegin();
+ map<int, pair<string, ghobject_t > >::reverse_iterator candidate = chain.rbegin();
for (set<int>::iterator i = holes.begin();
i != holes.end();
++i) {
@@ -241,7 +244,7 @@ int LFNIndex::remove_objects(const vector<string> &dir,
if (r < 0)
return -errno;
remaining->erase(candidate->second.first);
- remaining->insert(pair<string, hobject_t>(
+ remaining->insert(pair<string, ghobject_t>(
lfn_get_short_name(candidate->second.second, *i),
candidate->second.second));
++candidate;
@@ -253,13 +256,14 @@ int LFNIndex::remove_objects(const vector<string> &dir,
}
int LFNIndex::move_objects(const vector<string> &from,
- const vector<string> &to) {
- map<string, hobject_t> to_move;
+ const vector<string> &to)
+{
+ map<string, ghobject_t> to_move;
int r;
r = list_objects(from, 0, NULL, &to_move);
if (r < 0)
return r;
- for (map<string,hobject_t>::iterator i = to_move.begin();
+ for (map<string,ghobject_t>::iterator i = to_move.begin();
i != to_move.end();
++i) {
string from_path = get_full_path(from, i->first);
@@ -280,7 +284,7 @@ int LFNIndex::move_objects(const vector<string> &from,
r = fsync_dir(to);
if (r < 0)
return r;
- for (map<string,hobject_t>::iterator i = to_move.begin();
+ for (map<string,ghobject_t>::iterator i = to_move.begin();
i != to_move.end();
++i) {
maybe_inject_failure();
@@ -293,21 +297,23 @@ int LFNIndex::move_objects(const vector<string> &from,
}
int LFNIndex::remove_object(const vector<string> &from,
- const hobject_t &hoid) {
+ const ghobject_t &oid)
+{
string short_name;
int r, exist;
maybe_inject_failure();
- r = get_mangled_name(from, hoid, &short_name, &exist);
+ r = get_mangled_name(from, oid, &short_name, &exist);
maybe_inject_failure();
if (r < 0)
return r;
- return lfn_unlink(from, hoid, short_name);
+ return lfn_unlink(from, oid, short_name);
}
int LFNIndex::get_mangled_name(const vector<string> &from,
- const hobject_t &hoid,
- string *mangled_name, int *exists) {
- return lfn_get_name(from, hoid, mangled_name, 0, exists);
+ const ghobject_t &oid,
+ string *mangled_name, int *exists)
+{
+ return lfn_get_name(from, oid, mangled_name, 0, exists);
}
int LFNIndex::move_subdir(
@@ -315,7 +321,8 @@ int LFNIndex::move_subdir(
LFNIndex &dest,
const vector<string> &path,
string dir
- ) {
+ )
+{
vector<string> sub_path(path.begin(), path.end());
sub_path.push_back(dir);
string from_path(from.get_full_path_subdir(sub_path));
@@ -330,8 +337,9 @@ int LFNIndex::move_object(
LFNIndex &from,
LFNIndex &dest,
const vector<string> &path,
- const pair<string, hobject_t> &obj
- ) {
+ const pair<string, ghobject_t> &obj
+ )
+{
string from_path(from.get_full_path(path, obj.first));
string to_path;
string to_name;
@@ -358,7 +366,8 @@ int LFNIndex::move_object(
static int get_hobject_from_oinfo(const char *dir, const char *file,
- hobject_t *o) {
+ ghobject_t *o)
+{
char path[PATH_MAX];
bufferptr bp(PATH_MAX);
snprintf(path, sizeof(path), "%s/%s", dir, file);
@@ -375,10 +384,11 @@ static int get_hobject_from_oinfo(const char *dir, const char *file,
int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
- long *handle, map<string, hobject_t> *out) {
+ long *handle, map<string, ghobject_t> *out)
+{
string to_list_path = get_full_path_subdir(to_list);
DIR *dir = ::opendir(to_list_path.c_str());
- char buf[PATH_MAX];
+ char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
int r;
if (!dir) {
return -errno;
@@ -402,7 +412,7 @@ int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
if (de->d_name[0] == '.')
continue;
string short_name(de->d_name);
- hobject_t obj;
+ ghobject_t obj;
if (lfn_is_object(short_name)) {
r = lfn_translate(to_list, short_name, &obj);
if (r < 0) {
@@ -416,7 +426,7 @@ int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
if (index_version == HASH_INDEX_TAG)
get_hobject_from_oinfo(to_list_path.c_str(), short_name.c_str(), &obj);
- out->insert(pair<string, hobject_t>(short_name, obj));
+ out->insert(pair<string, ghobject_t>(short_name, obj));
++listed;
} else {
continue;
@@ -435,10 +445,11 @@ int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
}
int LFNIndex::list_subdirs(const vector<string> &to_list,
- set<string> *out) {
+ set<string> *out)
+{
string to_list_path = get_full_path_subdir(to_list);
DIR *dir = ::opendir(to_list_path.c_str());
- char buf[PATH_MAX];
+ char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
if (!dir)
return -errno;
@@ -449,7 +460,7 @@ int LFNIndex::list_subdirs(const vector<string> &to_list,
}
string short_name(de->d_name);
string demangled_name;
- hobject_t obj;
+ ghobject_t obj;
if (lfn_is_subdir(short_name, &demangled_name)) {
out->insert(demangled_name);
}
@@ -501,7 +512,8 @@ int LFNIndex::path_exists(const vector<string> &to_check, int *exists)
int LFNIndex::add_attr_path(const vector<string> &path,
const string &attr_name,
- bufferlist &attr_value) {
+ bufferlist &attr_value)
+{
string full_path = get_full_path_subdir(path);
maybe_inject_failure();
return chain_setxattr(full_path.c_str(), mangle_attr_name(attr_name).c_str(),
@@ -511,7 +523,8 @@ int LFNIndex::add_attr_path(const vector<string> &path,
int LFNIndex::get_attr_path(const vector<string> &path,
const string &attr_name,
- bufferlist &attr_value) {
+ bufferlist &attr_value)
+{
string full_path = get_full_path_subdir(path);
size_t size = 1024; // Initial
while (1) {
@@ -536,22 +549,24 @@ int LFNIndex::get_attr_path(const vector<string> &path,
}
int LFNIndex::remove_attr_path(const vector<string> &path,
- const string &attr_name) {
+ const string &attr_name)
+{
string full_path = get_full_path_subdir(path);
string mangled_attr_name = mangle_attr_name(attr_name);
maybe_inject_failure();
return chain_removexattr(full_path.c_str(), mangled_attr_name.c_str());
}
-string LFNIndex::lfn_generate_object_name_keyless(const hobject_t &hoid)
+string LFNIndex::lfn_generate_object_name_keyless(const ghobject_t &oid)
{
char s[FILENAME_MAX_LEN];
char *end = s + sizeof(s);
char *t = s;
- const char *i = hoid.oid.name.c_str();
+ assert(oid.generation == ghobject_t::NO_GEN);
+ const char *i = oid.hobj.oid.name.c_str();
// Escape subdir prefix
- if (hoid.oid.name.substr(0, 4) == "DIR_") {
+ if (oid.hobj.oid.name.substr(0, 4) == "DIR_") {
*t++ = '\\';
*t++ = 'd';
i += 4;
@@ -560,7 +575,7 @@ string LFNIndex::lfn_generate_object_name_keyless(const hobject_t &hoid)
if (*i == '\\') {
*t++ = '\\';
*t++ = '\\';
- } else if (*i == '.' && i == hoid.oid.name.c_str()) { // only escape leading .
+ } else if (*i == '.' && i == oid.hobj.oid.name.c_str()) { // only escape leading .
*t++ = '\\';
*t++ = '.';
} else if (*i == '/') {
@@ -571,13 +586,13 @@ string LFNIndex::lfn_generate_object_name_keyless(const hobject_t &hoid)
i++;
}
- if (hoid.snap == CEPH_NOSNAP)
+ if (oid.hobj.snap == CEPH_NOSNAP)
t += snprintf(t, end - t, "_head");
- else if (hoid.snap == CEPH_SNAPDIR)
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
t += snprintf(t, end - t, "_snapdir");
else
- t += snprintf(t, end - t, "_%llx", (long long unsigned)hoid.snap);
- snprintf(t, end - t, "_%.*X", (int)(sizeof(hoid.hash)*2), hoid.hash);
+ t += snprintf(t, end - t, "_%llx", (long long unsigned)oid.hobj.snap);
+ snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.hash)*2), oid.hobj.hash);
return string(s);
}
@@ -601,94 +616,112 @@ static void append_escaped(string::const_iterator begin,
}
}
-string LFNIndex::lfn_generate_object_name(const hobject_t &hoid)
+string LFNIndex::lfn_generate_object_name(const ghobject_t &oid)
{
if (index_version == HASH_INDEX_TAG)
- return lfn_generate_object_name_keyless(hoid);
+ return lfn_generate_object_name_keyless(oid);
if (index_version == HASH_INDEX_TAG_2)
- return lfn_generate_object_name_poolless(hoid);
+ return lfn_generate_object_name_poolless(oid);
string full_name;
- string::const_iterator i = hoid.oid.name.begin();
- if (hoid.oid.name.substr(0, 4) == "DIR_") {
+ string::const_iterator i = oid.hobj.oid.name.begin();
+ if (oid.hobj.oid.name.substr(0, 4) == "DIR_") {
full_name.append("\\d");
i += 4;
- } else if (hoid.oid.name[0] == '.') {
+ } else if (oid.hobj.oid.name[0] == '.') {
full_name.append("\\.");
++i;
}
- append_escaped(i, hoid.oid.name.end(), &full_name);
+ append_escaped(i, oid.hobj.oid.name.end(), &full_name);
full_name.append("_");
- append_escaped(hoid.get_key().begin(), hoid.get_key().end(), &full_name);
+ append_escaped(oid.hobj.get_key().begin(), oid.hobj.get_key().end(), &full_name);
full_name.append("_");
char buf[PATH_MAX];
char *t = buf;
char *end = t + sizeof(buf);
- if (hoid.snap == CEPH_NOSNAP)
+ if (oid.hobj.snap == CEPH_NOSNAP)
t += snprintf(t, end - t, "head");
- else if (hoid.snap == CEPH_SNAPDIR)
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
t += snprintf(t, end - t, "snapdir");
else
- t += snprintf(t, end - t, "%llx", (long long unsigned)hoid.snap);
- snprintf(t, end - t, "_%.*X", (int)(sizeof(hoid.hash)*2), hoid.hash);
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap);
+ snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.hash)*2), oid.hobj.hash);
full_name += string(buf);
full_name.append("_");
- append_escaped(hoid.nspace.begin(), hoid.nspace.end(), &full_name);
+ append_escaped(oid.hobj.nspace.begin(), oid.hobj.nspace.end(), &full_name);
full_name.append("_");
t = buf;
end = t + sizeof(buf);
- if (hoid.pool == -1)
+ if (oid.hobj.pool == -1)
t += snprintf(t, end - t, "none");
else
- t += snprintf(t, end - t, "%llx", (long long unsigned)hoid.pool);
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.pool);
full_name += string(buf);
+ if (oid.generation != ghobject_t::NO_GEN) {
+ assert(oid.shard_id != ghobject_t::NO_SHARD);
+ full_name.append("_");
+
+ t = buf;
+ end = t + sizeof(buf);
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.generation);
+ full_name += string(buf);
+
+ full_name.append("_");
+
+ t = buf;
+ end = t + sizeof(buf);
+ t += snprintf(t, end - t, "%x", (int)oid.shard_id);
+ full_name += string(buf);
+ }
+
return full_name;
}
-string LFNIndex::lfn_generate_object_name_poolless(const hobject_t &hoid)
+string LFNIndex::lfn_generate_object_name_poolless(const ghobject_t &oid)
{
if (index_version == HASH_INDEX_TAG)
- return lfn_generate_object_name_keyless(hoid);
+ return lfn_generate_object_name_keyless(oid);
+ assert(oid.generation == ghobject_t::NO_GEN);
string full_name;
- string::const_iterator i = hoid.oid.name.begin();
- if (hoid.oid.name.substr(0, 4) == "DIR_") {
+ string::const_iterator i = oid.hobj.oid.name.begin();
+ if (oid.hobj.oid.name.substr(0, 4) == "DIR_") {
full_name.append("\\d");
i += 4;
- } else if (hoid.oid.name[0] == '.') {
+ } else if (oid.hobj.oid.name[0] == '.') {
full_name.append("\\.");
++i;
}
- append_escaped(i, hoid.oid.name.end(), &full_name);
+ append_escaped(i, oid.hobj.oid.name.end(), &full_name);
full_name.append("_");
- append_escaped(hoid.get_key().begin(), hoid.get_key().end(), &full_name);
+ append_escaped(oid.hobj.get_key().begin(), oid.hobj.get_key().end(), &full_name);
full_name.append("_");
char snap_with_hash[PATH_MAX];
char *t = snap_with_hash;
char *end = t + sizeof(snap_with_hash);
- if (hoid.snap == CEPH_NOSNAP)
+ if (oid.hobj.snap == CEPH_NOSNAP)
t += snprintf(t, end - t, "head");
- else if (hoid.snap == CEPH_SNAPDIR)
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
t += snprintf(t, end - t, "snapdir");
else
- t += snprintf(t, end - t, "%llx", (long long unsigned)hoid.snap);
- snprintf(t, end - t, "_%.*X", (int)(sizeof(hoid.hash)*2), hoid.hash);
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap);
+ snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.hash)*2), oid.hobj.hash);
full_name += string(snap_with_hash);
return full_name;
}
int LFNIndex::lfn_get_name(const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
string *mangled_name, string *out_path,
int *exists)
{
string subdir_path = get_full_path_subdir(path);
- string full_name = lfn_generate_object_name(hoid);
+ string full_name = lfn_generate_object_name(oid);
int r;
if (!lfn_must_hash(full_name)) {
@@ -718,7 +751,7 @@ int LFNIndex::lfn_get_name(const vector<string> &path,
string candidate_path;
char buf[FILENAME_MAX_LEN + 1];
for ( ; ; ++i) {
- candidate = lfn_get_short_name(hoid, i);
+ candidate = lfn_get_short_name(oid, i);
candidate_path = get_full_path(path, candidate);
r = chain_getxattr(candidate_path.c_str(), get_lfn_attr().c_str(), buf, sizeof(buf));
if (r < 0) {
@@ -757,20 +790,20 @@ int LFNIndex::lfn_get_name(const vector<string> &path,
}
int LFNIndex::lfn_created(const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
const string &mangled_name)
{
if (!lfn_is_hashed_filename(mangled_name))
return 0;
string full_path = get_full_path(path, mangled_name);
- string full_name = lfn_generate_object_name(hoid);
+ string full_name = lfn_generate_object_name(oid);
maybe_inject_failure();
return chain_setxattr(full_path.c_str(), get_lfn_attr().c_str(),
full_name.c_str(), full_name.size());
}
int LFNIndex::lfn_unlink(const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
const string &mangled_name)
{
if (!lfn_is_hashed_filename(mangled_name)) {
@@ -787,7 +820,7 @@ int LFNIndex::lfn_unlink(const vector<string> &path,
int i = 0;
for ( ; ; ++i) {
- string candidate = lfn_get_short_name(hoid, i);
+ string candidate = lfn_get_short_name(oid, i);
if (candidate == mangled_name)
break;
}
@@ -795,7 +828,7 @@ int LFNIndex::lfn_unlink(const vector<string> &path,
++i;
for ( ; ; ++i) {
struct stat buf;
- string to_check = lfn_get_short_name(hoid, i);
+ string to_check = lfn_get_short_name(oid, i);
string to_check_path = get_full_path(path, to_check);
int r = ::stat(to_check_path.c_str(), &buf);
if (r < 0) {
@@ -817,7 +850,7 @@ int LFNIndex::lfn_unlink(const vector<string> &path,
return 0;
} else {
string rename_to = get_full_path(path, mangled_name);
- string rename_from = get_full_path(path, lfn_get_short_name(hoid, i - 1));
+ string rename_from = get_full_path(path, lfn_get_short_name(oid, i - 1));
maybe_inject_failure();
int r = ::rename(rename_from.c_str(), rename_to.c_str());
maybe_inject_failure();
@@ -830,7 +863,7 @@ int LFNIndex::lfn_unlink(const vector<string> &path,
int LFNIndex::lfn_translate(const vector<string> &path,
const string &short_name,
- hobject_t *out)
+ ghobject_t *out)
{
if (!lfn_is_hashed_filename(short_name)) {
return lfn_parse_object_name(short_name, out);
@@ -863,7 +896,7 @@ bool LFNIndex::lfn_is_subdir(const string &name, string *demangled)
return 0;
}
-static int parse_object(const char *s, hobject_t& o)
+static int parse_object(const char *s, ghobject_t& o)
{
const char *hash = s + strlen(s) - 1;
while (*hash != '_' &&
@@ -899,28 +932,28 @@ static int parse_object(const char *s, hobject_t& o)
i++;
}
*t = 0;
- o.oid.name = string(buf, t-buf);
+ o.hobj.oid.name = string(buf, t-buf);
if (strncmp(bar+1, "head", 4) == 0)
- o.snap = CEPH_NOSNAP;
+ o.hobj.snap = CEPH_NOSNAP;
else if (strncmp(bar+1, "snapdir", 7) == 0)
- o.snap = CEPH_SNAPDIR;
+ o.hobj.snap = CEPH_SNAPDIR;
else
- o.snap = strtoull(bar+1, NULL, 16);
- sscanf(hash, "_%X", &o.hash);
+ o.hobj.snap = strtoull(bar+1, NULL, 16);
+ sscanf(hash, "_%X", &o.hobj.hash);
return 1;
}
return 0;
}
-bool LFNIndex::lfn_parse_object_name_keyless(const string &long_name, hobject_t *out)
+bool LFNIndex::lfn_parse_object_name_keyless(const string &long_name, ghobject_t *out)
{
bool r = parse_object(long_name.c_str(), *out);
int64_t pool = -1;
pg_t pg;
if (coll().is_pg_prefix(pg))
pool = (int64_t)pg.pool();
- out->pool = pool;
+ out->hobj.pool = pool;
if (!r) return r;
string temp = lfn_generate_object_name(*out);
return r;
@@ -928,7 +961,8 @@ bool LFNIndex::lfn_parse_object_name_keyless(const string &long_name, hobject_t
static bool append_unescaped(string::const_iterator begin,
string::const_iterator end,
- string *out) {
+ string *out)
+{
for (string::const_iterator i = begin; i != end; ++i) {
if (*i == '\\') {
++i;
@@ -950,7 +984,8 @@ static bool append_unescaped(string::const_iterator begin,
}
bool LFNIndex::lfn_parse_object_name_poolless(const string &long_name,
- hobject_t *out) {
+ ghobject_t *out)
+{
string name;
string key;
uint32_t hash;
@@ -1011,12 +1046,12 @@ bool LFNIndex::lfn_parse_object_name_poolless(const string &long_name,
pg_t pg;
if (coll().is_pg_prefix(pg))
pool = (int64_t)pg.pool();
- (*out) = hobject_t(name, key, snap, hash, pool, "");
+ (*out) = ghobject_t(hobject_t(name, key, snap, hash, pool, ""));
return true;
}
-bool LFNIndex::lfn_parse_object_name(const string &long_name, hobject_t *out)
+bool LFNIndex::lfn_parse_object_name(const string &long_name, ghobject_t *out)
{
string name;
string key;
@@ -1024,6 +1059,8 @@ bool LFNIndex::lfn_parse_object_name(const string &long_name, hobject_t *out)
uint32_t hash;
snapid_t snap;
uint64_t pool;
+ gen_t generation = ghobject_t::NO_GEN;
+ shard_t shard_id = ghobject_t::NO_SHARD;
if (index_version == HASH_INDEX_TAG)
return lfn_parse_object_name_keyless(long_name, out);
@@ -1081,10 +1118,28 @@ bool LFNIndex::lfn_parse_object_name(const string &long_name, hobject_t *out)
current = ++end;
for ( ; end != long_name.end() && *end != '_'; ++end) ;
- if (end != long_name.end())
- return false;
string pstring(current, end);
+ // Optional generation/shard_id
+ string genstring, shardstring;
+ if (end != long_name.end()) {
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return false;
+ genstring = string(current, end);
+
+ generation = (gen_t)strtoull(genstring.c_str(), NULL, 16);
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end != long_name.end())
+ return false;
+ shardstring = string(current, end);
+
+ shard_id = (shard_t)strtoul(shardstring.c_str(), NULL, 16);
+ }
+
if (snap_str == "head")
snap = CEPH_NOSNAP;
else if (snap_str == "snapdir")
@@ -1098,7 +1153,7 @@ bool LFNIndex::lfn_parse_object_name(const string &long_name, hobject_t *out)
else
pool = strtoull(pstring.c_str(), NULL, 16);
- (*out) = hobject_t(name, key, snap, hash, (int64_t)pool, ns);
+ (*out) = ghobject_t(hobject_t(name, key, snap, hash, (int64_t)pool, ns), generation, shard_id);
return true;
}
@@ -1170,9 +1225,9 @@ void LFNIndex::build_filename(const char *old_filename, int i, char *filename, i
}
}
-string LFNIndex::lfn_get_short_name(const hobject_t &hoid, int i)
+string LFNIndex::lfn_get_short_name(const ghobject_t &oid, int i)
{
- string long_name = lfn_generate_object_name(hoid);
+ string long_name = lfn_generate_object_name(oid);
assert(lfn_must_hash(long_name));
char buf[FILENAME_SHORT_LEN + 4];
build_filename(long_name.c_str(), i, buf, sizeof(buf));
@@ -1212,7 +1267,7 @@ string LFNIndex::demangle_path_component(const string &component)
}
int LFNIndex::decompose_full_path(const char *in, vector<string> *out,
- hobject_t *hoid, string *shortname)
+ ghobject_t *oid, string *shortname)
{
const char *beginning = in + get_base_path().size();
const char *end = beginning;
@@ -1228,8 +1283,8 @@ int LFNIndex::decompose_full_path(const char *in, vector<string> *out,
}
}
*shortname = string(beginning, end - beginning);
- if (hoid) {
- int r = lfn_translate(*out, *shortname, hoid);
+ if (oid) {
+ int r = lfn_translate(*out, *shortname, oid);
if (r < 0)
return r;
}
diff --git a/src/os/LFNIndex.h b/src/os/LFNIndex.h
index b73ff4db268..f436446bf0f 100644
--- a/src/os/LFNIndex.h
+++ b/src/os/LFNIndex.h
@@ -165,35 +165,35 @@ public:
/// @see CollectionIndex
int created(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const char *path
);
/// @see CollectionIndex
int unlink(
- const hobject_t &hoid
+ const ghobject_t &oid
);
/// @see CollectionIndex
int lookup(
- const hobject_t &hoid,
+ const ghobject_t &oid,
IndexedPath *path,
int *exist
);
/// @see CollectionIndex
int collection_list(
- vector<hobject_t> *ls
+ vector<ghobject_t> *ls
);
/// @see CollectionIndex
int collection_list_partial(
- const hobject_t &start,
+ const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next
+ vector<ghobject_t> *ls,
+ ghobject_t *next
);
virtual int _split(
@@ -221,20 +221,20 @@ protected:
/// Will be called upon object creation
virtual int _created(
const vector<string> &path, ///< [in] Path to subdir.
- const hobject_t &hoid, ///< [in] Object created.
+ const ghobject_t &oid, ///< [in] Object created.
const string &mangled_name ///< [in] Mangled filename.
) = 0;
/// Will be called to remove an object
virtual int _remove(
const vector<string> &path, ///< [in] Path to subdir.
- const hobject_t &hoid, ///< [in] Object to remove.
+ const ghobject_t &oid, ///< [in] Object to remove.
const string &mangled_name ///< [in] Mangled filename.
) = 0;
- /// Return the path and mangled_name for hoid.
+ /// Return the path and mangled_name for oid.
virtual int _lookup(
- const hobject_t &hoid,///< [in] Object for lookup.
+ const ghobject_t &oid,///< [in] Object for lookup.
vector<string> *path, ///< [out] Path to the object.
string *mangled_name, ///< [out] Mangled filename.
int *exists ///< [out] True if the object exists.
@@ -252,17 +252,17 @@ protected:
*/
/// List contents of collection.
virtual int _collection_list(
- vector<hobject_t> *ls ///< [out] Listed objects.
+ vector<ghobject_t> *ls ///< [out] Listed objects.
) = 0;
/// @see CollectionIndex
virtual int _collection_list_partial(
- const hobject_t &start,
+ const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next
+ vector<ghobject_t> *ls,
+ ghobject_t *next
) = 0;
protected:
@@ -278,8 +278,8 @@ protected:
int link_object(
const vector<string> &from, ///< [in] Source subdirectory.
const vector<string> &to, ///< [in] Dest subdirectory.
- const hobject_t &hoid, ///< [in] Object to move.
- const string &from_short_name ///< [in] Mangled filename of hoid.
+ const ghobject_t &oid, ///< [in] Object to move.
+ const string &from_short_name ///< [in] Mangled filename of oid.
); ///< @return Error Code, 0 on success
/**
@@ -296,8 +296,8 @@ protected:
*/
int remove_objects(
const vector<string> &dir,
- const map<string, hobject_t> &to_remove,
- map<string, hobject_t> *remaining
+ const map<string, ghobject_t> &to_remove,
+ map<string, ghobject_t> *remaining
);
@@ -322,11 +322,11 @@ protected:
*/
int remove_object(
const vector<string> &from, ///< [in] Directory from which to remove.
- const hobject_t &to_remove ///< [in] Object to remove.
+ const ghobject_t &to_remove ///< [in] Object to remove.
);
/**
- * Gets the filename corresponding to hoid in from.
+ * Gets the filename corresponding to oid in from.
*
* The filename may differ between subdirectories. Furthermore,
* file creations ore removals in from may invalidate the name.
@@ -334,7 +334,7 @@ protected:
*/
int get_mangled_name(
const vector<string> &from, ///< [in] Subdirectory
- const hobject_t &hoid, ///< [in] Object
+ const ghobject_t &oid, ///< [in] Object
string *mangled_name, ///< [out] Filename
int *exists ///< [out] 1 if the file exists, else 0
);
@@ -352,7 +352,7 @@ protected:
LFNIndex &from, ///< [in] from index
LFNIndex &dest, ///< [in] to index
const vector<string> &path, ///< [in] path to split
- const pair<string, hobject_t> &obj ///< [in] obj to move
+ const pair<string, ghobject_t> &obj ///< [in] obj to move
);
/**
@@ -369,7 +369,7 @@ protected:
const vector<string> &to_list,
int max_objects,
long *handle,
- map<string, hobject_t> *out
+ map<string, ghobject_t> *out
);
/// Lists subdirectories.
@@ -425,43 +425,43 @@ private:
}
/**
- * Gets the filename corresponsing to hoid in path.
+ * Gets the filename corresponding to oid in path.
*
- * @param [in] path Path in which to get filename for hoid.
- * @param [in] hoid Object for which to get filename.
- * @param [out] mangled_name Filename for hoid, pass NULL if not needed.
- * @param [out] full_path Fullpath for hoid, pass NULL if not needed.
+ * @param [in] path Path in which to get filename for oid.
+ * @param [in] oid Object for which to get filename.
+ * @param [out] mangled_name Filename for oid, pass NULL if not needed.
+ * @param [out] full_path Fullpath for oid, pass NULL if not needed.
* @param [out] exists 1 if the file exists, 0 otherwise, pass NULL if
* not needed
* @return Error Code, 0 on success.
*/
int lfn_get_name(
const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
string *mangled_name,
string *full_path,
int *exists
);
- /// Adjusts path contents when hoid is created at name mangled_name.
+ /// Adjusts path contents when oid is created at name mangled_name.
int lfn_created(
const vector<string> &path, ///< [in] Path to adjust.
- const hobject_t &hoid, ///< [in] Object created.
+ const ghobject_t &oid, ///< [in] Object created.
const string &mangled_name ///< [in] Filename of created object.
);
- /// Removes hoid from path while adjusting path contents
+ /// Removes oid from path while adjusting path contents
int lfn_unlink(
- const vector<string> &path, ///< [in] Path containing hoid.
- const hobject_t &hoid, ///< [in] Object to remove.
+ const vector<string> &path, ///< [in] Path containing oid.
+ const ghobject_t &oid, ///< [in] Object to remove.
const string &mangled_name ///< [in] Filename of object to remove.
);
- ///Transate a file into and hobject_t.
+ /// Translate a file into a ghobject_t.
int lfn_translate(
const vector<string> &path, ///< [in] Path containing the file.
const string &short_name, ///< [in] Filename to translate.
- hobject_t *out ///< [out] Object found.
+ ghobject_t *out ///< [out] Object found.
); ///< @return Negative error code on error, 0 if not an object, 1 else
/* manglers/demanglers */
@@ -478,35 +478,35 @@ private:
/// Generate object name
string lfn_generate_object_name_keyless(
- const hobject_t &hoid ///< [in] Object for which to generate.
+ const ghobject_t &oid ///< [in] Object for which to generate.
); ///< @return Generated object name.
/// Generate object name
string lfn_generate_object_name_poolless(
- const hobject_t &hoid ///< [in] Object for which to generate.
+ const ghobject_t &oid ///< [in] Object for which to generate.
); ///< @return Generated object name.
/// Generate object name
string lfn_generate_object_name(
- const hobject_t &hoid ///< [in] Object for which to generate.
+ const ghobject_t &oid ///< [in] Object for which to generate.
); ///< @return Generated object name.
/// Parse object name
bool lfn_parse_object_name_keyless(
const string &long_name, ///< [in] Name to parse
- hobject_t *out ///< [out] Resulting Object
+ ghobject_t *out ///< [out] Resulting Object
); ///< @return True if successfull, False otherwise.
/// Parse object name
bool lfn_parse_object_name_poolless(
const string &long_name, ///< [in] Name to parse
- hobject_t *out ///< [out] Resulting Object
+ ghobject_t *out ///< [out] Resulting Object
); ///< @return True if successfull, False otherwise.
/// Parse object name
bool lfn_parse_object_name(
const string &long_name, ///< [in] Name to parse
- hobject_t *out ///< [out] Resulting Object
+ ghobject_t *out ///< [out] Resulting Object
); ///< @return True if successfull, False otherwise.
/// Checks whether short_name is a hashed filename.
@@ -521,7 +521,7 @@ private:
/// Generate hashed name.
string lfn_get_short_name(
- const hobject_t &hoid, ///< [in] Object for which to generate.
+ const ghobject_t &oid, ///< [in] Object for which to generate.
int i ///< [in] Index of hashed name to generate.
); ///< @return Hashed filename.
@@ -554,7 +554,7 @@ private:
int decompose_full_path(
const char *in, ///< [in] Full path to object.
vector<string> *out, ///< [out] Path to object at in.
- hobject_t *hoid, ///< [out] Object at in.
+ ghobject_t *oid, ///< [out] Object at in.
string *shortname ///< [out] Filename of object at in.
); ///< @return Error Code, 0 on success.
diff --git a/src/os/LevelDBStore.h b/src/os/LevelDBStore.h
index f3809cf3496..89718ce1987 100644
--- a/src/os/LevelDBStore.h
+++ b/src/os/LevelDBStore.h
@@ -20,6 +20,12 @@
#include "leveldb/filter_policy.h"
#endif
+#include <errno.h>
+#include "common/errno.h"
+#include "common/dout.h"
+#include "include/assert.h"
+#include "common/Formatter.h"
+
#include "common/ceph_context.h"
class PerfCounters;
@@ -300,6 +306,68 @@ public:
return limit;
}
+ /**
+  * Estimate the on-disk size of the store by summing st_size of the
+  * entries found directly under `path`.
+  *
+  * Entries are bucketed by file extension: "sst", "log", and everything
+  * else ("misc"). On success the per-bucket totals and their sum are
+  * stored in @p extra under the keys "sst", "log", "misc" and "total".
+  *
+  * @param extra [out] per-category sizes; not modified on failure.
+  * @return the summed size, or 0 if the directory cannot be opened or an
+  *         entry cannot be stat'ed for a reason other than ENOENT.
+  */
+ virtual uint64_t get_estimated_size(map<string,uint64_t> &extra) {
+   DIR *store_dir = opendir(path.c_str());
+   if (!store_dir) {
+     lderr(cct) << __func__ << " something happened opening the store: "
+                << cpp_strerror(errno) << dendl;
+     return 0;
+   }
+
+   uint64_t total_size = 0;
+   uint64_t sst_size = 0;
+   uint64_t log_size = 0;
+   uint64_t misc_size = 0;
+
+   struct dirent *entry = NULL;
+   while ((entry = readdir(store_dir)) != NULL) {
+     string n(entry->d_name);
+
+     if (n == "." || n == "..")
+       continue;
+
+     string fpath = path + '/' + n;
+     struct stat s;
+     // stat() signals failure with -1 and reports the cause through errno,
+     // so capture errno immediately instead of inspecting the return value.
+     int stat_errno = 0;
+     if (stat(fpath.c_str(), &s) < 0)
+       stat_errno = errno;
+     // we may race against leveldb while reading files; this should only
+     // happen when those files are being updated, data is being shuffled
+     // and files get removed, in which case there's not much of a problem
+     // as we'll get to them next time around.
+     if (stat_errno == ENOENT)
+       continue;  // entry vanished; `s` holds no data for it
+     if (stat_errno != 0) {
+       lderr(cct) << __func__ << " error obtaining stats for " << fpath
+                  << ": " << cpp_strerror(stat_errno) << dendl;
+       goto err;
+     }
+
+     size_t pos = n.find_last_of('.');
+     if (pos == string::npos) {
+       misc_size += s.st_size;
+       continue;
+     }
+
+     string ext = n.substr(pos+1);
+     if (ext == "sst") {
+       sst_size += s.st_size;
+     } else if (ext == "log") {
+       log_size += s.st_size;
+     } else {
+       misc_size += s.st_size;
+     }
+   }
+
+   total_size = sst_size + log_size + misc_size;
+
+   extra["sst"] = sst_size;
+   extra["log"] = log_size;
+   extra["misc"] = misc_size;
+   extra["total"] = total_size;
+
+err:
+   // Reached on success and on failure; on failure total_size is still 0.
+   closedir(store_dir);
+   return total_size;
+ }
+
+
protected:
WholeSpaceIterator _get_iterator() {
return std::tr1::shared_ptr<KeyValueDB::WholeSpaceIteratorImpl>(
diff --git a/src/os/Makefile.am b/src/os/Makefile.am
new file mode 100644
index 00000000000..4f12a6a3278
--- /dev/null
+++ b/src/os/Makefile.am
@@ -0,0 +1,50 @@
+# Sources compiled into libos.la. The library is appended to
+# noinst_LTLIBRARIES below, i.e. it is built but not installed.
+# NOTE(review): paths are spelled os/... (and common/TrackedOp.cc), so this
+# fragment is presumably included from the parent directory's Makefile.am
+# rather than processed on its own -- confirm.
+libos_la_SOURCES = \
+ os/FileJournal.cc \
+ os/FileStore.cc \
+ os/chain_xattr.cc \
+ os/ObjectStore.cc \
+ os/JournalingObjectStore.cc \
+ os/LFNIndex.cc \
+ os/HashIndex.cc \
+ os/IndexManager.cc \
+ os/FlatIndex.cc \
+ os/DBObjectMap.cc \
+ os/LevelDBStore.cc \
+ os/WBThrottle.cc \
+ os/BtrfsFileStoreBackend.cc \
+ os/GenericFileStoreBackend.cc \
+ os/ZFSFileStoreBackend.cc \
+ common/TrackedOp.cc
+noinst_LTLIBRARIES += libos.la
+
+# Headers belonging to the os/ sources; listed under noinst_ so they are
+# tracked by the build but not installed.
+noinst_HEADERS += \
+ os/btrfs_ioctl.h \
+ os/chain_xattr.h \
+ os/CollectionIndex.h \
+ os/FileJournal.h \
+ os/FileStore.h \
+ os/BtrfsFileStoreBackend.h \
+ os/GenericFileStoreBackend.h \
+ os/ZFSFileStoreBackend.h \
+ os/FlatIndex.h \
+ os/HashIndex.h \
+ os/FDCache.h \
+ os/WBThrottle.h \
+ os/IndexManager.h \
+ os/Journal.h \
+ os/JournalingObjectStore.h \
+ os/LFNIndex.h \
+ os/ObjectStore.h \
+ os/SequencerPosition.h \
+ os/ObjectMap.h \
+ os/DBObjectMap.h \
+ os/KeyValueDB.h \
+ os/LevelDBStore.h
+
+# Optional ZFS helper library: only defined when the WITH_LIBZFS conditional
+# is true, and compiled with LIBZFS_CFLAGS added to the common C++ flags.
+if WITH_LIBZFS
+libos_zfs_a_SOURCES = os/ZFS.cc
+libos_zfs_a_CXXFLAGS = ${AM_CXXFLAGS} ${LIBZFS_CFLAGS}
+noinst_LIBRARIES += libos_zfs.a
+noinst_HEADERS += os/ZFS.h
+endif
+
diff --git a/src/os/ObjectMap.h b/src/os/ObjectMap.h
index 5cc1e495de1..7717aac7437 100644
--- a/src/os/ObjectMap.h
+++ b/src/os/ObjectMap.h
@@ -30,102 +30,102 @@ class ObjectMap {
public:
/// Set keys and values from specified map
virtual int set_keys(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
const map<string, bufferlist> &set, ///< [in] key to value map to set
const SequencerPosition *spos=0 ///< [in] sequencer position
) = 0;
/// Set header
virtual int set_header(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
const bufferlist &bl, ///< [in] header to set
const SequencerPosition *spos=0 ///< [in] sequencer position
) = 0;
/// Retrieve header
virtual int get_header(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
bufferlist *bl ///< [out] header to set
) = 0;
- /// Clear all map keys and values from hoid
+ /// Clear all map keys and values from oid
virtual int clear(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
const SequencerPosition *spos=0 ///< [in] sequencer position
) = 0;
- /// Clear all map keys and values from hoid
+ /// Clear the keys listed in to_clear (and their values) from oid
virtual int rm_keys(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
const set<string> &to_clear, ///< [in] Keys to clear
const SequencerPosition *spos=0 ///< [in] sequencer position
) = 0;
/// Get all keys and values
virtual int get(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
bufferlist *header, ///< [out] Returned Header
map<string, bufferlist> *out ///< [out] Returned keys and values
) = 0;
/// Get values for supplied keys
virtual int get_keys(
- const hobject_t &hoid, ///< [in] object containing map
- set<string> *keys ///< [out] Keys defined on hoid
+ const ghobject_t &oid, ///< [in] object containing map
+ set<string> *keys ///< [out] Keys defined on oid
) = 0;
/// Get values for supplied keys
virtual int get_values(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
const set<string> &keys, ///< [in] Keys to get
map<string, bufferlist> *out ///< [out] Returned keys and values
) = 0;
/// Check key existence
virtual int check_keys(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
const set<string> &keys, ///< [in] Keys to check
- set<string> *out ///< [out] Subset of keys defined on hoid
+ set<string> *out ///< [out] Subset of keys defined on oid
) = 0;
/// Get xattrs
virtual int get_xattrs(
- const hobject_t &hoid, ///< [in] object
+ const ghobject_t &oid, ///< [in] object
const set<string> &to_get, ///< [in] keys to get
map<string, bufferlist> *out ///< [out] subset of attrs/vals defined
) = 0;
/// Get all xattrs
virtual int get_all_xattrs(
- const hobject_t &hoid, ///< [in] object
+ const ghobject_t &oid, ///< [in] object
set<string> *out ///< [out] attrs and values
) = 0;
/// set xattrs in to_set
virtual int set_xattrs(
- const hobject_t &hoid, ///< [in] object
+ const ghobject_t &oid, ///< [in] object
const map<string, bufferlist> &to_set,///< [in] attrs/values to set
const SequencerPosition *spos=0 ///< [in] sequencer position
) = 0;
/// remove xattrs in to_remove
virtual int remove_xattrs(
- const hobject_t &hoid, ///< [in] object
+ const ghobject_t &oid, ///< [in] object
const set<string> &to_remove, ///< [in] attrs to remove
const SequencerPosition *spos=0 ///< [in] sequencer position
) = 0;
- /// Clone keys efficiently from hoid map to target map
+ /// Clone keys efficiently from oid map to target map
virtual int clone(
- const hobject_t &hoid, ///< [in] object containing map
- const hobject_t &target, ///< [in] target of clone
+ const ghobject_t &oid, ///< [in] object containing map
+ const ghobject_t &target, ///< [in] target of clone
const SequencerPosition *spos=0 ///< [in] sequencer position
) { return 0; }
/// Ensure all previous writes are durable
virtual int sync(
- const hobject_t *hoid=0, ///< [in] object
+ const ghobject_t *oid=0, ///< [in] object
const SequencerPosition *spos=0 ///< [in] Sequencer
) { return 0; }
@@ -144,7 +144,7 @@ public:
virtual ~ObjectMapIteratorImpl() {}
};
typedef std::tr1::shared_ptr<ObjectMapIteratorImpl> ObjectMapIterator;
- virtual ObjectMapIterator get_iterator(const hobject_t &hoid) {
+ virtual ObjectMapIterator get_iterator(const ghobject_t &oid) {
return ObjectMapIterator();
}
diff --git a/src/os/ObjectStore.cc b/src/os/ObjectStore.cc
index 9d8b989225b..1a1bbcb0b67 100644
--- a/src/os/ObjectStore.cc
+++ b/src/os/ObjectStore.cc
@@ -15,6 +15,7 @@
#include <tr1/memory>
#include "ObjectStore.h"
#include "common/Formatter.h"
+#include "FileStore.h"
ostream& operator<<(ostream& out, const ObjectStore::Sequencer& s)
{
@@ -77,7 +78,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_TOUCH:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
f->dump_string("op_name", "touch");
f->dump_stream("collection") << cid;
f->dump_stream("oid") << oid;
@@ -87,7 +88,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_WRITE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
uint64_t off = i.get_length();
uint64_t len = i.get_length();
bufferlist bl;
@@ -104,7 +105,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_ZERO:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
uint64_t off = i.get_length();
uint64_t len = i.get_length();
f->dump_string("op_name", "zero");
@@ -118,7 +119,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_TRIMCACHE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
uint64_t off = i.get_length();
uint64_t len = i.get_length();
f->dump_string("op_name", "trim_cache");
@@ -132,7 +133,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_TRUNCATE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
uint64_t off = i.get_length();
f->dump_string("op_name", "truncate");
f->dump_stream("collection") << cid;
@@ -144,7 +145,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_REMOVE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
f->dump_string("op_name", "remove");
f->dump_stream("collection") << cid;
f->dump_stream("oid") << oid;
@@ -154,7 +155,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_SETATTR:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
string name = i.get_attrname();
bufferlist bl;
i.get_bl(bl);
@@ -169,7 +170,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_SETATTRS:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
map<string, bufferptr> aset;
i.get_attrset(aset);
f->dump_string("op_name", "setattrs");
@@ -187,7 +188,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_RMATTR:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
string name = i.get_attrname();
f->dump_string("op_name", "rmattr");
f->dump_stream("collection") << cid;
@@ -199,7 +200,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_RMATTRS:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
f->dump_string("op_name", "rmattrs");
f->dump_stream("collection") << cid;
f->dump_stream("oid") << oid;
@@ -209,8 +210,8 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_CLONE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
- hobject_t noid = i.get_oid();
+ ghobject_t oid = i.get_oid();
+ ghobject_t noid = i.get_oid();
f->dump_string("op_name", "clone");
f->dump_stream("collection") << cid;
f->dump_stream("src_oid") << oid;
@@ -221,8 +222,8 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_CLONERANGE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
- hobject_t noid = i.get_oid();
+ ghobject_t oid = i.get_oid();
+ ghobject_t noid = i.get_oid();
uint64_t off = i.get_length();
uint64_t len = i.get_length();
f->dump_string("op_name", "clonerange");
@@ -237,8 +238,8 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_CLONERANGE2:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
- hobject_t noid = i.get_oid();
+ ghobject_t oid = i.get_oid();
+ ghobject_t noid = i.get_oid();
uint64_t srcoff = i.get_length();
uint64_t len = i.get_length();
uint64_t dstoff = i.get_length();
@@ -272,7 +273,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
{
coll_t ocid = i.get_cid();
coll_t ncid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
f->dump_string("op_name", "collection_add");
f->dump_stream("src_collection") << ocid;
f->dump_stream("dst_collection") << ncid;
@@ -283,7 +284,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_COLL_REMOVE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
f->dump_string("op_name", "collection_remove");
f->dump_stream("collection") << cid;
f->dump_stream("oid") << oid;
@@ -294,7 +295,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
{
coll_t ocid = i.get_cid();
coll_t ncid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
f->open_object_section("collection_move");
f->dump_stream("src_collection") << ocid;
f->dump_stream("dst_collection") << ncid;
@@ -344,7 +345,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_OMAP_CLEAR:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
f->dump_string("op_name", "omap_clear");
f->dump_stream("collection") << cid;
f->dump_stream("oid") << oid;
@@ -354,7 +355,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_OMAP_SETKEYS:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
map<string, bufferlist> aset;
i.get_attrset(aset);
f->dump_string("op_name", "omap_setkeys");
@@ -372,7 +373,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_OMAP_RMKEYS:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
set<string> keys;
i.get_keyset(keys);
f->dump_string("op_name", "omap_rmkeys");
@@ -384,7 +385,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_OMAP_SETHEADER:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
bufferlist bl;
i.get_bl(bl);
f->dump_string("op_name", "omap_setheader");
@@ -425,7 +426,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_OMAP_RMKEYRANGE:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
string first, last;
first = i.get_key();
last = i.get_key();
@@ -460,9 +461,9 @@ void ObjectStore::Transaction::generate_test_instances(list<ObjectStore::Transac
t = new Transaction;
coll_t c("foocoll");
coll_t c2("foocoll2");
- hobject_t o1("obj", "", 123, 456, -1, "");
- hobject_t o2("obj2", "", 123, 456, -1, "");
- hobject_t o3("obj3", "", 123, 456, -1, "");
+ ghobject_t o1(hobject_t("obj", "", 123, 456, -1, ""));
+ ghobject_t o2(hobject_t("obj2", "", 123, 456, -1, ""));
+ ghobject_t o3(hobject_t("obj3", "", 123, 456, -1, ""));
t->touch(c, o1);
bufferlist bl;
bl.append("some data");
@@ -497,3 +498,44 @@ void ObjectStore::Transaction::generate_test_instances(list<ObjectStore::Transac
o.push_back(t);
}
+// hobject_t flavour of collection_list(): runs the vector<ghobject_t>
+// overload and, only if it returns 0, appends the hobj member of every
+// listed entry to o. The overload's return value is passed through.
+int ObjectStore::collection_list(coll_t c, vector<hobject_t>& o)
+{
+  vector<ghobject_t> listed;
+  const int r = collection_list(c, listed);
+  if (r != 0)
+    return r;
+
+  o.reserve(listed.size());
+  for (vector<ghobject_t>::iterator it = listed.begin();
+       it != listed.end(); ++it) {
+    o.push_back(it->hobj);
+  }
+  return r;
+}
+
+// hobject_t flavour of collection_list_partial(): wraps start in a
+// ghobject_t, runs the ghobject_t overload and, only if it returns 0,
+// stores the hobj part of the continuation point in *next and appends the
+// hobj member of every listed entry to *ls.
+int ObjectStore::collection_list_partial(coll_t c, hobject_t start,
+                                         int min, int max, snapid_t snap,
+                                         vector<hobject_t> *ls, hobject_t *next)
+{
+  vector<ghobject_t> listed;
+  ghobject_t next_g;
+  ghobject_t start_g(start);
+  const int r = collection_list_partial(c, start_g, min, max, snap,
+                                        &listed, &next_g);
+  if (r != 0)
+    return r;
+
+  *next = next_g.hobj;
+  ls->reserve(listed.size());
+  for (vector<ghobject_t>::iterator it = listed.begin();
+       it != listed.end(); ++it) {
+    ls->push_back(it->hobj);
+  }
+  return r;
+}
+
+// hobject_t flavour of collection_list_range(): wraps start and end in
+// ghobject_t, runs the ghobject_t overload and, only if it returns 0,
+// appends the hobj member of every listed entry to *ls.
+int ObjectStore::collection_list_range(coll_t c, hobject_t start, hobject_t end,
+                                       snapid_t seq, vector<hobject_t> *ls)
+{
+  vector<ghobject_t> listed;
+  ghobject_t start_g(start);
+  ghobject_t end_g(end);
+  const int r = collection_list_range(c, start_g, end_g, seq, &listed);
+  if (r != 0)
+    return r;
+
+  ls->reserve(listed.size());
+  for (vector<ghobject_t>::iterator it = listed.begin();
+       it != listed.end(); ++it) {
+    ls->push_back(it->hobj);
+  }
+  return r;
+}
diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h
index eb5b40c5a69..07473b344f5 100644
--- a/src/os/ObjectStore.h
+++ b/src/os/ObjectStore.h
@@ -159,6 +159,7 @@ public:
OP_SPLIT_COLLECTION2 = 36, /* cid, bits, destination
doesn't create the destination */
OP_OMAP_RMKEYRANGE = 37, // cid, oid, firstkey, lastkey
+ OP_COLL_MOVE_RENAME = 38, // oldcid, oldoid, newcid, newoid
};
private:
@@ -339,21 +340,23 @@ public:
void get_bl(bufferlist& bl) {
::decode(bl, p);
}
- hobject_t get_oid() {
- hobject_t hoid;
+ // Decode the next object id from p and return it as a ghobject_t.
+ // With sobject_encoding set, a sobject_t is decoded and only its snap and
+ // oid are copied into the result, with generation/shard_id set to
+ // NO_GEN/NO_SHARD. Otherwise a ghobject_t is decoded directly and, when
+ // use_pool_override is set and pool_override is not -1, a decoded pool of
+ // -1 is replaced by pool_override.
+ ghobject_t get_oid() {
+ ghobject_t oid;
if (sobject_encoding) {
sobject_t soid;
::decode(soid, p);
- hoid.snap = soid.snap;
- hoid.oid = soid.oid;
+ oid.hobj.snap = soid.snap;
+ oid.hobj.oid = soid.oid;
+ oid.generation = ghobject_t::NO_GEN;
+ oid.shard_id = ghobject_t::NO_SHARD;
} else {
- ::decode(hoid, p);
+ ::decode(oid, p);
if (use_pool_override && pool_override != -1 &&
- hoid.pool == -1) {
- hoid.pool = pool_override;
+ oid.hobj.pool == -1) {
+ oid.hobj.pool = pool_override;
}
}
- return hoid;
+ return oid;
}
coll_t get_cid() {
coll_t c;
@@ -407,14 +410,14 @@ public:
::encode(op, tbl);
ops++;
}
- void touch(coll_t cid, const hobject_t& oid) {
+ void touch(coll_t cid, const ghobject_t& oid) {
__u32 op = OP_TOUCH;
::encode(op, tbl);
::encode(cid, tbl);
::encode(oid, tbl);
ops++;
}
- void write(coll_t cid, const hobject_t& oid, uint64_t off, uint64_t len, const bufferlist& data) {
+ void write(coll_t cid, const ghobject_t& oid, uint64_t off, uint64_t len, const bufferlist& data) {
__u32 op = OP_WRITE;
::encode(op, tbl);
::encode(cid, tbl);
@@ -430,7 +433,7 @@ public:
::encode(data, tbl);
ops++;
}
- void zero(coll_t cid, const hobject_t& oid, uint64_t off, uint64_t len) {
+ void zero(coll_t cid, const ghobject_t& oid, uint64_t off, uint64_t len) {
__u32 op = OP_ZERO;
::encode(op, tbl);
::encode(cid, tbl);
@@ -439,7 +442,7 @@ public:
::encode(len, tbl);
ops++;
}
- void truncate(coll_t cid, const hobject_t& oid, uint64_t off) {
+ void truncate(coll_t cid, const ghobject_t& oid, uint64_t off) {
__u32 op = OP_TRUNCATE;
::encode(op, tbl);
::encode(cid, tbl);
@@ -447,18 +450,18 @@ public:
::encode(off, tbl);
ops++;
}
- void remove(coll_t cid, const hobject_t& oid) {
+ void remove(coll_t cid, const ghobject_t& oid) {
__u32 op = OP_REMOVE;
::encode(op, tbl);
::encode(cid, tbl);
::encode(oid, tbl);
ops++;
}
- void setattr(coll_t cid, const hobject_t& oid, const char* name, bufferlist& val) {
+ void setattr(coll_t cid, const ghobject_t& oid, const char* name, bufferlist& val) {
string n(name);
setattr(cid, oid, n, val);
}
- void setattr(coll_t cid, const hobject_t& oid, const string& s, bufferlist& val) {
+ void setattr(coll_t cid, const ghobject_t& oid, const string& s, bufferlist& val) {
__u32 op = OP_SETATTR;
::encode(op, tbl);
::encode(cid, tbl);
@@ -467,7 +470,7 @@ public:
::encode(val, tbl);
ops++;
}
- void setattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>& attrset) {
+ void setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& attrset) {
__u32 op = OP_SETATTRS;
::encode(op, tbl);
::encode(cid, tbl);
@@ -475,11 +478,19 @@ public:
::encode(attrset, tbl);
ops++;
}
- void rmattr(coll_t cid, const hobject_t& oid, const char *name) {
+ /// Set several attributes on oid at once; values are given as bufferlists.
+ /// Encodes an OP_SETATTRS op followed by cid, oid and the attribute map.
+ // NOTE(review): this overload takes hobject_t while the bufferptr overload
+ // of setattrs takes ghobject_t -- confirm whether that is intentional.
+ void setattrs(coll_t cid, const hobject_t& oid, map<string,bufferlist>& attrset) {
+ __u32 op = OP_SETATTRS;
+ ::encode(op, tbl);
+ ::encode(cid, tbl);
+ ::encode(oid, tbl);
+ ::encode(attrset, tbl);
+ ops++;
+ }
+ void rmattr(coll_t cid, const ghobject_t& oid, const char *name) {
string n(name);
rmattr(cid, oid, n);
}
- void rmattr(coll_t cid, const hobject_t& oid, const string& s) {
+ void rmattr(coll_t cid, const ghobject_t& oid, const string& s) {
__u32 op = OP_RMATTR;
::encode(op, tbl);
::encode(cid, tbl);
@@ -487,14 +498,14 @@ public:
::encode(s, tbl);
ops++;
}
- void rmattrs(coll_t cid, const hobject_t& oid) {
+ /// Remove all attributes from oid.
+ /// Encodes the op code followed by cid and oid only (no attribute name).
+ void rmattrs(coll_t cid, const ghobject_t& oid) {
- __u32 op = OP_RMATTR;
+ // Use OP_RMATTRS, not OP_RMATTR: rmattr() encodes an attribute name after
+ // the oid, which this op does not carry.
+ // NOTE(review): OP_RMATTRS is not declared in the visible hunks -- confirm
+ // it exists in the op enum.
+ __u32 op = OP_RMATTRS;
::encode(op, tbl);
::encode(cid, tbl);
::encode(oid, tbl);
ops++;
}
- void clone(coll_t cid, const hobject_t& oid, hobject_t noid) {
+ void clone(coll_t cid, const ghobject_t& oid, ghobject_t noid) {
__u32 op = OP_CLONE;
::encode(op, tbl);
::encode(cid, tbl);
@@ -502,7 +513,7 @@ public:
::encode(noid, tbl);
ops++;
}
- void clone_range(coll_t cid, const hobject_t& oid, hobject_t noid,
+ void clone_range(coll_t cid, const ghobject_t& oid, ghobject_t noid,
uint64_t srcoff, uint64_t srclen, uint64_t dstoff) {
__u32 op = OP_CLONERANGE2;
::encode(op, tbl);
@@ -526,7 +537,7 @@ public:
::encode(cid, tbl);
ops++;
}
- void collection_add(coll_t cid, coll_t ocid, const hobject_t& oid) {
+ void collection_add(coll_t cid, coll_t ocid, const ghobject_t& oid) {
__u32 op = OP_COLL_ADD;
::encode(op, tbl);
::encode(cid, tbl);
@@ -534,18 +545,27 @@ public:
::encode(oid, tbl);
ops++;
}
- void collection_remove(coll_t cid, const hobject_t& oid) {
+ void collection_remove(coll_t cid, const ghobject_t& oid) {
__u32 op = OP_COLL_REMOVE;
::encode(op, tbl);
::encode(cid, tbl);
::encode(oid, tbl);
ops++;
}
- void collection_move(coll_t cid, coll_t oldcid, const hobject_t& oid) {
+ void collection_move(coll_t cid, coll_t oldcid, const ghobject_t& oid) {
collection_add(cid, oldcid, oid);
collection_remove(oldcid, oid);
return;
}
+ /// Queue an OP_COLL_MOVE_RENAME op.
+ /// Encoded payload order: oldcid, oldoid, cid, oid (matches the comment on
+ /// the OP_COLL_MOVE_RENAME enum value).
+ void collection_move_rename(coll_t oldcid, const ghobject_t& oldoid,
+ coll_t cid, const ghobject_t& oid) {
+ __u32 op = OP_COLL_MOVE_RENAME;
+ ::encode(op, tbl);
+ ::encode(oldcid, tbl);
+ ::encode(oldoid, tbl);
+ ::encode(cid, tbl);
+ ::encode(oid, tbl);
+ // count this op like every other op encoder in this class does
+ ops++;
+ }
void collection_setattr(coll_t cid, const char* name, bufferlist& val) {
string n(name);
@@ -578,6 +598,13 @@ public:
::encode(aset, tbl);
ops++;
}
+ void collection_setattrs(coll_t cid, map<string,bufferlist>& aset) {
+ __u32 op = OP_COLL_SETATTRS;
+ ::encode(op, tbl);
+ ::encode(cid, tbl);
+ ::encode(aset, tbl);
+ ops++;
+ }
void collection_rename(coll_t cid, coll_t ncid) {
__u32 op = OP_COLL_RENAME;
::encode(op, tbl);
@@ -586,55 +613,55 @@ public:
ops++;
}
- /// Remove omap from hoid
+ /// Remove omap from oid
void omap_clear(
- coll_t cid, ///< [in] Collection containing hoid
- const hobject_t &hoid ///< [in] Object from which to remove omap
+ coll_t cid, ///< [in] Collection containing oid
+ const ghobject_t &oid ///< [in] Object from which to remove omap
) {
__u32 op = OP_OMAP_CLEAR;
::encode(op, tbl);
::encode(cid, tbl);
- ::encode(hoid, tbl);
+ ::encode(oid, tbl);
ops++;
}
- /// Set keys on hoid omap. Replaces duplicate keys.
+ /// Set keys on oid omap. Replaces duplicate keys.
void omap_setkeys(
- coll_t cid, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object to update
+ coll_t cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object to update
const map<string, bufferlist> &attrset ///< [in] Replacement keys and values
) {
__u32 op = OP_OMAP_SETKEYS;
::encode(op, tbl);
::encode(cid, tbl);
- ::encode(hoid, tbl);
+ ::encode(oid, tbl);
::encode(attrset, tbl);
ops++;
}
- /// Remove keys from hoid omap
+ /// Remove keys from oid omap
void omap_rmkeys(
- coll_t cid, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object from which to remove the omap
+ coll_t cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object from which to remove the omap
const set<string> &keys ///< [in] Keys to clear
) {
__u32 op = OP_OMAP_RMKEYS;
::encode(op, tbl);
::encode(cid, tbl);
- ::encode(hoid, tbl);
+ ::encode(oid, tbl);
::encode(keys, tbl);
ops++;
}
- /// Remove key range from hoid omap
+ /// Remove key range from oid omap
void omap_rmkeyrange(
- coll_t cid, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object from which to remove the omap
+ coll_t cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object from which to remove the omap
const string& first, ///< [in] first key in range
const string& last ///< [in] first key past range
) {
__u32 op = OP_OMAP_RMKEYRANGE;
::encode(op, tbl);
::encode(cid, tbl);
- ::encode(hoid, tbl);
+ ::encode(oid, tbl);
::encode(first, tbl);
::encode(last, tbl);
ops++;
@@ -642,14 +669,14 @@ public:
/// Set omap header
void omap_setheader(
- coll_t cid, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object from which to remove the omap
+ coll_t cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object from which to remove the omap
const bufferlist &bl ///< [in] Header value
) {
__u32 op = OP_OMAP_SETHEADER;
::encode(op, tbl);
::encode(cid, tbl);
- ::encode(hoid, tbl);
+ ::encode(oid, tbl);
::encode(bl, tbl);
ops++;
}
@@ -832,6 +859,8 @@ public:
virtual int get_max_object_name_length() = 0;
virtual int mkfs() = 0; // wipe
virtual int mkjournal() = 0; // journal only
+ virtual void set_allow_sharded_objects() = 0;
+ virtual bool get_allow_sharded_objects() = 0;
virtual int statfs(struct statfs *buf) = 0;
@@ -850,32 +879,32 @@ public:
virtual int get_ideal_list_max() { return 64; }
// objects
- virtual bool exists(coll_t cid, const hobject_t& oid) = 0; // useful?
+ virtual bool exists(coll_t cid, const ghobject_t& oid) = 0; // useful?
virtual int stat(
coll_t cid,
- const hobject_t& oid,
+ const ghobject_t& oid,
struct stat *st,
bool allow_eio = false) = 0; // struct stat?
virtual int read(
coll_t cid,
- const hobject_t& oid,
+ const ghobject_t& oid,
uint64_t offset,
size_t len,
bufferlist& bl,
bool allow_eio = false) = 0;
- virtual int fiemap(coll_t cid, const hobject_t& oid, uint64_t offset, size_t len, bufferlist& bl) = 0;
+ virtual int fiemap(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl) = 0;
- virtual int getattr(coll_t cid, const hobject_t& oid, const char *name, bufferptr& value) = 0;
- int getattr(coll_t cid, const hobject_t& oid, const char *name, bufferlist& value) {
+ virtual int getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferptr& value) = 0;
+ int getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferlist& value) {
bufferptr bp;
int r = getattr(cid, oid, name, bp);
if (bp.length())
value.push_back(bp);
return r;
}
- virtual int getattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>& aset, bool user_only = false) {return 0;};
+ virtual int getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset, bool user_only = false) {return 0;};
// collections
@@ -890,7 +919,7 @@ public:
virtual int collection_getattr(coll_t cid, const char *name, bufferlist& bl) = 0;
virtual int collection_getattrs(coll_t cid, map<string,bufferptr> &aset) = 0;
virtual bool collection_empty(coll_t c) = 0;
- virtual int collection_list(coll_t c, vector<hobject_t>& o) = 0;
+ virtual int collection_list(coll_t c, vector<ghobject_t>& o) = 0;
/**
* list partial contents of collection relative to a hash offset/position
@@ -904,9 +933,9 @@ public:
* @param next [out] next item sorts >= this value
* @return zero on success, or negative error
*/
- virtual int collection_list_partial(coll_t c, hobject_t start,
+ virtual int collection_list_partial(coll_t c, ghobject_t start,
int min, int max, snapid_t snap,
- vector<hobject_t> *ls, hobject_t *next) = 0;
+ vector<ghobject_t> *ls, ghobject_t *next) = 0;
/**
* list contents of a collection that fall in the range [start, end)
@@ -918,47 +947,57 @@ public:
* @param ls [out] result
* @return zero on success, or negative error
*/
- virtual int collection_list_range(coll_t c, hobject_t start, hobject_t end,
- snapid_t seq, vector<hobject_t> *ls) = 0;
+ virtual int collection_list_range(coll_t c, ghobject_t start, ghobject_t end,
+ snapid_t seq, vector<ghobject_t> *ls) = 0;
+
+ //TODO: Remove
+ int collection_list(coll_t c, vector<hobject_t>& o);
+
+ int collection_list_partial(coll_t c, hobject_t start,
+ int min, int max, snapid_t snap,
+ vector<hobject_t> *ls, hobject_t *next);
+
+ int collection_list_range(coll_t c, hobject_t start, hobject_t end,
+ snapid_t seq, vector<hobject_t> *ls);
/// OMAP
/// Get omap contents
virtual int omap_get(
- coll_t c, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object containing omap
+ coll_t c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
bufferlist *header, ///< [out] omap header
map<string, bufferlist> *out /// < [out] Key to value map
) = 0;
/// Get omap header
virtual int omap_get_header(
- coll_t c, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object containing omap
+ coll_t c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
bufferlist *header, ///< [out] omap header
bool allow_eio = false ///< [in] don't assert on eio
) = 0;
- /// Get keys defined on hoid
+ /// Get keys defined on oid
virtual int omap_get_keys(
- coll_t c, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object containing omap
- set<string> *keys ///< [out] Keys defined on hoid
+ coll_t c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ set<string> *keys ///< [out] Keys defined on oid
) = 0;
/// Get key values
virtual int omap_get_values(
- coll_t c, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object containing omap
+ coll_t c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
const set<string> &keys, ///< [in] Keys to get
map<string, bufferlist> *out ///< [out] Returned keys and values
) = 0;
- /// Filters keys into out which are defined on hoid
+ /// Filters keys into out which are defined on oid
virtual int omap_check_keys(
- coll_t c, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object containing omap
+ coll_t c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
const set<string> &keys, ///< [in] Keys to check
- set<string> *out ///< [out] Subset of keys defined on hoid
+ set<string> *out ///< [out] Subset of keys defined on oid
) = 0;
/**
@@ -972,7 +1011,7 @@ public:
*/
virtual ObjectMap::ObjectMapIterator get_omap_iterator(
coll_t c, ///< [in] collection
- const hobject_t &hoid ///< [in] object
+ const ghobject_t &oid ///< [in] object
) = 0;
virtual void sync(Context *onsync) {}
@@ -988,8 +1027,8 @@ public:
virtual uuid_d get_fsid() = 0;
// DEBUG
- virtual void inject_data_error(const hobject_t &oid) {}
- virtual void inject_mdata_error(const hobject_t &oid) {}
+ virtual void inject_data_error(const ghobject_t &oid) {}
+ virtual void inject_mdata_error(const ghobject_t &oid) {}
};
diff --git a/src/os/WBThrottle.cc b/src/os/WBThrottle.cc
index 23e24765cc2..e02c17677bb 100644
--- a/src/os/WBThrottle.cc
+++ b/src/os/WBThrottle.cc
@@ -116,7 +116,7 @@ void WBThrottle::handle_conf_change(const md_config_t *conf,
}
bool WBThrottle::get_next_should_flush(
- boost::tuple<hobject_t, FDRef, PendingWB> *next)
+ boost::tuple<ghobject_t, FDRef, PendingWB> *next)
{
assert(lock.is_locked());
assert(next);
@@ -128,9 +128,9 @@ bool WBThrottle::get_next_should_flush(
if (stopping)
return false;
assert(!pending_wbs.empty());
- hobject_t obj(pop_object());
+ ghobject_t obj(pop_object());
- map<hobject_t, pair<PendingWB, FDRef> >::iterator i =
+ map<ghobject_t, pair<PendingWB, FDRef> >::iterator i =
pending_wbs.find(obj);
*next = boost::make_tuple(obj, i->second.second, i->second.first);
pending_wbs.erase(i);
@@ -141,32 +141,32 @@ bool WBThrottle::get_next_should_flush(
void *WBThrottle::entry()
{
Mutex::Locker l(lock);
- boost::tuple<hobject_t, FDRef, PendingWB> wb;
+ boost::tuple<ghobject_t, FDRef, PendingWB> wb;
while (get_next_should_flush(&wb)) {
clearing = wb.get<0>();
lock.Unlock();
- ::fsync(**wb.get<1>());
+ ::fdatasync(**wb.get<1>());
if (wb.get<2>().nocache)
posix_fadvise(**wb.get<1>(), 0, 0, POSIX_FADV_DONTNEED);
lock.Lock();
- clearing = hobject_t();
+ clearing = ghobject_t();
cur_ios -= wb.get<2>().ios;
logger->dec(l_wbthrottle_ios_dirtied, wb.get<2>().ios);
cur_size -= wb.get<2>().size;
logger->dec(l_wbthrottle_bytes_dirtied, wb.get<2>().size);
logger->dec(l_wbthrottle_inodes_dirtied);
cond.Signal();
- wb = boost::tuple<hobject_t, FDRef, PendingWB>();
+ wb = boost::tuple<ghobject_t, FDRef, PendingWB>();
}
return 0;
}
void WBThrottle::queue_wb(
- FDRef fd, const hobject_t &hoid, uint64_t offset, uint64_t len,
+ FDRef fd, const ghobject_t &hoid, uint64_t offset, uint64_t len,
bool nocache)
{
Mutex::Locker l(lock);
- map<hobject_t, pair<PendingWB, FDRef> >::iterator wbiter =
+ map<ghobject_t, pair<PendingWB, FDRef> >::iterator wbiter =
pending_wbs.find(hoid);
if (wbiter == pending_wbs.end()) {
wbiter = pending_wbs.insert(
@@ -192,7 +192,7 @@ void WBThrottle::queue_wb(
void WBThrottle::clear()
{
Mutex::Locker l(lock);
- for (map<hobject_t, pair<PendingWB, FDRef> >::iterator i =
+ for (map<ghobject_t, pair<PendingWB, FDRef> >::iterator i =
pending_wbs.begin();
i != pending_wbs.end();
++i) {
@@ -208,12 +208,12 @@ void WBThrottle::clear()
cond.Signal();
}
-void WBThrottle::clear_object(const hobject_t &hoid)
+void WBThrottle::clear_object(const ghobject_t &hoid)
{
Mutex::Locker l(lock);
while (clearing == hoid)
cond.Wait(lock);
- map<hobject_t, pair<PendingWB, FDRef> >::iterator i =
+ map<ghobject_t, pair<PendingWB, FDRef> >::iterator i =
pending_wbs.find(hoid);
if (i == pending_wbs.end())
return;
diff --git a/src/os/WBThrottle.h b/src/os/WBThrottle.h
index 070de08e123..e418cf98d2a 100644
--- a/src/os/WBThrottle.h
+++ b/src/os/WBThrottle.h
@@ -20,7 +20,7 @@
#include <tr1/memory>
#include "include/buffer.h"
#include "common/Formatter.h"
-#include "os/hobject.h"
+#include "common/hobject.h"
#include "include/interval_set.h"
#include "FDCache.h"
#include "common/Thread.h"
@@ -44,7 +44,7 @@ enum {
* Tracks, throttles, and flushes outstanding IO
*/
class WBThrottle : Thread, public md_config_obs_t {
- hobject_t clearing;
+ ghobject_t clearing;
/* *_limits.first is the start_flusher limit and
* *_limits.second is the hard limit
@@ -89,36 +89,36 @@ class WBThrottle : Thread, public md_config_obs_t {
/**
* Flush objects in lru order
*/
- list<hobject_t> lru;
- map<hobject_t, list<hobject_t>::iterator> rev_lru;
- void remove_object(const hobject_t &hoid) {
+ list<ghobject_t> lru;
+ map<ghobject_t, list<ghobject_t>::iterator> rev_lru;
+ void remove_object(const ghobject_t &oid) {
assert(lock.is_locked());
- map<hobject_t, list<hobject_t>::iterator>::iterator iter =
- rev_lru.find(hoid);
+ map<ghobject_t, list<ghobject_t>::iterator>::iterator iter =
+ rev_lru.find(oid);
if (iter == rev_lru.end())
return;
lru.erase(iter->second);
rev_lru.erase(iter);
}
- hobject_t pop_object() {
+ ghobject_t pop_object() {
assert(!lru.empty());
- hobject_t hoid(lru.front());
+ ghobject_t oid(lru.front());
lru.pop_front();
- rev_lru.erase(hoid);
- return hoid;
+ rev_lru.erase(oid);
+ return oid;
}
- void insert_object(const hobject_t &hoid) {
- assert(rev_lru.find(hoid) == rev_lru.end());
- lru.push_back(hoid);
- rev_lru.insert(make_pair(hoid, --lru.end()));
+ void insert_object(const ghobject_t &oid) {
+ assert(rev_lru.find(oid) == rev_lru.end());
+ lru.push_back(oid);
+ rev_lru.insert(make_pair(oid, --lru.end()));
}
- map<hobject_t, pair<PendingWB, FDRef> > pending_wbs;
+ map<ghobject_t, pair<PendingWB, FDRef> > pending_wbs;
/// get next flush to perform
bool get_next_should_flush(
- boost::tuple<hobject_t, FDRef, PendingWB> *next ///< [out] next to flush
+ boost::tuple<ghobject_t, FDRef, PendingWB> *next ///< [out] next to flush
); ///< @return false if we are shutting down
public:
enum FS {
@@ -141,10 +141,10 @@ public:
set_from_conf();
}
- /// Queue wb on hoid, fd taking throttle (does not block)
+ /// Queue wb on oid, fd taking throttle (does not block)
void queue_wb(
- FDRef fd, ///< [in] FDRef to hoid
- const hobject_t &hoid, ///< [in] object
+ FDRef fd, ///< [in] FDRef to oid
+ const ghobject_t &oid, ///< [in] object
uint64_t offset, ///< [in] offset written
uint64_t len, ///< [in] length written
bool nocache ///< [in] try to clear out of cache after write
@@ -154,7 +154,7 @@ public:
void clear();
/// Clear object
- void clear_object(const hobject_t &hoid);
+ void clear_object(const ghobject_t &oid);
/// Block until there is throttle available
void throttle();
diff --git a/src/os/ZFSFileStoreBackend.cc b/src/os/ZFSFileStoreBackend.cc
index 0f01bd20ee7..aa52b8d2933 100644
--- a/src/os/ZFSFileStoreBackend.cc
+++ b/src/os/ZFSFileStoreBackend.cc
@@ -1,7 +1,9 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
-#include <inttypes.h>
+#include "include/int_types.h"
+#include "include/types.h"
+
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
@@ -12,7 +14,6 @@
#include "include/compat.h"
#include "include/linux_fiemap.h"
-#include "include/types.h"
#include "include/color.h"
#include "include/buffer.h"
#include "include/assert.h"
diff --git a/src/os/chain_xattr.cc b/src/os/chain_xattr.cc
index 96f334f8d00..62733e390d3 100644
--- a/src/os/chain_xattr.cc
+++ b/src/os/chain_xattr.cc
@@ -3,7 +3,8 @@
#include "chain_xattr.h"
-#include <inttypes.h>
+#include "include/int_types.h"
+
#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
@@ -21,10 +22,6 @@
#include <linux/fs.h>
#endif
-#if defined(__FreeBSD__)
-#include "include/inttypes.h"
-#endif
-
#include "common/xattr.h"
/*
diff --git a/src/osd/Ager.cc b/src/osd/Ager.cc
index a90b1c1855d..f94da1f4218 100644
--- a/src/osd/Ager.cc
+++ b/src/osd/Ager.cc
@@ -8,7 +8,6 @@
#include "common/Clock.h"
#include "common/debug.h"
-#include "global/global_context.h"
// ick
#include <sys/types.h>
@@ -59,7 +58,7 @@ uint64_t Ager::age_fill(float pc, utime_t until) {
bl.push_back(bp);
uint64_t wrote = 0;
while (1) {
- if (ceph_clock_now(g_ceph_context) > until) break;
+ if (ceph_clock_now(cct) > until) break;
struct statfs st;
store->statfs(&st);
@@ -176,7 +175,7 @@ void Ager::age(int time,
srand(0);
- utime_t start = ceph_clock_now(g_ceph_context);
+ utime_t start = ceph_clock_now(cct);
utime_t until = start;
until.sec_ref() += time;
@@ -223,7 +222,7 @@ void Ager::age(int time,
uint64_t wrote = 0;
for (int c=1; c<=count; c++) {
- if (ceph_clock_now(g_ceph_context) > until) break;
+ if (ceph_clock_now(cct) > until) break;
//if (c == 7) start_debug = true;
@@ -253,7 +252,7 @@ void Ager::age(int time,
// dump freelist?
/*
- if (ceph_clock_now(g_ceph_context) > nextfl) {
+ if (ceph_clock_now(cct) > nextfl) {
elapsed += freelist_inc;
save_freelist(elapsed);
nextfl.sec_ref() += freelist_inc;
diff --git a/src/osd/Ager.h b/src/osd/Ager.h
index 55db507993f..face0a62be4 100644
--- a/src/osd/Ager.h
+++ b/src/osd/Ager.h
@@ -7,12 +7,14 @@
#include "include/Distribution.h"
#include "os/ObjectStore.h"
#include "common/Clock.h"
+#include "common/ceph_context.h"
#include <list>
#include <vector>
using namespace std;
class Ager {
+ CephContext *cct;
ObjectStore *store;
private:
@@ -28,7 +30,7 @@ class Ager {
file_object_t age_get_oid();
public:
- Ager(ObjectStore *s) : store(s), did_distn(false) {}
+ Ager(CephContext *cct_, ObjectStore *s) : cct(cct_), store(s), did_distn(false) {}
void age(int time,
float high_water, // fill to this %
diff --git a/src/osd/ClassHandler.cc b/src/osd/ClassHandler.cc
index a9a920ba078..b1afe1e0626 100644
--- a/src/osd/ClassHandler.cc
+++ b/src/osd/ClassHandler.cc
@@ -39,7 +39,7 @@ int ClassHandler::open_class(const string& cname, ClassData **pcls)
int ClassHandler::open_all_classes()
{
dout(10) << __func__ << dendl;
- DIR *dir = ::opendir(g_conf->osd_class_dir.c_str());
+ DIR *dir = ::opendir(cct->_conf->osd_class_dir.c_str());
if (!dir)
return -errno;
@@ -101,7 +101,7 @@ int ClassHandler::_load_class(ClassData *cls)
cls->status == ClassData::CLASS_MISSING) {
char fname[PATH_MAX];
snprintf(fname, sizeof(fname), "%s/" CLS_PREFIX "%s" CLS_SUFFIX,
- g_conf->osd_class_dir.c_str(),
+ cct->_conf->osd_class_dir.c_str(),
cls->name.c_str());
dout(10) << "_load_class " << cls->name << " from " << fname << dendl;
diff --git a/src/osd/ClassHandler.h b/src/osd/ClassHandler.h
index ae416bc6bfc..93cf3c07fbc 100644
--- a/src/osd/ClassHandler.h
+++ b/src/osd/ClassHandler.h
@@ -7,11 +7,14 @@
#include "common/Cond.h"
#include "common/Mutex.h"
+#include "common/ceph_context.h"
class ClassHandler
{
public:
+ CephContext *cct;
+
struct ClassData;
struct ClassMethod {
@@ -76,7 +79,7 @@ private:
int _load_class(ClassData *cls);
public:
- ClassHandler() : mutex("ClassHandler") {}
+ ClassHandler(CephContext *cct_) : cct(cct_), mutex("ClassHandler") {}
int open_all_classes();
diff --git a/src/osd/ErasureCodeInterface.h b/src/osd/ErasureCodeInterface.h
new file mode 100644
index 00000000000..656ee91987e
--- /dev/null
+++ b/src/osd/ErasureCodeInterface.h
@@ -0,0 +1,240 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef CEPH_ERASURE_CODE_INTERFACE_H
+#define CEPH_ERASURE_CODE_INTERFACE_H
+
+/*! @file ErasureCodeInterface.h
+ @brief Interface provided by erasure code plugins
+
+ The erasure coded pools rely on plugins implementing
+ **ErasureCodeInterface** to encode and decode content. All codes
+ are systematic (i.e. the data is not mangled and can be
+ reconstructed by concatenating chunks ).
+
+ All methods return **0** on success and a negative value on
+ error. If the value returned on error is not explained in
+ **ErasureCodeInterface**, the sources or the documentation of the
+ interface implementer (i.e. the plugin ) must be read to figure
+ out what it means. It is recommended that each error code matches
+ an *errno* value that relates to the cause of the error.
+
+ Assuming the interface implementer provides three data chunks ( K
+ = 3 ) and two coding chunks ( M = 2 ), a buffer could be encoded as
+ follows:
+
+ ~~~~~~~~~~~~~~~~{.c}
+ set<int> want_to_encode(0, 1, 2, // data chunks
+ 3, 4 // coding chunks
+ );
+ bufferlist in = "ABCDEF";
+ map<int, bufferlist> encoded
+ encode(want_to_encode, in, &encoded);
+ encoded[0] == "AB" // data chunk 0
+ encoded[1] == "CD" // data chunk 1
+ encoded[2] == "EF" // data chunk 2
+ encoded[3] // coding chunk 0
+ encoded[4] // coding chunk 1
+ ~~~~~~~~~~~~~~~~
+
+ The **minimum_to_decode_with_cost** method can be used to minimize
+ the cost of fetching the chunks necessary to retrieve a given
+ content. For instance, if encoded[2] (containing **EF**) is missing
+ and accessing encoded[3] (the first coding chunk) is more
+ expensive than accessing encoded[4] (the second coding chunk),
+ **minimum_to_decode_with_cost** is expected to choose the second
+ coding chunk.
+
+ ~~~~~~~~~~~~~~~~{.c}
+ set<int> want_to_read(2); // want the chunk containing "EF"
+ map<int,int> available(
+ 0 => 1, // data chunk 0 : available and costs 1
+ 1 => 1, // data chunk 1 : available and costs 1
+ // data chunk 2 : missing
+ 3 => 9, // coding chunk 0 : available and costs 9
+ 4 => 1, // coding chunk 1 : available and costs 1
+ );
+ set<int> minimum;
+ minimum_to_decode_with_cost(want_to_read,
+ available,
+ &minimum);
+ minimum == set<int>(0, 1, 4); // NOT set<int>(0, 1, 3);
+ ~~~~~~~~~~~~~~~~
+
+ It sets **minimum** with three chunks to reconstruct the desired
+ data chunk and will pick the second coding chunk ( 4 ) because it
+ is less expensive ( 1 < 9 ) to retrieve than the first coding
+ chunk ( 3 ). The caller is responsible for retrieving the chunks
+ and calling **decode** to reconstruct the third data chunk ( 2 ).
+
+ ~~~~~~~~~~~~~~~~{.c}
+ map<int,bufferlist> chunks;
+ for i in minimum.keys():
+ chunks[i] = fetch_chunk(i); // get chunk from storage
+ map<int, bufferlist> decoded;
+ decode(want_to_read, chunks, &decoded);
+ decoded[2] == "EF"
+ ~~~~~~~~~~~~~~~~
+
+ The semantics of the cost value are defined by the caller and must
+ be known to the implementer. For instance, it may be more
+ expensive to retrieve two chunks with cost 1 + 9 = 10 than two
+ chunks with cost 6 + 6 = 12.
+ */
+
+#include <map>
+#include <set>
+#include <tr1/memory>
+#include "include/buffer.h"
+
+using namespace std;
+
+namespace ceph {
+
+ class ErasureCodeInterface {
+ public:
+ virtual ~ErasureCodeInterface() {}
+
+ /**
+ * Compute the smallest subset of **available** chunks that needs
+ * to be retrieved in order to successfully decode
+ * **want_to_read** chunks.
+ *
+ * It is strictly equivalent to calling
+ * **minimum_to_decode_with_cost** where each **available** chunk
+ * has the same cost.
+ *
+ * @see minimum_to_decode_with_cost
+ *
+ * @param [in] want_to_read chunk indexes to be decoded
+ * @param [in] available chunk indexes containing valid data
+ * @param [out] minimum chunk indexes to retrieve
+ * @return **0** on success or a negative errno on error.
+ */
+ virtual int minimum_to_decode(const set<int> &want_to_read,
+ const set<int> &available,
+ set<int> *minimum) = 0;
+
+ /**
+ * Compute the smallest subset of **available** chunks that needs
+ * to be retrieved in order to successfully decode
+ * **want_to_read** chunks. If there is more than one possible
+ * subset, select the subset that minimizes the overall retrieval
+ * cost.
+ *
+ * The **available** parameter maps chunk indexes to their
+ * retrieval cost. The higher the cost value, the more costly it
+ * is to retrieve the chunk content.
+ *
+ * Returns -EIO if there are not enough chunk indexes in
+ * **available** to decode **want_to_read**.
+ *
+ * Returns 0 on success.
+ *
+ * The **minimum** argument must be a pointer to an empty set.
+ *
+ * @param [in] want_to_read chunk indexes to be decoded
+ * @param [in] available map chunk indexes containing valid data
+ * to their retrieval cost
+ * @param [out] minimum chunk indexes to retrieve
+ * @return **0** on success or a negative errno on error.
+ */
+ virtual int minimum_to_decode_with_cost(const set<int> &want_to_read,
+ const map<int, int> &available,
+ set<int> *minimum) = 0;
+
+ /**
+ * Encode the content of **in** and store the result in
+ * **encoded**. All buffers pointed to by **encoded** have the
+ * same size. The **encoded** map contains at least all chunk
+ * indexes found in the **want_to_encode** set.
+ *
+ * The **encoded** map is expected to be a pointer to an empty
+ * map.
+ *
+ * Assuming the **in** parameter is **length** bytes long,
+ * the concatenation of the first **length** bytes of the
+ * **encoded** buffers is equal to the content of the **in**
+ * parameter.
+ *
+ * The **encoded** map may contain more chunks than required by
+ * **want_to_encode** and the caller is expected to permanently
+ * store all of them, not just the chunks listed in
+ * **want_to_encode**.
+ *
+ * The **encoded** map may contain pointers to data stored in
+ * the **in** parameter. If the caller modifies the content of
+ * **in** after calling the encode method, it may have a side
+ * effect on the content of **encoded**.
+ *
+ * The **encoded** map may contain pointers to buffers allocated
+ * by the encode method. They will be freed when **encoded** is
+ * freed. The allocation method is not specified.
+ *
+ * Returns 0 on success.
+ *
+ * @param [in] want_to_encode chunk indexes to be encoded
+ * @param [in] in data to be encoded
+ * @param [out] encoded map chunk indexes to chunk data
+ * @return **0** on success or a negative errno on error.
+ */
+ virtual int encode(const set<int> &want_to_encode,
+ const bufferlist &in,
+ map<int, bufferlist> *encoded) = 0;
+
+ /**
+ * Decode the **chunks** and store at least **want_to_read**
+ * chunks in **decoded**.
+ *
+ * The **decoded** map must be a pointer to an empty map.
+ *
+ * There must be enough **chunks** ( as returned by
+ * **minimum_to_decode** or **minimum_to_decode_with_cost** ) to
+ * perform a successful decoding of all chunks listed in
+ * **want_to_read**.
+ *
+ * All buffers pointed to by **chunks** must have the same size.
+ *
+ * On success, the **decoded** map may contain more chunks than
+ * required by **want_to_read** and they can safely be used by the
+ * caller.
+ *
+ * If a chunk is listed in **want_to_read** and there is a
+ * corresponding **bufferlist** in **chunks**, it will be
+ * referenced in **decoded**. If not it will be reconstructed from
+ * the existing chunks.
+ *
+ * Because **decoded** may contain pointers to data found in
+ * **chunks**, modifying the content of **chunks** after calling
+ * decode may have a side effect on the content of **decoded**.
+ *
+ * Returns 0 on success.
+ *
+ * @param [in] want_to_read chunk indexes to be decoded
+ * @param [in] chunks map chunk indexes to chunk data
+ * @param [out] decoded map chunk indexes to chunk data
+ * @return **0** on success or a negative errno on error.
+ */
+ virtual int decode(const set<int> &want_to_read,
+ const map<int, bufferlist> &chunks,
+ map<int, bufferlist> *decoded) = 0;
+ };
+
+ typedef std::tr1::shared_ptr<ErasureCodeInterface> ErasureCodeInterfaceRef;
+
+}
+
+#endif
diff --git a/src/osd/ErasureCodePlugin.cc b/src/osd/ErasureCodePlugin.cc
new file mode 100644
index 00000000000..d8b9ae0fbbd
--- /dev/null
+++ b/src/osd/ErasureCodePlugin.cc
@@ -0,0 +1,137 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include "common/debug.h"
+
+#include <dlfcn.h>
+
+#include "ErasureCodePlugin.h"
+
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+
+// Write the tag identifying this file in front of a debug line; used
+// through the dout_prefix macro defined above.
+static ostream& _prefix(std::ostream* _dout)
+{
+  std::ostream &out = *_dout;
+  out << "ErasureCodePlugin: ";
+  return out;
+}
+
+// A plugin named NAME is loaded from the file
+// PLUGIN_PREFIX NAME PLUGIN_SUFFIX (for instance libec_NAME.so) found
+// in the plugin directory, see ErasureCodePluginRegistry::load().
+#define PLUGIN_PREFIX "libec_"
+#define PLUGIN_SUFFIX ".so"
+// Name of the symbol looked up with dlsym() in the plugin library and
+// called right after it is loaded.
+#define PLUGIN_INIT_FUNCTION "__erasure_code_init"
+
+ErasureCodePluginRegistry ErasureCodePluginRegistry::singleton;
+
+// Build an empty registry: the lock is named after its owner and no
+// plugin load is in progress.
+ErasureCodePluginRegistry::ErasureCodePluginRegistry()
+  : lock("ErasureCodePluginRegistry::lock"),
+    loading(false)
+{}
+
+// Delete every registered plugin and unload the library it was loaded
+// from, if any.
+//
+// The library handle is read before the plugin is deleted because it
+// is stored in the plugin object itself. A plugin registered with
+// add() without going through load() keeps the library handle set to 0
+// by the ErasureCodePlugin constructor: dlclose() must not be called
+// in that case since 0 is not a handle returned by dlopen().
+ErasureCodePluginRegistry::~ErasureCodePluginRegistry()
+{
+  for (std::map<std::string,ErasureCodePlugin*>::iterator i = plugins.begin();
+       i != plugins.end();
+       ++i) {
+    void *library = i->second->library;
+    delete i->second;
+    if (library)
+      dlclose(library);
+  }
+}
+
+// Register plugin under name.
+//
+// @return 0 on success or -EEXIST if a plugin is already registered
+//         under this name, in which case the registry is unchanged.
+int ErasureCodePluginRegistry::add(const std::string &name,
+                                   ErasureCodePlugin* plugin)
+{
+  const bool already_registered = plugins.count(name) != 0;
+  if (already_registered)
+    return -EEXIST;
+  plugins[name] = plugin;
+  return 0;
+}
+
+// Look up the plugin registered under name.
+//
+// @return the plugin or 0 if there is none.
+ErasureCodePlugin *ErasureCodePluginRegistry::get(const std::string &name)
+{
+  std::map<std::string,ErasureCodePlugin*>::iterator found = plugins.find(name);
+  if (found == plugins.end())
+    return 0;
+  return found->second;
+}
+
+// Create an erasure code object with the plugin named plugin_name,
+// loading the plugin first if it is not registered yet.
+//
+// The registry lock is held for the whole call and the loading flag is
+// raised while load() runs.
+//
+// @return the value returned by load() if it fails, otherwise the
+//         value returned by the factory method of the plugin.
+int ErasureCodePluginRegistry::factory(const std::string &plugin_name,
+                                       const map<std::string,std::string> &parameters,
+                                       ErasureCodeInterfaceRef *erasure_code)
+{
+  Mutex::Locker l(lock);
+  ErasureCodePlugin *plugin = get(plugin_name);
+  if (plugin != 0)
+    return plugin->factory(parameters, erasure_code);
+
+  loading = true;
+  int r = load(plugin_name, parameters, &plugin);
+  loading = false;
+  if (r != 0)
+    return r;
+  return plugin->factory(parameters, erasure_code);
+}
+
+// Load the library implementing plugin_name and return the plugin it
+// registers.
+//
+// The library is expected at
+//   <erasure-code-directory>/libec_<plugin_name>.so
+// where erasure-code-directory is a mandatory entry of parameters
+// (asserted). Its PLUGIN_INIT_FUNCTION entry point is called with the
+// name of the plugin and is expected to register it with add().
+//
+// The library is closed on every error path that follows a successful
+// dlopen() so that a failed load does not leak the handle.
+//
+// @param [in] plugin_name name of the plugin to load
+// @param [in] parameters must contain erasure-code-directory
+// @param [out] plugin the plugin registered by the library
+// @return 0 on success, -EIO if the library cannot be opened, -ENOENT
+//         if it has no entry point, -EBADF if the entry point did not
+//         register plugin_name, or the non zero value returned by the
+//         entry point.
+int ErasureCodePluginRegistry::load(const std::string &plugin_name,
+                                    const map<std::string,std::string> &parameters,
+                                    ErasureCodePlugin **plugin)
+{
+  assert(parameters.count("erasure-code-directory") != 0);
+  std::string fname = parameters.find("erasure-code-directory")->second
+    + "/" PLUGIN_PREFIX
+    + plugin_name + PLUGIN_SUFFIX;
+  dout(10) << "load " << plugin_name << " from " << fname << dendl;
+
+  void *library = dlopen(fname.c_str(), RTLD_NOW);
+  if (!library) {
+    derr << "load dlopen(" << fname
+         << "): " << dlerror() << dendl;
+    return -EIO;
+  }
+
+  int (*erasure_code_init)(const char *) =
+    (int (*)(const char *))dlsym(library, PLUGIN_INIT_FUNCTION);
+  if (!erasure_code_init) {
+    derr << "load dlsym(" << fname
+         << ", " << PLUGIN_INIT_FUNCTION
+         << "): " << dlerror() << dendl;
+    dlclose(library);
+    return -ENOENT;
+  }
+
+  // The entry point is handed a private copy of the name rather than
+  // the buffer owned by the caller.
+  std::string name = plugin_name;
+  int r = erasure_code_init(name.c_str());
+  if (r != 0) {
+    derr << "erasure_code_init(" << plugin_name
+         << "): " << strerror(-r) << dendl;
+    dlclose(library);
+    return r;
+  }
+
+  *plugin = get(plugin_name);
+  if (*plugin == 0) {
+    derr << "load " << PLUGIN_INIT_FUNCTION << "()"
+         << " did not register " << plugin_name << dendl;
+    dlclose(library);
+    return -EBADF;
+  }
+
+  // Remembered so that the registry destructor can unload the library.
+  (*plugin)->library = library;
+
+  return 0;
+}
+
diff --git a/src/osd/ErasureCodePlugin.h b/src/osd/ErasureCodePlugin.h
new file mode 100644
index 00000000000..a2feb71695a
--- /dev/null
+++ b/src/osd/ErasureCodePlugin.h
@@ -0,0 +1,70 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef CEPH_ERASURE_CODE_PLUGIN_H
+#define CEPH_ERASURE_CODE_PLUGIN_H
+
+#include "common/Mutex.h"
+#include "ErasureCodeInterface.h"
+
+// Entry point every plugin library is expected to define. It has C
+// linkage so that ErasureCodePluginRegistry::load() can find it by name
+// with dlsym().
+// NOTE(review): load() calls it through an int (*)(const char *)
+// pointer while the argument is declared char * here -- confirm which
+// of the two is intended.
+extern "C" {
+ int __erasure_code_init(char *plugin_name);
+}
+
+namespace ceph {
+
+ // Base class of the objects stored in ErasureCodePluginRegistry: a
+ // plugin knows how to create erasure code objects.
+ class ErasureCodePlugin {
+ public:
+ // Handle of the library the plugin comes from. It is 0 after
+ // construction and set by ErasureCodePluginRegistry::load().
+ void *library;
+
+ ErasureCodePlugin() :
+ library(0) {}
+ virtual ~ErasureCodePlugin() {}
+
+ // Create an erasure code object configured with parameters and store
+ // it in *erasure_code. The int returned is handed back unchanged by
+ // ErasureCodePluginRegistry::factory(); presumably 0 means success
+ // and a negative errno an error -- TODO confirm.
+ virtual int factory(const map<std::string,std::string> &parameters,
+ ErasureCodeInterfaceRef *erasure_code) = 0;
+ };
+
+ // Registry of the erasure code plugins, indexed by name. A single
+ // instance exists (singleton) and is obtained with instance().
+ class ErasureCodePluginRegistry {
+ public:
+ // Taken by factory() for the duration of the call.
+ Mutex lock;
+ // True while factory() is running load().
+ bool loading;
+ // Registered plugins; the registry destructor deletes them.
+ std::map<std::string,ErasureCodePlugin*> plugins;
+
+ static ErasureCodePluginRegistry singleton;
+
+ ErasureCodePluginRegistry();
+ ~ErasureCodePluginRegistry();
+
+ static ErasureCodePluginRegistry &instance() {
+ return singleton;
+ }
+
+ // Create an erasure code object with the named plugin, loading the
+ // plugin first if it is not registered yet.
+ int factory(const std::string &plugin,
+ const map<std::string,std::string> &parameters,
+ ErasureCodeInterfaceRef *erasure_code);
+
+ // Register plugin under name; returns -EEXIST if the name is taken.
+ // Neither add() nor get() takes the lock.
+ int add(const std::string &name, ErasureCodePlugin *plugin);
+ // Return the plugin registered under name or 0 if there is none.
+ ErasureCodePlugin *get(const std::string &name);
+
+ // dlopen() the library of plugin_name from the directory found in
+ // the erasure-code-directory parameter and return the plugin it
+ // registered in *plugin.
+ int load(const std::string &plugin_name,
+ const map<std::string,std::string> &parameters,
+ ErasureCodePlugin **plugin);
+
+ };
+}
+
+#endif
diff --git a/src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.cc b/src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.cc
new file mode 100644
index 00000000000..f2be1ed06e7
--- /dev/null
+++ b/src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.cc
@@ -0,0 +1,427 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <errno.h>
+#include <algorithm>
+#include "common/debug.h"
+#include "ErasureCodeJerasure.h"
+extern "C" {
+#include "jerasure.h"
+#include "reed_sol.h"
+#include "galois.h"
+#include "cauchy.h"
+#include "liberation.h"
+}
+
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+
+// Write the tag identifying this file in front of a debug line; used
+// through the dout_prefix macro defined above.
+static ostream& _prefix(std::ostream* _dout)
+{
+  std::ostream &out = *_dout;
+  out << "ErasureCodeJerasure: ";
+  return out;
+}
+
+// Configure the object: log the technique, let the technique read its
+// settings from parameters (parse) and then build whatever it needs to
+// encode and decode (prepare).
+void ErasureCodeJerasure::init(const map<std::string,std::string> &parameters)
+{
+  dout(10) << "technique=" << technique << dendl;
+
+  // parse() must run first: prepare() relies on the values it sets.
+  parse(parameters);
+  prepare();
+}
+
+// Compute the chunks that must be read to obtain want_to_read.
+//
+// If every wanted chunk is available, the wanted chunks are the answer.
+// Otherwise the first k available chunks (in index order) are added to
+// *minimum.
+//
+// @return 0 on success or -EIO if a wanted chunk is missing and fewer
+//         than k chunks are available.
+int ErasureCodeJerasure::minimum_to_decode(const set<int> &want_to_read,
+                                           const set<int> &available_chunks,
+                                           set<int> *minimum)
+{
+  const bool all_wanted_available =
+    includes(available_chunks.begin(), available_chunks.end(),
+             want_to_read.begin(), want_to_read.end());
+  if (all_wanted_available) {
+    *minimum = want_to_read;
+    return 0;
+  }
+
+  if (available_chunks.size() < (unsigned)k)
+    return -EIO;
+
+  set<int>::const_iterator chunk = available_chunks.begin();
+  for (unsigned picked = 0; picked < (unsigned)k; ++picked, ++chunk)
+    minimum->insert(*chunk);
+  return 0;
+}
+
+// Same as minimum_to_decode: the costs (values of available) are
+// ignored and only the chunk indexes (keys of available) are used.
+int ErasureCodeJerasure::minimum_to_decode_with_cost(const set<int> &want_to_read,
+                                                     const map<int, int> &available,
+                                                     set<int> *minimum)
+{
+  set<int> indexes;
+  map<int, int>::const_iterator chunk = available.begin();
+  while (chunk != available.end()) {
+    indexes.insert(chunk->first);
+    ++chunk;
+  }
+  return minimum_to_decode(want_to_read, indexes, minimum);
+}
+
+// Split **in** into k data chunks, compute m coding chunks and store
+// the chunks listed in **want_to_encode** in **encoded**.
+//
+// The input is padded with zeros up to a multiple of get_alignment().
+// Always returns 0.
+int ErasureCodeJerasure::encode(const set<int> &want_to_encode,
+ const bufferlist &in,
+ map<int, bufferlist> *encoded)
+{
+ unsigned alignment = get_alignment();
+ unsigned tail = in.length() % alignment;
+ // round the length up to the next multiple of alignment
+ unsigned padded_length = in.length() + ( tail ? ( alignment - tail ) : 0 );
+ dout(10) << "encode adjusted buffer length from " << in.length()
+ << " to " << padded_length << dendl;
+ assert(padded_length % k == 0);
+ unsigned blocksize = padded_length / k;
+ // room for the k data chunks and the m coding chunks
+ unsigned length = blocksize * ( k + m );
+ bufferlist out(in);
+ // pad holds the padding of the data followed by the coding chunks;
+ // only the padding is zeroed, the coding part is presumably
+ // overwritten by jerasure_encode() -- TODO confirm
+ bufferptr pad(length - in.length());
+ pad.zero(0, padded_length - in.length());
+ out.push_back(pad);
+ char *chunks[k + m];
+ for (int i = 0; i < k + m; i++) {
+ bufferlist &chunk = (*encoded)[i];
+ chunk.substr_of(out, i * blocksize, blocksize);
+ chunks[i] = chunk.c_str();
+ }
+ // the first k pointers are the data chunks, the next m the coding chunks
+ jerasure_encode(&chunks[0], &chunks[k], blocksize);
+ // all chunks were needed for encoding: drop those the caller did not ask for
+ for (int i = 0; i < k + m; i++) {
+ if (want_to_encode.count(i) == 0)
+ encoded->erase(i);
+ }
+ return 0;
+}
+
+// Fill **decoded** with the k + m chunks: the chunks found in
+// **chunks** are copied as is (bufferlist assignment) and the missing
+// ones are allocated and reconstructed with jerasure_decode().
+//
+// All chunks are assumed to have the size of the first chunk found in
+// **chunks**. **want_to_read** is not used: every chunk is stored in
+// **decoded**.
+//
+// @return 0 on success, -EIO if **chunks** is empty (there would be no
+//         chunk size to rely on and nothing to decode from) or the
+//         value returned by jerasure_decode().
+int ErasureCodeJerasure::decode(const set<int> &want_to_read,
+                                const map<int, bufferlist> &chunks,
+                                map<int, bufferlist> *decoded)
+{
+  // chunks.begin() must not be dereferenced when the map is empty
+  if (chunks.empty())
+    return -EIO;
+  unsigned blocksize = chunks.begin()->second.length();
+  // list of the missing chunk indexes, terminated by -1
+  int erasures[k + m + 1];
+  int erasures_count = 0;
+  char *data[k];
+  char *coding[m];
+  for (int i = 0; i < k + m; i++) {
+    map<int, bufferlist>::const_iterator chunk = chunks.find(i);
+    if (chunk == chunks.end()) {
+      erasures[erasures_count] = i;
+      erasures_count++;
+      // room for the chunk to be reconstructed
+      bufferptr ptr(blocksize);
+      (*decoded)[i].push_front(ptr);
+    } else {
+      (*decoded)[i] = chunk->second;
+    }
+    if (i < k)
+      data[i] = (*decoded)[i].c_str();
+    else
+      coding[i - k] = (*decoded)[i].c_str();
+  }
+  erasures[erasures_count] = -1;
+
+  if (erasures_count > 0)
+    return jerasure_decode(erasures, data, coding, blocksize);
+  else
+    return 0;
+}
+
+// Return the base 10 integer value of parameters[name].
+//
+// default_value is returned instead if name is not in parameters, if
+// its value is the empty string or if it cannot be converted (the
+// latter is reported with derr).
+int ErasureCodeJerasure::to_int(const std::string &name,
+                                const map<std::string,std::string> &parameters,
+                                int default_value)
+{
+  map<std::string,std::string>::const_iterator entry = parameters.find(name);
+  if (entry == parameters.end() || entry->second.empty()) {
+    dout(10) << name << " defaults to " << default_value << dendl;
+    return default_value;
+  }
+
+  const std::string &value = entry->second;
+  std::string err;
+  int r = strict_strtol(value.c_str(), 10, &err);
+  if (err.empty()) {
+    dout(10) << name << " set to " << r << dendl;
+    return r;
+  }
+
+  derr << "could not convert " << name << "=" << value
+       << " to int because " << err
+       << ", set to default " << default_value << dendl;
+  return default_value;
+}
+
+// Tell whether value is one of the first 55 prime numbers (2 to 257).
+// Any other value, including a prime greater than 257, yields false.
+bool ErasureCodeJerasure::is_prime(int value)
+{
+  static const int primes[] = {
+    2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,
+    73,79,83,89,97,101,103,107,109,113,127,131,137,139,149,
+    151,157,163,167,173,179,
+    181,191,193,197,199,211,223,227,229,233,239,241,251,257
+  };
+  const int *const last = primes + sizeof(primes) / sizeof(primes[0]);
+  for (const int *prime = primes; prime != last; ++prime) {
+    if (*prime == value)
+      return true;
+  }
+  return false;
+}
+
+//
+// ErasureCodeJerasureReedSolomonVandermonde
+//
+// Compute the coding chunks from the data chunks with the matrix built
+// by prepare().
+void ErasureCodeJerasureReedSolomonVandermonde::jerasure_encode(
+  char **data,
+  char **coding,
+  int blocksize)
+{
+  jerasure_matrix_encode(k, m, w, matrix,
+                         data, coding, blocksize);
+}
+
+// Reconstruct the chunks listed in erasures (terminated by -1) with the
+// matrix built by prepare() and return what jerasure_matrix_decode
+// returns.
+int ErasureCodeJerasureReedSolomonVandermonde::jerasure_decode(
+  int *erasures,
+  char **data,
+  char **coding,
+  int blocksize)
+{
+  int r = jerasure_matrix_decode(k, m, w, matrix, 1,
+                                 erasures, data, coding, blocksize);
+  return r;
+}
+
+// The length of the data given to encode() is padded to a multiple of
+// this value.
+unsigned ErasureCodeJerasureReedSolomonVandermonde::get_alignment()
+{
+  unsigned alignment = k*w*sizeof(int);
+  return alignment;
+}
+
+// Read k, m and w from parameters, falling back to the defaults of the
+// class. w is forced to 8 if it is not one of 8, 16 or 32.
+void ErasureCodeJerasureReedSolomonVandermonde::parse(const map<std::string,std::string> &parameters)
+{
+  k = to_int("erasure-code-k", parameters, DEFAULT_K);
+  m = to_int("erasure-code-m", parameters, DEFAULT_M);
+  w = to_int("erasure-code-w", parameters, DEFAULT_W);
+
+  const bool supported_w = (w == 8 || w == 16 || w == 32);
+  if (!supported_w) {
+    derr << "ReedSolomonVandermonde: w=" << w
+         << " must be one of {8, 16, 32} : revert to 8 " << dendl;
+    w = 8;
+  }
+}
+
+// Build the coding matrix used by jerasure_encode / jerasure_decode; it
+// is released by the destructor.
+void ErasureCodeJerasureReedSolomonVandermonde::prepare()
+{
+  int *coding_matrix = reed_sol_vandermonde_coding_matrix(k, m, w);
+  matrix = coding_matrix;
+}
+
+//
+// ErasureCodeJerasureReedSolomonRAID6
+//
+// Compute the coding chunks from the data chunks with the RAID6
+// specific encoder of jerasure (the matrix is not needed to encode).
+void ErasureCodeJerasureReedSolomonRAID6::jerasure_encode(
+  char **data,
+  char **coding,
+  int blocksize)
+{
+  reed_sol_r6_encode(k, w,
+                     data, coding, blocksize);
+}
+
+// Reconstruct the chunks listed in erasures (terminated by -1) with the
+// matrix built by prepare() and return what jerasure_matrix_decode
+// returns.
+int ErasureCodeJerasureReedSolomonRAID6::jerasure_decode(
+  int *erasures,
+  char **data,
+  char **coding,
+  int blocksize)
+{
+  int r = jerasure_matrix_decode(k, m, w, matrix, 1,
+                                 erasures, data, coding, blocksize);
+  return r;
+}
+
+// The length of the data given to encode() is padded to a multiple of
+// this value.
+unsigned ErasureCodeJerasureReedSolomonRAID6::get_alignment()
+{
+  unsigned alignment = k*w*sizeof(int);
+  return alignment;
+}
+
+// Read k and w from parameters, falling back to the defaults of the
+// class. m is always 2 and w is forced to 8 if it is not one of 8, 16
+// or 32.
+void ErasureCodeJerasureReedSolomonRAID6::parse(const map<std::string,std::string> &parameters)
+{
+  k = to_int("erasure-code-k", parameters, DEFAULT_K);
+  m = 2;
+  w = to_int("erasure-code-w", parameters, DEFAULT_W);
+
+  const bool supported_w = (w == 8 || w == 16 || w == 32);
+  if (!supported_w) {
+    derr << "ReedSolomonRAID6: w=" << w
+         << " must be one of {8, 16, 32} : revert to 8 " << dendl;
+    w = 8;
+  }
+}
+
+// Build the coding matrix used by jerasure_decode; it is released by
+// the destructor.
+void ErasureCodeJerasureReedSolomonRAID6::prepare()
+{
+  int *coding_matrix = reed_sol_r6_coding_matrix(k, w);
+  matrix = coding_matrix;
+}
+
+//
+// ErasureCodeJerasureCauchy
+//
+// Compute the coding chunks from the data chunks with the schedule
+// built by prepare_schedule().
+void ErasureCodeJerasureCauchy::jerasure_encode(
+  char **data,
+  char **coding,
+  int blocksize)
+{
+  jerasure_schedule_encode(k, m, w, schedule,
+                           data, coding,
+                           blocksize, packetsize);
+}
+
+// Reconstruct the chunks listed in erasures (terminated by -1) with the
+// bitmatrix built by prepare_schedule() and return what
+// jerasure_schedule_decode_lazy returns.
+int ErasureCodeJerasureCauchy::jerasure_decode(
+  int *erasures,
+  char **data,
+  char **coding,
+  int blocksize)
+{
+  int r = jerasure_schedule_decode_lazy(k, m, w, bitmatrix,
+                                        erasures, data, coding,
+                                        blocksize, packetsize, 1);
+  return r;
+}
+
+// The length of the data given to encode() is padded to a multiple of
+// this value.
+unsigned ErasureCodeJerasureCauchy::get_alignment()
+{
+  unsigned alignment = k*w*packetsize*sizeof(int);
+  return alignment;
+}
+
+// Read k, m, w and packetsize from parameters, falling back to the
+// defaults of the class. The values are not checked any further.
+void ErasureCodeJerasureCauchy::parse(const map<std::string,std::string> &parameters)
+{
+  k = to_int("erasure-code-k", parameters, DEFAULT_K);
+  m = to_int("erasure-code-m", parameters, DEFAULT_M);
+  w = to_int("erasure-code-w", parameters, DEFAULT_W);
+  packetsize = to_int("erasure-code-packetsize",
+                      parameters,
+                      DEFAULT_PACKETSIZE);
+}
+
+// Derive the bitmatrix (used to decode) and the schedule (used to
+// encode) from the coding matrix. The matrix still belongs to the
+// caller.
+void ErasureCodeJerasureCauchy::prepare_schedule(int *matrix)
+{
+  int *bits = jerasure_matrix_to_bitmatrix(k, m, w, matrix);
+  bitmatrix = bits;
+  schedule = jerasure_smart_bitmatrix_to_schedule(k, m, w, bits);
+}
+
+//
+// ErasureCodeJerasureCauchyOrig
+//
+// Build the bitmatrix and the schedule from the original Cauchy coding
+// matrix; the matrix itself is only needed for that and is released.
+void ErasureCodeJerasureCauchyOrig::prepare()
+{
+  int *coding_matrix = cauchy_original_coding_matrix(k, m, w);
+
+  prepare_schedule(coding_matrix);
+
+  free(coding_matrix);
+}
+
+//
+// ErasureCodeJerasureCauchyGood
+//
+// Build the bitmatrix and the schedule from the "good" Cauchy coding
+// matrix; the matrix itself is only needed for that and is released.
+void ErasureCodeJerasureCauchyGood::prepare()
+{
+  int *coding_matrix = cauchy_good_general_coding_matrix(k, m, w);
+
+  prepare_schedule(coding_matrix);
+
+  free(coding_matrix);
+}
+
+//
+// ErasureCodeJerasureLiberation
+//
+// Release what prepare() allocated, if anything.
+ErasureCodeJerasureLiberation::~ErasureCodeJerasureLiberation()
+{
+  if (bitmatrix != 0) {
+    free(bitmatrix);
+  }
+  if (schedule != 0) {
+    jerasure_free_schedule(schedule);
+  }
+}
+
+// Compute the coding chunks from the data chunks with the schedule
+// built by prepare().
+void ErasureCodeJerasureLiberation::jerasure_encode(
+  char **data,
+  char **coding,
+  int blocksize)
+{
+  jerasure_schedule_encode(k, m, w, schedule,
+                           data, coding,
+                           blocksize, packetsize);
+}
+
+// Reconstruct the chunks listed in erasures (terminated by -1) with the
+// bitmatrix built by prepare() and return what
+// jerasure_schedule_decode_lazy returns.
+int ErasureCodeJerasureLiberation::jerasure_decode(
+  int *erasures,
+  char **data,
+  char **coding,
+  int blocksize)
+{
+  int r = jerasure_schedule_decode_lazy(k, m, w, bitmatrix,
+                                        erasures, data, coding,
+                                        blocksize, packetsize, 1);
+  return r;
+}
+
+// The length of the data given to encode() is padded to a multiple of
+// this value.
+unsigned ErasureCodeJerasureLiberation::get_alignment()
+{
+  unsigned alignment = k*w*packetsize*sizeof(int);
+  return alignment;
+}
+
+// Read k, m, w and packetsize from parameters, falling back to the
+// defaults of the class.
+//
+// Every constraint that is not met is reported with derr; if at least
+// one is not met, k, w and packetsize (but not m) are reset to their
+// default values.
+void ErasureCodeJerasureLiberation::parse(const map<std::string,std::string> &parameters)
+{
+  k = to_int("erasure-code-k", parameters, DEFAULT_K);
+  m = to_int("erasure-code-m", parameters, DEFAULT_M);
+  w = to_int("erasure-code-w", parameters, DEFAULT_W);
+  packetsize = to_int("erasure-code-packetsize", parameters, DEFAULT_PACKETSIZE);
+
+  bool invalid = false;
+
+  if (k > w) {
+    derr << "k=" << k << " must be less than or equal to w=" << w << dendl;
+    invalid = true;
+  }
+
+  if (!(w > 2 && is_prime(w))) {
+    derr << "w=" << w << " must be greater than two and be prime" << dendl;
+    invalid = true;
+  }
+
+  if (packetsize == 0) {
+    derr << "packetsize=" << packetsize << " must be set" << dendl;
+    invalid = true;
+  }
+
+  if ((packetsize%(sizeof(int))) != 0) {
+    derr << "packetsize=" << packetsize
+         << " must be a multiple of sizeof(int) = " << sizeof(int) << dendl;
+    invalid = true;
+  }
+
+  if (!invalid)
+    return;
+
+  derr << "reverting to k=" << DEFAULT_K << ", w="
+       << DEFAULT_W << ", packetsize=" << DEFAULT_PACKETSIZE << dendl;
+  k = DEFAULT_K;
+  w = DEFAULT_W;
+  packetsize = DEFAULT_PACKETSIZE;
+}
+
+// Build the liberation bitmatrix (used to decode) and the schedule
+// derived from it (used to encode); both are released by the
+// destructor.
+void ErasureCodeJerasureLiberation::prepare()
+{
+  int *bits = liberation_coding_bitmatrix(k, w);
+  bitmatrix = bits;
+  schedule = jerasure_smart_bitmatrix_to_schedule(k, m, w, bits);
+}
+
+//
+// ErasureCodeJerasureBlaumRoth
+//
+// Build the Blaum-Roth bitmatrix (used to decode) and the schedule
+// derived from it (used to encode).
+void ErasureCodeJerasureBlaumRoth::prepare()
+{
+  int *bits = blaum_roth_coding_bitmatrix(k, w);
+  bitmatrix = bits;
+  schedule = jerasure_smart_bitmatrix_to_schedule(k, m, w, bits);
+}
+
+//
+// ErasureCodeJerasureLiber8tion
+//
+// Read k and packetsize from parameters, falling back to the defaults
+// of the class; m and w are always set to their default values.
+//
+// Every constraint that is not met is reported with derr; if at least
+// one is not met, k and packetsize are reset to their default values.
+void ErasureCodeJerasureLiber8tion::parse(const map<std::string,std::string> &parameters)
+{
+  k = to_int("erasure-code-k", parameters, DEFAULT_K);
+  m = DEFAULT_M;
+  w = DEFAULT_W;
+  packetsize = to_int("erasure-code-packetsize", parameters, DEFAULT_PACKETSIZE);
+
+  bool invalid = false;
+
+  if (k > w) {
+    derr << "k=" << k << " must be less than or equal to w=" << w << dendl;
+    invalid = true;
+  }
+
+  if (packetsize == 0) {
+    derr << "packetsize=" << packetsize << " must be set" << dendl;
+    invalid = true;
+  }
+
+  if (!invalid)
+    return;
+
+  derr << "reverting to k=" << DEFAULT_K << ", packetsize="
+       << DEFAULT_PACKETSIZE << dendl;
+  k = DEFAULT_K;
+  packetsize = DEFAULT_PACKETSIZE;
+}
+
+// Build the liber8tion bitmatrix (used to decode) and the schedule
+// derived from it (used to encode).
+void ErasureCodeJerasureLiber8tion::prepare()
+{
+  int *bits = liber8tion_coding_bitmatrix(k);
+  bitmatrix = bits;
+  schedule = jerasure_smart_bitmatrix_to_schedule(k, m, w, bits);
+}
diff --git a/src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.h b/src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.h
new file mode 100644
index 00000000000..fc76ed7b1e2
--- /dev/null
+++ b/src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.h
@@ -0,0 +1,227 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef CEPH_ERASURE_CODE_JERASURE_H
+#define CEPH_ERASURE_CODE_JERASURE_H
+
+#include "osd/ErasureCodeInterface.h"
+
+// Implementation of ErasureCodeInterface shared by all the jerasure
+// techniques. A technique derives from it and provides parse(),
+// prepare(), get_alignment(), jerasure_encode() and jerasure_decode().
+class ErasureCodeJerasure : public ErasureCodeInterface {
+public:
+ // Set by parse(): encode() produces k data chunks and m coding
+ // chunks; w is handed to the jerasure functions as is.
+ int k;
+ int m;
+ int w;
+ // Name of the technique, only used for logging by init(). The string
+ // is not copied.
+ const char *technique;
+
+ ErasureCodeJerasure(const char *_technique) :
+ technique(_technique)
+ {}
+
+ virtual ~ErasureCodeJerasure() {}
+
+ virtual int minimum_to_decode(const set<int> &want_to_read,
+ const set<int> &available_chunks,
+ set<int> *minimum);
+
+ // The cost of the chunks is ignored.
+ virtual int minimum_to_decode_with_cost(const set<int> &want_to_read,
+ const map<int, int> &available,
+ set<int> *minimum);
+
+ virtual int encode(const set<int> &want_to_encode,
+ const bufferlist &in,
+ map<int, bufferlist> *encoded);
+
+ virtual int decode(const set<int> &want_to_read,
+ const map<int, bufferlist> &chunks,
+ map<int, bufferlist> *decoded);
+
+ // Calls parse() then prepare(); must be called before the object is
+ // used since the constructor leaves k, m and w uninitialized.
+ void init(const map<std::string,std::string> &parameters);
+ // Compute the coding chunks from the data chunks, each being
+ // blocksize bytes long.
+ virtual void jerasure_encode(char **data,
+ char **coding,
+ int blocksize) = 0;
+ // Reconstruct the chunks whose indexes are listed in erasures (the
+ // list is terminated by -1).
+ virtual int jerasure_decode(int *erasures,
+ char **data,
+ char **coding,
+ int blocksize) = 0;
+ // encode() pads its input to a multiple of this value.
+ virtual unsigned get_alignment() = 0;
+ // Set k, m, w and the technique specific settings from parameters.
+ virtual void parse(const map<std::string,std::string> &parameters) = 0;
+ // Build the technique specific data used to encode and decode.
+ virtual void prepare() = 0;
+ // Integer value of parameters[name] or default_value if it is missing,
+ // empty or not an integer.
+ static int to_int(const std::string &name,
+ const map<std::string,std::string> &parameters,
+ int default_value);
+ // True if value is a prime number no greater than 257.
+ static bool is_prime(int value);
+};
+
+// The reed_sol_van technique: encodes and decodes with the matrix
+// returned by reed_sol_vandermonde_coding_matrix().
+class ErasureCodeJerasureReedSolomonVandermonde : public ErasureCodeJerasure {
+public:
+ // Values used by parse() when a parameter is not set.
+ static const int DEFAULT_K = 7;
+ static const int DEFAULT_M = 3;
+ static const int DEFAULT_W = 8;
+ // Coding matrix allocated by prepare() and released by the destructor;
+ // 0 until prepare() is called.
+ int *matrix;
+
+ ErasureCodeJerasureReedSolomonVandermonde() :
+ ErasureCodeJerasure("reed_sol_van"),
+ matrix(0)
+ { }
+ virtual ~ErasureCodeJerasureReedSolomonVandermonde() {
+ if (matrix)
+ free(matrix);
+ }
+
+ virtual void jerasure_encode(char **data,
+ char **coding,
+ int blocksize);
+ virtual int jerasure_decode(int *erasures,
+ char **data,
+ char **coding,
+ int blocksize);
+ virtual unsigned get_alignment();
+ // w is forced to 8 if it is not one of 8, 16 or 32.
+ virtual void parse(const map<std::string,std::string> &parameters);
+ virtual void prepare();
+};
+
+// The reed_sol_r6_op technique: encodes with reed_sol_r6_encode() and
+// decodes with the matrix returned by reed_sol_r6_coding_matrix(). The
+// number of coding chunks is always 2.
+class ErasureCodeJerasureReedSolomonRAID6 : public ErasureCodeJerasure {
+public:
+ // Values used by parse() when a parameter is not set.
+ static const int DEFAULT_K = 7;
+ static const int DEFAULT_W = 8;
+ // Coding matrix allocated by prepare() and released by the destructor;
+ // 0 until prepare() is called.
+ int *matrix;
+
+ ErasureCodeJerasureReedSolomonRAID6() :
+ ErasureCodeJerasure("reed_sol_r6_op"),
+ matrix(0)
+ { }
+ virtual ~ErasureCodeJerasureReedSolomonRAID6() {
+ if (matrix)
+ free(matrix);
+ }
+
+ virtual void jerasure_encode(char **data,
+ char **coding,
+ int blocksize);
+ virtual int jerasure_decode(int *erasures,
+ char **data,
+ char **coding,
+ int blocksize);
+ virtual unsigned get_alignment();
+ // m is set to 2; w is forced to 8 if it is not one of 8, 16 or 32.
+ virtual void parse(const map<std::string,std::string> &parameters);
+ virtual void prepare();
+};
+
+// jerasure.h is only included by the implementation file: the function
+// is declared here because the inline destructor below needs it.
+extern "C" void jerasure_free_schedule(int **schedule);
+
+// Code shared by the cauchy_orig and cauchy_good techniques: they
+// encode with a schedule and decode with a bitmatrix, both derived by
+// prepare_schedule() from the coding matrix the technique provides in
+// prepare().
+class ErasureCodeJerasureCauchy : public ErasureCodeJerasure {
+public:
+  // Values used by parse() when a parameter is not set.
+  static const int DEFAULT_K = 7;
+  static const int DEFAULT_M = 3;
+  static const int DEFAULT_W = 8;
+  static const int DEFAULT_PACKETSIZE = 8;
+  // Allocated by prepare_schedule() and released by the destructor; 0
+  // until prepare_schedule() is called.
+  int *bitmatrix;
+  int **schedule;
+  // Set by parse().
+  int packetsize;
+
+  ErasureCodeJerasureCauchy(const char *technique) :
+    ErasureCodeJerasure(technique),
+    bitmatrix(0),
+    schedule(0)
+  { }
+  virtual ~ErasureCodeJerasureCauchy() {
+    if (bitmatrix)
+      free(bitmatrix);
+    // A schedule is an array of separately allocated operations:
+    // free() would only release the array and leak every operation.
+    if (schedule)
+      jerasure_free_schedule(schedule);
+  }
+
+  virtual void jerasure_encode(char **data,
+                               char **coding,
+                               int blocksize);
+  virtual int jerasure_decode(int *erasures,
+                              char **data,
+                              char **coding,
+                              int blocksize);
+  virtual unsigned get_alignment();
+  virtual void parse(const map<std::string,std::string> &parameters);
+  // Set bitmatrix and schedule from the coding matrix, which remains
+  // the property of the caller.
+  void prepare_schedule(int *matrix);
+};
+
+// The cauchy_orig technique: prepare() relies on
+// cauchy_original_coding_matrix().
+class ErasureCodeJerasureCauchyOrig : public ErasureCodeJerasureCauchy {
+public:
+ ErasureCodeJerasureCauchyOrig() :
+ ErasureCodeJerasureCauchy("cauchy_orig")
+ {}
+
+ virtual void prepare();
+};
+
+// The cauchy_good technique: prepare() relies on
+// cauchy_good_general_coding_matrix().
+class ErasureCodeJerasureCauchyGood : public ErasureCodeJerasureCauchy {
+public:
+ ErasureCodeJerasureCauchyGood() :
+ ErasureCodeJerasureCauchy("cauchy_good")
+ {}
+
+ virtual void prepare();
+};
+
+// The liberation technique, also the base class of the blaum_roth and
+// liber8tion techniques: it encodes with a schedule and decodes with a
+// bitmatrix, both built by prepare().
+class ErasureCodeJerasureLiberation : public ErasureCodeJerasure {
+public:
+ // Values used by parse() when a parameter is not set or when the
+ // parameters are found to be inconsistent.
+ static const int DEFAULT_K = 2;
+ static const int DEFAULT_M = 2;
+ static const int DEFAULT_W = 7;
+ static const int DEFAULT_PACKETSIZE = 8;
+ // Allocated by prepare() and released by the destructor; 0 until
+ // prepare() is called.
+ int *bitmatrix;
+ int **schedule;
+ // Set by parse().
+ int packetsize;
+
+ ErasureCodeJerasureLiberation(const char *technique = "liberation") :
+ ErasureCodeJerasure(technique),
+ bitmatrix(0),
+ schedule(0)
+ { }
+ virtual ~ErasureCodeJerasureLiberation();
+
+ virtual void jerasure_encode(char **data,
+ char **coding,
+ int blocksize);
+ virtual int jerasure_decode(int *erasures,
+ char **data,
+ char **coding,
+ int blocksize);
+ virtual unsigned get_alignment();
+ // Requires k <= w, w prime and greater than two, packetsize set and
+ // a multiple of sizeof(int); reverts to the defaults otherwise.
+ virtual void parse(const map<std::string,std::string> &parameters);
+ virtual void prepare();
+};
+
+// The blaum_roth technique: same as liberation except for prepare(),
+// which relies on blaum_roth_coding_bitmatrix().
+class ErasureCodeJerasureBlaumRoth : public ErasureCodeJerasureLiberation {
+public:
+ ErasureCodeJerasureBlaumRoth() :
+ ErasureCodeJerasureLiberation("blaum_roth")
+ {}
+
+ virtual void prepare();
+};
+
+// The liber8tion technique: m and w are always set to DEFAULT_M and
+// DEFAULT_W by parse(), and prepare() relies on
+// liber8tion_coding_bitmatrix().
+class ErasureCodeJerasureLiber8tion : public ErasureCodeJerasureLiberation {
+public:
+ // These hide the constants of the same name found in the base class;
+ // DEFAULT_PACKETSIZE is inherited.
+ static const int DEFAULT_K = 2;
+ static const int DEFAULT_M = 2;
+ static const int DEFAULT_W = 8;
+
+ ErasureCodeJerasureLiber8tion() :
+ ErasureCodeJerasureLiberation("liber8tion")
+ {}
+
+ virtual void parse(const map<std::string,std::string> &parameters);
+ virtual void prepare();
+};
+
+#endif
diff --git a/src/osd/ErasureCodePluginJerasure/ErasureCodePluginJerasure.cc b/src/osd/ErasureCodePluginJerasure/ErasureCodePluginJerasure.cc
new file mode 100644
index 00000000000..d5cb1cd6c93
--- /dev/null
+++ b/src/osd/ErasureCodePluginJerasure/ErasureCodePluginJerasure.cc
@@ -0,0 +1,70 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include "common/debug.h"
+#include "osd/ErasureCodePlugin.h"
+#include "ErasureCodeJerasure.h"
+
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+
+// Write the tag identifying this file in front of a debug line; used
+// through the dout_prefix macro defined above.
+static ostream& _prefix(std::ostream* _dout)
+{
+  std::ostream &out = *_dout;
+  out << "ErasureCodePluginJerasure: ";
+  return out;
+}
+
+// The jerasure plugin: creates the object implementing the technique
+// named by the erasure-code-technique parameter.
+class ErasureCodePluginJerasure : public ErasureCodePlugin {
+public:
+  // Create the object implementing the requested technique, initialize
+  // it with parameters and store it in *erasure_code.
+  //
+  // @return 0 on success or -ENOENT if erasure-code-technique is not
+  //         set or is not the name of a known technique.
+  virtual int factory(const map<std::string,std::string> &parameters,
+                      ErasureCodeInterfaceRef *erasure_code) {
+    std::string t;
+    map<std::string,std::string>::const_iterator requested =
+      parameters.find("erasure-code-technique");
+    if (requested != parameters.end())
+      t = requested->second;
+
+    ErasureCodeJerasure *codec = 0;
+    if (t == "reed_sol_van")
+      codec = new ErasureCodeJerasureReedSolomonVandermonde();
+    else if (t == "reed_sol_r6_op")
+      codec = new ErasureCodeJerasureReedSolomonRAID6();
+    else if (t == "cauchy_orig")
+      codec = new ErasureCodeJerasureCauchyOrig();
+    else if (t == "cauchy_good")
+      codec = new ErasureCodeJerasureCauchyGood();
+    else if (t == "liberation")
+      codec = new ErasureCodeJerasureLiberation();
+    else if (t == "blaum_roth")
+      codec = new ErasureCodeJerasureBlaumRoth();
+    else if (t == "liber8tion")
+      codec = new ErasureCodeJerasureLiber8tion();
+
+    if (codec == 0) {
+      derr << "technique=" << t << " is not a valid coding technique. "
+           << " Choose one of the following: "
+           << "reed_sol_van, reed_sol_r6_op, cauchy_orig, "
+           << "cauchy_good, liberation, blaum_roth, liber8tion"
+           << dendl;
+      return -ENOENT;
+    }
+
+    codec->init(parameters);
+    *erasure_code = ErasureCodeInterfaceRef(codec);
+    return 0;
+  }
+};
+
+// Entry point of the library: register the jerasure plugin under
+// plugin_name.
+//
+// @return 0 on success or the error returned by
+//         ErasureCodePluginRegistry::add() (-EEXIST if the name is
+//         already taken), in which case the plugin that could not be
+//         registered is deleted instead of being leaked.
+int __erasure_code_init(char *plugin_name)
+{
+  ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
+  ErasureCodePlugin *plugin = new ErasureCodePluginJerasure();
+  int r = instance.add(plugin_name, plugin);
+  if (r != 0)
+    delete plugin;
+  return r;
+}
diff --git a/src/osd/ErasureCodePluginJerasure/Makefile.am b/src/osd/ErasureCodePluginJerasure/Makefile.am
new file mode 100644
index 00000000000..b31fb1c0785
--- /dev/null
+++ b/src/osd/ErasureCodePluginJerasure/Makefile.am
@@ -0,0 +1,22 @@
+# jerasure plugin
+#
+# Built as libec_jerasure.la from the plugin glue (ErasureCode*.cc) and
+# the jerasure C sources that live in the same directory.
+libec_jerasure_la_SOURCES = \
+ osd/ErasureCodePluginJerasure/ErasureCodePluginJerasure.cc \
+ osd/ErasureCodePluginJerasure/ErasureCodeJerasure.cc \
+ osd/ErasureCodePluginJerasure/cauchy.c \
+ osd/ErasureCodePluginJerasure/galois.c \
+ osd/ErasureCodePluginJerasure/jerasure.c \
+ osd/ErasureCodePluginJerasure/liberation.c \
+ osd/ErasureCodePluginJerasure/reed_sol.c
+# The headers are listed so that they are distributed but not installed.
+noinst_HEADERS += \
+ osd/ErasureCodePluginJerasure/ErasureCodeJerasure.h \
+ osd/ErasureCodePluginJerasure/cauchy.h \
+ osd/ErasureCodePluginJerasure/galois.h \
+ osd/ErasureCodePluginJerasure/jerasure.h \
+ osd/ErasureCodePluginJerasure/liberation.h \
+ osd/ErasureCodePluginJerasure/reed_sol.h
+libec_jerasure_la_CFLAGS = ${AM_CFLAGS}
+libec_jerasure_la_CXXFLAGS= ${AM_CXXFLAGS}
+libec_jerasure_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+# Only the symbols matching __erasure_code_ are exported: this includes
+# the __erasure_code_init entry point of the plugin.
+libec_jerasure_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__erasure_code_.*'
+
+erasure_codelib_LTLIBRARIES += libec_jerasure.la
diff --git a/src/osd/ErasureCodePluginJerasure/cauchy.c b/src/osd/ErasureCodePluginJerasure/cauchy.c
new file mode 100755
index 00000000000..c6bb3c01abf
--- /dev/null
+++ b/src/osd/ErasureCodePluginJerasure/cauchy.c
@@ -0,0 +1,408 @@
+/* cauchy.c
+ * James S. Plank
+
+Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure Coding Techniques
+
+Revision 1.2A
+May 24, 2011
+
+James S. Plank
+Department of Electrical Engineering and Computer Science
+University of Tennessee
+Knoxville, TN 37996
+plank@cs.utk.edu
+
+Copyright (c) 2011, James S. Plank
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ - Neither the name of the University of Tennessee nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "galois.h"
+#include "jerasure.h"
+#include "cauchy.h"
+
+/* Per-word-size caches filled lazily by cauchy_n_ones(), indexed by w.
+ * PPs[w]  : galois_single_multiply(1 << (w-1), 2, w); -1 means "not computed yet".
+ * NOs[w]  : number of set bits in PPs[w].
+ * ONEs[w] : the individual set-bit masks of PPs[w] (first NOs[w] entries used).
+ */
+static int PPs[33] = { -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1 };
+static int NOs[33];
+static int ONEs[33][33];
+
+/* Tentative declarations of the per-w coefficient tables read by
+ * cauchy_good_general_coding_matrix() when m == 2. Tables for w = 2..11 are
+ * initialised at the bottom of this file; the remaining ones are plain
+ * pointers that this file never assigns.
+ */
+static int *cbest_0;
+static int *cbest_1;
+static int cbest_2[3];
+static int cbest_3[7];
+static int cbest_4[15];
+static int cbest_5[31];
+static int cbest_6[63];
+static int cbest_7[127];
+static int cbest_8[255];
+static int cbest_9[511];
+static int cbest_10[1023];
+static int cbest_11[1023];
+static int *cbest_12, *cbest_13, *cbest_14, *cbest_15, *cbest_16, *cbest_17, *cbest_18, *cbest_19, *cbest_20,
+ *cbest_21, *cbest_22, *cbest_23, *cbest_24, *cbest_25, *cbest_26, *cbest_27, *cbest_28, *cbest_29, *cbest_30,
+ *cbest_31, *cbest_32;
+
+/* cbest_max_k[w]: largest k for which the cbest table of word size w is used;
+ * -1 disables the table path for that w (k <= -1 never holds for k >= 0).
+ */
+static int cbest_max_k[33] = { -1, -1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 1023, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1 };
+
+/* Set to 1 once cbest_all[] has been populated. */
+static int cbest_init = 0;
+
+/* cbest_all[w] points at the table for word size w (filled on first use). */
+static int *cbest_all[33];
+
+
+/* Typed malloc helper: allocates room for num objects of the given type. */
+#define talloc(type, num) (type *) malloc(sizeof(type)*(num))
+
+/* Count the set bits over w successive doublings of n.
+ *
+ * Returns the sum of the number of 1 bits in n, n*2, n*4, ..., n*2^(w-1),
+ * where each doubling is a left shift that is folded back with PPs[w]
+ * whenever the top bit (1 << (w-1)) was set before the shift.
+ * NOTE(review): presumably this is the number of ones in the w x w bit-matrix
+ * form of n - confirm against the jerasure documentation.
+ *
+ * n: value to analyse (the parameter is only modified locally).
+ * w: word size in bits; used unchecked as an index into PPs/NOs/ONEs, so the
+ *    caller must keep it within those 33-entry tables.
+ */
+int cauchy_n_ones(int n, int w)
+{
+ int no;
+ int cno;
+ int nones;
+ int i, j;
+ int highbit;
+
+ highbit = (1 << (w-1));
+
+ /* First call for this w: cache highbit*2 and the list of its set bits. */
+ if (PPs[w] == -1) {
+ nones = 0;
+ PPs[w] = galois_single_multiply(highbit, 2, w);
+ for (i = 0; i < w; i++) {
+ if (PPs[w] & (1 << i)) {
+ ONEs[w][nones] = (1 << i);
+ nones++;
+ }
+ }
+ NOs[w] = nones;
+ }
+
+ /* no starts as the bit count of n itself; cno tracks the bit count of the
+    current (doubled) value and is updated incrementally below. */
+ no = 0;
+ for (i = 0; i < w; i++) if (n & (1 << i)) no++;
+ cno = no;
+ for (i = 1; i < w; i++) {
+ if (n & highbit) {
+ /* Dropping the top bit removes one set bit; the shift keeps the count;
+    each bit of PPs[w] then either sets (+1) or clears (-1) a bit of n. */
+ n ^= highbit;
+ n <<= 1;
+ n ^= PPs[w];
+ cno--;
+ for (j = 0; j < NOs[w]; j++) {
+ cno += (n & ONEs[w][j]) ? 1 : -1;
+ }
+ } else {
+ n <<= 1;
+ }
+ no += cno;
+ }
+ return no;
+}
+
+/* Allocate and fill an m-row by k-column matrix (row-major, k*m ints) whose
+ * entry (row, col) is galois_single_divide(1, row ^ (m + col), w).
+ *
+ * Returns NULL when w < 31 and k+m exceeds 2^w, or when the allocation
+ * fails; otherwise returns a malloc'ed array that the caller owns.
+ */
+int *cauchy_original_coding_matrix(int k, int m, int w)
+{
+  int *result;
+  int row, col;
+
+  if (w < 31 && (k+m) > (1 << w)) return NULL;
+
+  result = talloc(int, k*m);
+  if (result == NULL) return NULL;
+
+  for (row = 0; row < m; row++) {
+    for (col = 0; col < k; col++) {
+      result[row*k + col] = galois_single_divide(1, (row ^ (m+col)), w);
+    }
+  }
+  return result;
+}
+
+/* Allocate and fill an m-row by k-column matrix (row-major, k*m ints) whose
+ * entry (row, col) is galois_single_divide(1, X[row] ^ Y[col], w).
+ *
+ * X must provide at least m elements and Y at least k elements.
+ * Returns NULL if the allocation fails; otherwise the caller owns the
+ * malloc'ed array.
+ */
+int *cauchy_xy_coding_matrix(int k, int m, int w, int *X, int *Y)
+{
+  int row, col;
+  int *result;
+  int *cell;
+
+  result = talloc(int, k*m);
+  if (result == NULL) { return NULL; }
+
+  cell = result;
+  for (row = 0; row < m; row++) {
+    for (col = 0; col < k; col++) {
+      *cell = galois_single_divide(1, (X[row] ^ Y[col]), w);
+      cell++;
+    }
+  }
+  return result;
+}
+
+/* Rescale an m-row by k-column coding matrix (row-major) in place so that it
+ * contains fewer ones as measured by cauchy_n_ones().
+ *
+ * Step 1: every column whose first-row entry is not 1 is multiplied by the
+ *         inverse of that entry, which turns row 0 into all ones.
+ * Step 2: for every further row, each entry other than 1 is tried as a
+ *         divisor for the whole row; the divisor giving the smallest total
+ *         cauchy_n_ones() is applied, provided it beats the row as it stands.
+ *
+ * k, m   : matrix dimensions (columns, rows).
+ * w      : word size handed to the galois_* helpers and cauchy_n_ones().
+ * matrix : k*m ints, modified in place.
+ */
+void cauchy_improve_coding_matrix(int k, int m, int w, int *matrix)
+{
+ int index, i, j, x;
+ int tmp;
+ int bno, tno, bno_index;
+
+ /* Step 1: normalise each column so the first row becomes all ones. */
+ for (j = 0; j < k; j++) {
+ if (matrix[j] != 1) {
+ tmp = galois_single_divide(1, matrix[j], w);
+ index = j;
+ for (i = 0; i < m; i++) {
+ matrix[index] = galois_single_multiply(matrix[index], tmp, w);
+ index += k;
+ }
+ }
+ }
+ /* Step 2: for each remaining row look for the best divisor. */
+ for (i = 1; i < m; i++) {
+ /* bno: ones count of the row as it is now (the score to beat). */
+ bno = 0;
+ index = i*k;
+ for (j = 0; j < k; j++) bno += cauchy_n_ones(matrix[index+j], w);
+ bno_index = -1;
+ for (j = 0; j < k; j++) {
+ if (matrix[index+j] != 1) {
+ /* tno: ones count the row would have after dividing by entry j. */
+ tmp = galois_single_divide(1, matrix[index+j], w);
+ tno = 0;
+ for (x = 0; x < k; x++) {
+ tno += cauchy_n_ones(galois_single_multiply(matrix[index+x], tmp, w), w);
+ }
+ if (tno < bno) {
+ bno = tno;
+ bno_index = j;
+ }
+ }
+ }
+ /* Apply the winning divisor, if any candidate improved on the row. */
+ if (bno_index != -1) {
+ tmp = galois_single_divide(1, matrix[index+bno_index], w);
+ for (j = 0; j < k; j++) {
+ matrix[index+j] = galois_single_multiply(matrix[index+j], tmp, w);
+ }
+ }
+ }
+}
+
+/* Build an m-row by k-column coding matrix (row-major, k*m ints) for word
+ * size w.
+ *
+ * When m == 2 and k <= cbest_max_k[w], row 0 is all ones and row 1 is the
+ * first k entries of the precomputed cbest table for w. Otherwise the matrix
+ * comes from cauchy_original_coding_matrix() and is then passed through
+ * cauchy_improve_coding_matrix().
+ *
+ * Returns a malloc'ed array owned by the caller, or NULL when w lies outside
+ * the per-w tables, when the allocation fails, or when
+ * cauchy_original_coding_matrix() rejects the parameters.
+ */
+int *cauchy_good_general_coding_matrix(int k, int m, int w)
+{
+  int *matrix, i;
+
+  /* w indexes cbest_max_k[] and cbest_all[]; reject values outside those
+     tables instead of reading past their ends. */
+  if (w < 0 || w >= (int) (sizeof(cbest_max_k) / sizeof(cbest_max_k[0]))) return NULL;
+
+  if (m == 2 && k <= cbest_max_k[w]) {
+    matrix = talloc(int, k*m);
+    if (matrix == NULL) return NULL;
+    if (!cbest_init) {
+      /* One-time setup of the w -> table lookup. */
+      cbest_init = 1;
+      cbest_all[0] = cbest_0;   cbest_all[1] = cbest_1;   cbest_all[2] = cbest_2;   cbest_all[3] = cbest_3;
+      cbest_all[4] = cbest_4;   cbest_all[5] = cbest_5;   cbest_all[6] = cbest_6;   cbest_all[7] = cbest_7;
+      cbest_all[8] = cbest_8;   cbest_all[9] = cbest_9;   cbest_all[10] = cbest_10; cbest_all[11] = cbest_11;
+      cbest_all[12] = cbest_12; cbest_all[13] = cbest_13; cbest_all[14] = cbest_14; cbest_all[15] = cbest_15;
+      cbest_all[16] = cbest_16; cbest_all[17] = cbest_17; cbest_all[18] = cbest_18; cbest_all[19] = cbest_19;
+      cbest_all[20] = cbest_20; cbest_all[21] = cbest_21; cbest_all[22] = cbest_22; cbest_all[23] = cbest_23;
+      cbest_all[24] = cbest_24; cbest_all[25] = cbest_25; cbest_all[26] = cbest_26; cbest_all[27] = cbest_27;
+      cbest_all[28] = cbest_28; cbest_all[29] = cbest_29; cbest_all[30] = cbest_30; cbest_all[31] = cbest_31;
+      cbest_all[32] = (int *) cbest_32;
+    }
+    /* Row 0 is all ones; row 1 comes straight from the table for w. */
+    for (i = 0; i < k; i++) {
+      matrix[i] = 1;
+      matrix[i+k] = cbest_all[w][i];
+    }
+    return matrix;
+  } else {
+    matrix = cauchy_original_coding_matrix(k, m, w);
+    if (matrix == NULL) return NULL;
+    cauchy_improve_coding_matrix(k, m, w, matrix);
+    return matrix;
+  }
+}
+
+/* Precomputed tables for small word sizes: cbest_w[i] is copied into column i
+ * of the second matrix row by cauchy_good_general_coding_matrix() when
+ * m == 2 and k <= cbest_max_k[w].
+ */
+static int cbest_2[3] = { 1, 2, 3 };
+static int cbest_3[7] = { 1, 2, 5, 4, 7, 3, 6 };
+
+static int cbest_4[15] = { 1, 2, 9, 4, 8, 13, 3, 6, 12, 5, 11, 15, 10, 14, 7 };
+
+static int cbest_5[31] = { 1, 2, 18, 4, 9, 8, 22, 16, 3, 11, 19, 5, 10, 6, 20, 27, 13, 23, 26, 12,
+ 17, 25, 24, 31, 30, 7, 15, 21, 29, 14, 28 };
+
+static int cbest_6[63] = { 1, 2, 33, 4, 8, 49, 16, 32, 57, 3, 6, 12, 24, 48, 5, 35, 9, 37, 10, 17,
+ 41, 51, 56, 61, 18, 28, 53, 14, 20, 34, 7, 13, 25, 36, 59, 26, 39, 40, 45, 50, 60, 52, 63,
+ 11, 30, 55, 19, 22, 29, 43, 58, 15, 21, 38, 44, 47, 62, 27, 54, 42, 31, 23, 46 };
+
+static int cbest_7[127] = { 1, 2, 68, 4, 34, 8, 17, 16, 76, 32, 38, 3, 64, 69, 5, 19, 35, 70, 6, 9,
+ 18, 102, 10, 36, 85, 12, 21, 42, 51, 72, 77, 84, 20, 25, 33, 50, 78, 98, 24, 39, 49, 100, 110
+ , 48, 65, 93, 40, 66, 71, 92, 7, 46, 55, 87, 96, 103, 106, 11, 23, 37, 54, 81, 86, 108, 13,
+ 22, 27, 43, 53, 73, 80, 14, 26, 52, 74, 79, 99, 119, 44, 95, 101, 104, 111, 118, 29, 59, 89,
+ 94, 117, 28, 41, 58, 67, 88, 115, 116, 47, 57, 83, 97, 107, 114, 127, 56, 82, 109, 113, 126,
+ 112, 125, 15, 63, 75, 123, 124, 31, 45, 62, 91, 105, 122, 30, 61, 90, 121, 60, 120 };
+
+static int cbest_8[255] = { 1, 2, 142, 4, 71, 8, 70, 173, 3, 35, 143, 16, 17, 67, 134, 140, 172, 6, 34
+ , 69, 201, 216, 5, 33, 86, 12, 65, 138, 158, 159, 175, 10, 32, 43, 66, 108, 130, 193, 234, 9,
+ 24, 25, 50, 68, 79, 100, 132, 174, 200, 217, 20, 21, 42, 48, 87, 169, 41, 54, 64, 84, 96, 117
+ , 154, 155, 165, 226, 77, 82, 135, 136, 141, 168, 192, 218, 238, 7, 18, 19, 39, 40, 78, 113,
+ 116, 128, 164, 180, 195, 205, 220, 232, 14, 26, 27, 58, 109, 156, 157, 203, 235, 13, 28, 29, 38
+ , 51, 56, 75, 85, 90, 101, 110, 112, 139, 171, 11, 37, 49, 52, 76, 83, 102, 119, 131, 150, 151
+ , 167, 182, 184, 188, 197, 219, 224, 45, 55, 80, 94, 97, 133, 170, 194, 204, 221, 227, 236, 36,
+ 47, 73, 92, 98, 104, 118, 152, 153, 166, 202, 207, 239, 251, 22, 23, 44, 74, 91, 148, 149, 161
+ , 181, 190, 233, 46, 59, 88, 137, 146, 147, 163, 196, 208, 212, 222, 250, 57, 81, 95, 106, 111,
+ 129, 160, 176, 199, 243, 249, 15, 53, 72, 93, 103, 115, 125, 162, 183, 185, 189, 206, 225, 255,
+ 186, 210, 230, 237, 242, 248, 30, 31, 62, 89, 99, 105, 114, 121, 124, 178, 209, 213, 223, 228,
+ 241, 254, 60, 191, 198, 247, 120, 240, 107, 127, 144, 145, 177, 211, 214, 246, 245, 123, 126,
+ 187, 231, 253, 63, 179, 229, 244, 61, 122, 215, 252 };
+
+static int cbest_9[511] = { 1, 2, 264, 4, 132, 8, 66, 16, 33, 32, 280, 64, 140, 128, 3, 70, 265, 5,
+ 133, 256, 266, 6, 9, 35, 67, 134, 268, 396, 10, 17, 34, 330, 12, 18, 68, 198, 297, 20, 37, 74
+ , 136, 148, 165, 281, 296, 24, 36, 41, 65, 82, 99, 164, 272, 282, 388, 40, 49, 98, 141, 194,
+ 284, 328, 412, 48, 97, 129, 142, 196, 346, 71, 72, 96, 130, 313, 392, 80, 206, 257, 267, 312,
+ 334, 7, 135, 156, 173, 192, 258, 269, 397, 404, 11, 78, 144, 161, 172, 260, 270, 299, 331, 344,
+ 398, 13, 19, 39, 69, 86, 103, 160, 167, 199, 202, 298, 322, 384, 14, 21, 38, 43, 75, 102, 137,
+ 149, 166, 204, 289, 332, 408, 462, 22, 25, 42, 51, 83, 101, 138, 150, 273, 283, 288, 301, 350,
+ 389, 429, 26, 50, 76, 100, 195, 274, 285, 300, 329, 363, 390, 413, 428, 28, 45, 84, 143, 197,
+ 200, 214, 231, 276, 286, 315, 320, 347, 362, 414, 458, 44, 53, 73, 90, 107, 131, 152, 169, 181,
+ 230, 314, 338, 361, 393, 400, 454, 460, 52, 57, 81, 106, 115, 168, 175, 180, 207, 229, 305, 335
+ , 348, 360, 394, 421, 478, 56, 105, 114, 157, 163, 174, 193, 210, 227, 228, 259, 304, 317, 326,
+ 405, 420, 445, 79, 104, 113, 145, 158, 162, 212, 226, 261, 271, 316, 345, 379, 399, 406, 444,
+ 450, 456, 87, 88, 112, 146, 203, 225, 262, 291, 323, 336, 378, 385, 425, 452, 474, 15, 205, 222
+ , 224, 239, 290, 303, 333, 367, 377, 386, 409, 424, 431, 463, 470, 476, 23, 139, 151, 189, 208,
+ 238, 302, 324, 351, 366, 376, 410, 430, 437, 27, 47, 77, 94, 111, 177, 188, 237, 275, 293, 342,
+ 365, 391, 436, 448, 29, 46, 55, 85, 110, 119, 171, 176, 183, 201, 215, 218, 235, 236, 277, 287,
+ 292, 321, 355, 364, 415, 417, 459, 466, 472, 30, 54, 59, 91, 109, 118, 153, 170, 182, 220, 234,
+ 278, 307, 339, 354, 401, 416, 423, 441, 455, 461, 468, 495, 58, 108, 117, 154, 233, 306, 319,
+ 349, 353, 383, 395, 402, 422, 440, 447, 479, 494, 92, 116, 211, 232, 318, 327, 340, 352, 382,
+ 446, 493, 61, 159, 213, 216, 247, 309, 381, 407, 427, 451, 457, 464, 491, 492, 60, 89, 123, 147
+ , 185, 246, 263, 308, 337, 371, 380, 426, 433, 453, 475, 487, 490, 122, 184, 191, 223, 245, 370,
+ 387, 432, 439, 471, 477, 486, 489, 511, 121, 179, 190, 209, 243, 244, 295, 325, 359, 369, 411,
+ 438, 485, 488, 510, 95, 120, 178, 242, 294, 343, 358, 368, 419, 449, 483, 484, 509, 219, 241,
+ 357, 418, 443, 467, 473, 482, 507, 508, 31, 221, 240, 255, 279, 356, 442, 469, 481, 503, 506,
+ 155, 254, 403, 480, 502, 505, 63, 93, 127, 253, 311, 341, 375, 501, 504, 62, 126, 187, 217, 251
+ , 252, 310, 374, 435, 465, 499, 500, 125, 186, 250, 373, 434, 498, 124, 249, 372, 497, 248, 496
+ };
+
+static int cbest_10[1023] = { 1, 2, 516, 4, 258, 8, 129, 16, 32, 580, 64, 128, 290, 145, 256, 3, 512,
+ 517, 5, 259, 518, 588, 6, 9, 18, 36, 72, 144, 774, 10, 17, 131, 262, 288, 524, 645, 12, 33,
+ 133, 266, 294, 387, 532, 576, 581, 20, 34, 65, 137, 274, 548, 582, 24, 66, 291, 838, 40, 68,
+ 130, 147, 161, 322, 644, 709, 806, 48, 132, 193, 257, 386, 596, 80, 136, 298, 419, 612, 661, 772
+ , 96, 149, 260, 272, 306, 403, 513, 146, 153, 160, 264, 292, 385, 514, 519, 544, 584, 589, 708,
+ 870, 7, 19, 37, 73, 192, 354, 590, 770, 775, 11, 38, 74, 177, 263, 289, 418, 520, 525, 534, 641
+ , 660, 725, 802, 836, 846, 13, 22, 76, 148, 209, 267, 295, 320, 330, 402, 526, 528, 533, 577,
+ 647, 717, 804, 14, 21, 26, 35, 44, 135, 152, 165, 201, 275, 304, 384, 401, 435, 549, 578, 583,
+ 604, 608, 782, 903, 25, 52, 67, 88, 139, 270, 296, 391, 417, 550, 620, 653, 790, 834, 839, 41,
+ 50, 69, 104, 141, 176, 278, 302, 323, 395, 423, 540, 598, 640, 705, 724, 807, 866, 28, 42, 49,
+ 70, 82, 100, 163, 208, 282, 310, 556, 592, 597, 646, 663, 677, 711, 716, 868, 878, 81, 134, 151
+ , 164, 195, 200, 299, 326, 352, 362, 400, 434, 564, 613, 657, 768, 773, 902, 967, 97, 138, 155,
+ 169, 197, 261, 273, 307, 358, 390, 416, 433, 451, 614, 652, 733, 800, 814, 844, 854, 935, 56, 84
+ , 98, 140, 181, 217, 265, 293, 328, 338, 394, 422, 515, 545, 585, 704, 788, 822, 871, 919, 162,
+ 179, 276, 355, 407, 427, 546, 586, 591, 616, 662, 669, 676, 710, 727, 741, 771, 780, 901, 39, 75
+ , 150, 157, 194, 211, 225, 268, 280, 308, 314, 389, 411, 439, 521, 530, 535, 628, 656, 721, 803,
+ 832, 837, 842, 847, 966, 23, 77, 112, 154, 168, 196, 300, 321, 331, 393, 421, 432, 450, 522, 527
+ , 529, 552, 606, 643, 673, 693, 713, 732, 805, 864, 874, 934, 999, 15, 27, 45, 54, 78, 90, 108,
+ 180, 216, 305, 483, 560, 579, 600, 605, 609, 719, 778, 783, 852, 876, 886, 899, 918, 983, 46, 53
+ , 89, 167, 178, 185, 203, 213, 271, 297, 324, 334, 336, 360, 370, 406, 426, 467, 542, 551, 610,
+ 621, 649, 668, 726, 740, 786, 791, 810, 820, 835, 900, 917, 931, 951, 965, 975, 30, 51, 105, 156
+ , 205, 210, 224, 279, 303, 356, 366, 388, 405, 410, 438, 449, 459, 536, 541, 594, 599, 622, 655,
+ 720, 812, 818, 862, 867, 933, 29, 43, 71, 83, 92, 101, 106, 143, 173, 283, 311, 312, 346, 392,
+ 409, 420, 437, 443, 557, 566, 593, 642, 659, 672, 692, 707, 712, 737, 757, 869, 879, 911, 998,
+ 60, 102, 241, 327, 353, 363, 399, 425, 482, 558, 565, 624, 679, 718, 735, 749, 769, 798, 898,
+ 963, 982, 58, 86, 166, 183, 184, 202, 212, 219, 233, 286, 359, 431, 466, 615, 636, 648, 689, 729
+ , 801, 815, 840, 845, 850, 855, 884, 916, 930, 950, 964, 974, 981, 995, 1015, 57, 85, 99, 120,
+ 171, 199, 204, 229, 318, 329, 339, 368, 404, 448, 458, 465, 499, 654, 671, 685, 784, 789, 823,
+ 872, 882, 915, 932, 949, 997, 1007, 116, 142, 159, 172, 277, 408, 436, 442, 455, 481, 491, 547,
+ 572, 587, 617, 630, 658, 665, 706, 723, 736, 756, 776, 781, 816, 860, 894, 897, 910, 947, 991,
+ 114, 221, 240, 269, 281, 309, 315, 332, 342, 344, 378, 398, 424, 441, 475, 487, 531, 618, 629,
+ 678, 695, 734, 743, 748, 808, 833, 843, 929, 943, 962, 973, 113, 182, 189, 218, 227, 232, 301,
+ 364, 374, 430, 457, 523, 553, 562, 602, 607, 688, 728, 753, 796, 830, 865, 875, 927, 980, 994,
+ 1014, 55, 79, 91, 109, 170, 187, 198, 215, 228, 284, 415, 464, 498, 554, 561, 601, 670, 675, 684
+ , 715, 745, 765, 779, 848, 853, 877, 887, 909, 914, 948, 979, 996, 1006, 1013, 47, 110, 158, 249
+ , 316, 325, 335, 337, 361, 371, 397, 447, 454, 480, 490, 497, 538, 543, 611, 632, 664, 722, 787,
+ 811, 821, 880, 896, 913, 946, 961, 971, 990, 1011, 31, 94, 220, 245, 357, 367, 429, 440, 474,
+ 486, 537, 595, 623, 651, 681, 694, 701, 742, 759, 813, 819, 858, 863, 892, 928, 942, 945, 972,
+ 989, 993, 1003, 1023, 62, 93, 107, 188, 207, 226, 237, 243, 313, 340, 347, 376, 456, 471, 473,
+ 507, 567, 568, 626, 752, 890, 907, 926, 1005, 61, 103, 124, 175, 186, 214, 372, 414, 453, 463,
+ 489, 503, 559, 625, 638, 674, 691, 714, 731, 739, 744, 764, 794, 799, 828, 908, 925, 939, 959,
+ 978, 1012, 59, 87, 122, 248, 287, 350, 396, 413, 446, 485, 495, 496, 637, 751, 826, 841, 851,
+ 885, 912, 941, 960, 970, 977, 1010, 118, 121, 235, 244, 319, 369, 382, 428, 445, 574, 650, 667,
+ 680, 700, 758, 761, 785, 873, 883, 944, 988, 992, 1002, 1009, 1022, 117, 206, 223, 231, 236, 242
+ , 470, 472, 506, 573, 631, 687, 777, 817, 856, 861, 895, 906, 987, 1004, 1021, 115, 174, 191, 333
+ , 343, 345, 379, 452, 462, 469, 488, 502, 505, 619, 690, 697, 730, 738, 755, 809, 888, 924, 938,
+ 958, 969, 1019, 253, 365, 375, 412, 484, 494, 501, 563, 603, 750, 767, 792, 797, 831, 923, 940,
+ 957, 976, 1001, 234, 251, 285, 348, 444, 479, 555, 634, 666, 760, 824, 849, 905, 955, 1008, 111,
+ 222, 230, 247, 317, 380, 461, 511, 539, 633, 686, 703, 747, 881, 937, 986, 1020, 95, 190, 468,
+ 493, 504, 570, 696, 754, 859, 893, 968, 985, 1018, 63, 126, 252, 341, 377, 500, 569, 627, 683,
+ 766, 891, 922, 956, 1000, 1017, 125, 239, 250, 373, 478, 639, 795, 829, 904, 921, 954, 123, 246,
+ 351, 460, 477, 510, 702, 746, 763, 827, 936, 953, 119, 383, 492, 509, 575, 984, 682, 699, 857,
+ 1016, 238, 255, 889, 920, 476, 762, 793, 952, 349, 508, 635, 825, 381, 698, 254, 571, 127 };
+
+static int cbest_11[1023] = { 1,
+ 2, 1026, 4, 513, 8, 16, 1282, 32, 64, 641, 128, 256, 512, 1346, 1024, 3, 673, 1027, 5, 10, 20, 40, 80, 160, 320,
+ 640, 6, 9, 515, 1030, 1280, 1539, 17, 517, 1034, 1283, 12, 18, 33, 521, 1042, 1362, 34, 65, 529, 1058, 1286, 1795,
+ 24, 36, 66, 129, 545, 643, 1090, 1290, 1667, 68, 130, 257, 577, 645, 672, 1154, 1298, 1344, 48, 72, 132, 258, 336,
+ 649, 681, 1314, 1347, 136, 168, 260, 514, 657, 769, 1538, 1923, 84, 96, 144, 264, 516, 1025, 1350, 1410, 1859, 42,
+ 272, 520, 705, 1032, 1354, 11, 21, 41, 81, 161, 192, 288, 321, 528, 675, 1028, 1537, 1699, 1794, 7, 22, 82, 162,
+ 322, 544, 642, 677, 897, 1031, 1046, 1066, 1106, 1186, 1281, 1366, 1378, 1666, 14, 44, 164, 324, 384, 523, 533,
+ 553, 576, 593, 644, 833, 1035, 1040, 1288, 1360, 1987, 13, 19, 28, 88, 328, 519, 648, 680, 689, 1043, 1056, 1284,
+ 1363, 1474, 1543, 1793, 1955, 26, 35, 56, 176, 656, 768, 1038, 1059, 1088, 1287, 1302, 1322, 1442, 1547, 1665,
+ 1922, 25, 37, 52, 67, 112, 340, 352, 525, 531, 737, 1091, 1152, 1291, 1296, 1555, 1858, 1875, 38, 69, 74, 104, 131,
+ 224, 547, 651, 661, 683, 704, 721, 961, 1050, 1062, 1155, 1299, 1312, 1345, 1370, 1571, 1799, 49, 70, 73, 133, 138,
+ 148, 170, 208, 259, 337, 448, 537, 549, 579, 647, 674, 929, 1094, 1294, 1315, 1352, 1536, 1603, 1671, 1698, 1803,
+ 1921, 50, 134, 137, 169, 261, 266, 276, 296, 338, 416, 581, 676, 896, 1074, 1098, 1158, 1348, 1394, 1408, 1675,
+ 1707, 1811, 1857, 2019, 76, 85, 97, 145, 262, 265, 522, 532, 552, 561, 585, 592, 653, 659, 685, 771, 832, 849,
+ 1064, 1162, 1194, 1306, 1318, 1351, 1386, 1411, 1506, 1683, 1827, 1986, 2003, 43, 86, 98, 140, 146, 172, 273, 344,
+ 518, 688, 773, 1033, 1110, 1122, 1170, 1355, 1490, 1542, 1697, 1792, 1927, 1954, 100, 193, 268, 274, 289, 597, 609,
+ 665, 697, 707, 777, 1029, 1044, 1104, 1184, 1330, 1364, 1376, 1414, 1546, 1664, 1731, 1863, 1931, 1963, 23, 46, 83,
+ 92, 152, 163, 184, 194, 290, 323, 368, 524, 530, 555, 693, 709, 736, 753, 785, 993, 1036, 1047, 1067, 1107, 1187,
+ 1218, 1320, 1358, 1367, 1379, 1418, 1450, 1545, 1554, 1867, 1874, 1939, 1985, 15, 30, 45, 60, 90, 120, 165, 180,
+ 196, 240, 280, 292, 325, 330, 360, 385, 480, 546, 650, 660, 679, 682, 713, 720, 745, 801, 899, 960, 977, 1041,
+ 1289, 1361, 1426, 1472, 1541, 1570, 1703, 1798, 1953, 29, 58, 89, 116, 166, 200, 232, 326, 329, 386, 464, 535, 536,
+ 548, 578, 595, 646, 835, 901, 928, 1048, 1057, 1070, 1190, 1285, 1300, 1368, 1382, 1440, 1475, 1559, 1579, 1602,
+ 1619, 1670, 1802, 1879, 1891, 1920, 27, 57, 177, 304, 388, 527, 557, 580, 691, 725, 837, 905, 937, 1039, 1054,
+ 1089, 1114, 1292, 1303, 1323, 1374, 1443, 1553, 1674, 1706, 1715, 1801, 1810, 1856, 1873, 1991, 2018, 2035, 53,
+ 106, 113, 178, 212, 332, 341, 353, 392, 424, 541, 560, 584, 601, 652, 658, 684, 770, 841, 848, 913, 1060, 1082,
+ 1096, 1153, 1202, 1297, 1402, 1478, 1522, 1569, 1673, 1682, 1705, 1797, 1826, 1959, 1995, 2002, 2027, 39, 54, 75,
+ 105, 114, 225, 342, 354, 400, 539, 569, 739, 772, 1051, 1063, 1078, 1092, 1138, 1160, 1192, 1304, 1313, 1326, 1371,
+ 1384, 1398, 1446, 1482, 1514, 1551, 1601, 1669, 1696, 1763, 1815, 1835, 1926, 71, 139, 149, 171, 209, 226, 298,
+ 356, 449, 565, 596, 608, 625, 663, 664, 696, 706, 723, 741, 776, 853, 865, 963, 1072, 1095, 1130, 1156, 1250, 1295,
+ 1310, 1353, 1392, 1687, 1730, 1747, 1809, 1862, 1930, 1962, 1971, 2007, 2017, 51, 78, 108, 135, 150, 210, 228, 267,
+ 277, 297, 339, 348, 417, 450, 551, 554, 587, 617, 655, 687, 692, 708, 752, 784, 931, 965, 992, 1009, 1075, 1099,
+ 1159, 1174, 1234, 1316, 1338, 1349, 1395, 1409, 1458, 1494, 1504, 1544, 1563, 1575, 1681, 1825, 1866, 1883, 1929,
+ 1938, 1961, 1984, 2001, 77, 142, 174, 263, 278, 346, 376, 418, 452, 496, 583, 669, 678, 701, 712, 729, 744, 761,
+ 800, 898, 933, 969, 976, 1001, 1065, 1108, 1120, 1163, 1168, 1195, 1307, 1319, 1334, 1356, 1387, 1416, 1448, 1488,
+ 1507, 1540, 1607, 1702, 1807, 1865, 1925, 1952, 87, 99, 141, 147, 156, 173, 188, 216, 248, 270, 300, 345, 372, 420,
+ 456, 488, 534, 563, 594, 667, 699, 757, 779, 789, 809, 834, 851, 900, 1102, 1111, 1123, 1171, 1328, 1412, 1491,
+ 1558, 1578, 1587, 1611, 1618, 1679, 1711, 1729, 1861, 1878, 1890, 1907, 1943, 2023, 94, 101, 124, 154, 186, 244,
+ 269, 275, 284, 526, 556, 589, 690, 724, 775, 836, 904, 936, 945, 981, 1045, 1068, 1105, 1166, 1185, 1198, 1216,
+ 1331, 1365, 1377, 1390, 1415, 1430, 1510, 1552, 1577, 1714, 1800, 1819, 1831, 1872, 1899, 1937, 1990, 2034, 47, 62,
+ 93, 102, 122, 153, 185, 195, 282, 291, 312, 362, 369, 432, 468, 540, 599, 600, 611, 715, 747, 840, 857, 912, 1037,
+ 1052, 1112, 1126, 1219, 1321, 1359, 1372, 1419, 1424, 1451, 1568, 1623, 1635, 1672, 1691, 1701, 1704, 1723, 1796,
+ 1958, 1994, 2011, 2026, 2043, 31, 61, 91, 121, 181, 197, 202, 234, 241, 281, 293, 308, 331, 361, 370, 481, 538,
+ 568, 613, 695, 711, 738, 755, 781, 787, 995, 1080, 1118, 1178, 1188, 1210, 1380, 1400, 1427, 1473, 1498, 1530,
+ 1550, 1557, 1600, 1617, 1668, 1719, 1735, 1762, 1779, 1814, 1834, 1843, 1877, 1889, 1935, 1967, 1993, 2025, 2039,
+ 59, 117, 167, 182, 198, 201, 233, 242, 294, 327, 387, 465, 482, 559, 564, 605, 624, 662, 722, 740, 803, 852, 864,
+ 881, 907, 917, 939, 962, 979, 997, 1049, 1071, 1086, 1146, 1191, 1206, 1222, 1266, 1301, 1324, 1369, 1383, 1406,
+ 1422, 1441, 1454, 1480, 1512, 1526, 1549, 1686, 1713, 1739, 1746, 1771, 1808, 1833, 1871, 1970, 1989, 2006, 2016,
+ 2033, 118, 305, 334, 364, 389, 394, 404, 426, 466, 484, 543, 550, 573, 586, 603, 616, 633, 654, 686, 717, 749, 793,
+ 805, 843, 873, 903, 930, 964, 1008, 1055, 1115, 1128, 1142, 1200, 1226, 1258, 1293, 1308, 1375, 1476, 1520, 1562,
+ 1574, 1680, 1824 };
+
diff --git a/src/osd/ErasureCodePluginJerasure/cauchy.h b/src/osd/ErasureCodePluginJerasure/cauchy.h
new file mode 100755
index 00000000000..67fbf3bdbde
--- /dev/null
+++ b/src/osd/ErasureCodePluginJerasure/cauchy.h
@@ -0,0 +1,53 @@
+/* cauchy.h
+ * James S. Plank
+
+Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure Coding Techniques
+
+Revision 1.2A
+May 24, 2011
+
+James S. Plank
+Department of Electrical Engineering and Computer Science
+University of Tennessee
+Knoxville, TN 37996
+plank@cs.utk.edu
+
+Copyright (c) 2011, James S. Plank
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ - Neither the name of the University of Tennessee nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Returns a malloc'ed m-row by k-column matrix with entries 1/(i ^ (m+j)),
+   or NULL on invalid parameters / allocation failure. */
+extern int *cauchy_original_coding_matrix(int k, int m, int w);
+/* Same layout, with entries 1/(x[i] ^ y[j]); NULL on allocation failure. */
+extern int *cauchy_xy_coding_matrix(int k, int m, int w, int *x, int *y);
+/* Rescales the given k*m matrix in place to reduce its cauchy_n_ones() total. */
+extern void cauchy_improve_coding_matrix(int k, int m, int w, int *matrix);
+/* Table-driven matrix for m == 2 where available, otherwise the original
+   matrix followed by cauchy_improve_coding_matrix(); NULL on failure. */
+extern int *cauchy_good_general_coding_matrix(int k, int m, int w);
+/* Sum of the set-bit counts of n over w successive doublings. */
+extern int cauchy_n_ones(int n, int w);
diff --git a/src/osd/ErasureCodePluginJerasure/cauchy_best_r6.c b/src/osd/ErasureCodePluginJerasure/cauchy_best_r6.c
new file mode 100755
index 00000000000..8ab7eab67fe
--- /dev/null
+++ b/src/osd/ErasureCodePluginJerasure/cauchy_best_r6.c
@@ -0,0 +1,1985 @@
+/* cauchy_best_r6.c
+ * James S. Plank
+
+Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure Coding Techniques
+
+Revision 1.2A
+May 24, 2011
+
+James S. Plank
+Department of Electrical Engineering and Computer Science
+University of Tennessee
+Knoxville, TN 37996
+plank@cs.utk.edu
+
+Copyright (c) 2011, James S. Plank
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ - Neither the name of the University of Tennessee nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "galois.h"
+#include "jerasure.h"
+#include "cauchy.h"
+
+/* Per-word-size caches filled lazily by cauchy_n_ones(), indexed by w.
+ * PPs[w]  : galois_single_multiply(1 << (w-1), 2, w); -1 means "not computed yet".
+ * NOs[w]  : number of set bits in PPs[w].
+ * ONEs[w] : the individual set-bit masks of PPs[w] (first NOs[w] entries used).
+ */
+static int PPs[33] = { -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1 };
+static int NOs[33];
+static int ONEs[33][33];
+
+/* Tentative declarations of the per-w coefficient tables read by
+ * cauchy_good_general_coding_matrix() when m == 2. Unlike w = 0 and 1, every
+ * word size from 2 to 32 is declared as a real array in this file.
+ */
+static int *cbest_0;
+static int *cbest_1;
+static int cbest_2[3];
+static int cbest_3[7];
+static int cbest_4[15];
+static int cbest_5[31];
+static int cbest_6[63];
+static int cbest_7[127];
+static int cbest_8[255];
+static int cbest_9[511];
+static int cbest_10[1023];
+static int cbest_11[1023];
+static int cbest_12[1023], cbest_13[1023], cbest_14[1023], cbest_15[1023], cbest_16[1023], cbest_17[1023], cbest_18[1023],
+ cbest_19[1023], cbest_20[1023], cbest_21[1023], cbest_22[1023], cbest_23[1023], cbest_24[1023], cbest_25[1023],
+ cbest_26[1023], cbest_27[1023], cbest_28[1023], cbest_29[1023], cbest_30[1023], cbest_31[1023];
+/* Declared unsigned; it is cast to int * when stored in cbest_all[32]. */
+static unsigned int cbest_32[1023];
+
+/* cbest_max_k[w]: largest k for which the cbest table of word size w is used;
+ * -1 disables the table path for that w (k <= -1 never holds for k >= 0).
+ */
+static int cbest_max_k[33] = { -1, -1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 1023,
+ 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023,
+ 1023, 1023, 1023, 1023 };
+
+/* Set to 1 once cbest_all[] has been populated. */
+static int cbest_init = 0;
+
+/* cbest_all[w] points at the table for word size w (filled on first use). */
+static int *cbest_all[33];
+
+
+/* Typed malloc helper: allocates room for num objects of the given type. */
+#define talloc(type, num) (type *) malloc(sizeof(type)*(num))
+
+/* Count the set bits over w successive doublings of n.
+ *
+ * Returns the sum of the number of 1 bits in n, n*2, n*4, ..., n*2^(w-1),
+ * where each doubling is a left shift that is folded back with PPs[w]
+ * whenever the top bit (1 << (w-1)) was set before the shift.
+ * NOTE(review): presumably this is the number of ones in the w x w bit-matrix
+ * form of n - confirm against the jerasure documentation.
+ *
+ * n: value to analyse (the parameter is only modified locally).
+ * w: word size in bits; used unchecked as an index into PPs/NOs/ONEs, so the
+ *    caller must keep it within those 33-entry tables.
+ */
+int cauchy_n_ones(int n, int w)
+{
+ int no;
+ int cno;
+ int nones;
+ int i, j;
+ int highbit;
+
+ highbit = (1 << (w-1));
+
+ /* First call for this w: cache highbit*2 and the list of its set bits. */
+ if (PPs[w] == -1) {
+ nones = 0;
+ PPs[w] = galois_single_multiply(highbit, 2, w);
+ for (i = 0; i < w; i++) {
+ if (PPs[w] & (1 << i)) {
+ ONEs[w][nones] = (1 << i);
+ nones++;
+ }
+ }
+ NOs[w] = nones;
+ }
+
+ /* no starts as the bit count of n itself; cno tracks the bit count of the
+    current (doubled) value and is updated incrementally below. */
+ no = 0;
+ for (i = 0; i < w; i++) if (n & (1 << i)) no++;
+ cno = no;
+ for (i = 1; i < w; i++) {
+ if (n & highbit) {
+ /* Dropping the top bit removes one set bit; the shift keeps the count;
+    each bit of PPs[w] then either sets (+1) or clears (-1) a bit of n. */
+ n ^= highbit;
+ n <<= 1;
+ n ^= PPs[w];
+ cno--;
+ for (j = 0; j < NOs[w]; j++) {
+ cno += (n & ONEs[w][j]) ? 1 : -1;
+ }
+ } else {
+ n <<= 1;
+ }
+ no += cno;
+ }
+ return no;
+}
+
+/* Allocate and fill an m-row by k-column matrix (row-major, k*m ints) whose
+ * entry (row, col) is galois_single_divide(1, row ^ (m + col), w).
+ *
+ * Returns NULL when w < 31 and k+m exceeds 2^w, or when the allocation
+ * fails; otherwise returns a malloc'ed array that the caller owns.
+ */
+int *cauchy_original_coding_matrix(int k, int m, int w)
+{
+  int *result;
+  int row, col;
+
+  if (w < 31 && (k+m) > (1 << w)) return NULL;
+
+  result = talloc(int, k*m);
+  if (result == NULL) return NULL;
+
+  for (row = 0; row < m; row++) {
+    for (col = 0; col < k; col++) {
+      result[row*k + col] = galois_single_divide(1, (row ^ (m+col)), w);
+    }
+  }
+  return result;
+}
+
+/* Allocate and fill an m-row by k-column matrix (row-major, k*m ints) whose
+ * entry (row, col) is galois_single_divide(1, X[row] ^ Y[col], w).
+ *
+ * X must provide at least m elements and Y at least k elements.
+ * Returns NULL if the allocation fails; otherwise the caller owns the
+ * malloc'ed array.
+ */
+int *cauchy_xy_coding_matrix(int k, int m, int w, int *X, int *Y)
+{
+  int row, col;
+  int *result;
+  int *cell;
+
+  result = talloc(int, k*m);
+  if (result == NULL) { return NULL; }
+
+  cell = result;
+  for (row = 0; row < m; row++) {
+    for (col = 0; col < k; col++) {
+      *cell = galois_single_divide(1, (X[row] ^ Y[col]), w);
+      cell++;
+    }
+  }
+  return result;
+}
+
+/* Rescale an m-row by k-column coding matrix (row-major) in place so that it
+ * contains fewer ones as measured by cauchy_n_ones().
+ *
+ * Step 1: every column whose first-row entry is not 1 is multiplied by the
+ *         inverse of that entry, which turns row 0 into all ones.
+ * Step 2: for every further row, each entry other than 1 is tried as a
+ *         divisor for the whole row; the divisor giving the smallest total
+ *         cauchy_n_ones() is applied, provided it beats the row as it stands.
+ *
+ * k, m   : matrix dimensions (columns, rows).
+ * w      : word size handed to the galois_* helpers and cauchy_n_ones().
+ * matrix : k*m ints, modified in place.
+ */
+void cauchy_improve_coding_matrix(int k, int m, int w, int *matrix)
+{
+ int index, i, j, x;
+ int tmp;
+ int bno, tno, bno_index;
+
+ /* Step 1: normalise each column so the first row becomes all ones. */
+ for (j = 0; j < k; j++) {
+ if (matrix[j] != 1) {
+ tmp = galois_single_divide(1, matrix[j], w);
+ index = j;
+ for (i = 0; i < m; i++) {
+ matrix[index] = galois_single_multiply(matrix[index], tmp, w);
+ index += k;
+ }
+ }
+ }
+ /* Step 2: for each remaining row look for the best divisor. */
+ for (i = 1; i < m; i++) {
+ /* bno: ones count of the row as it is now (the score to beat). */
+ bno = 0;
+ index = i*k;
+ for (j = 0; j < k; j++) bno += cauchy_n_ones(matrix[index+j], w);
+ bno_index = -1;
+ for (j = 0; j < k; j++) {
+ if (matrix[index+j] != 1) {
+ /* tno: ones count the row would have after dividing by entry j. */
+ tmp = galois_single_divide(1, matrix[index+j], w);
+ tno = 0;
+ for (x = 0; x < k; x++) {
+ tno += cauchy_n_ones(galois_single_multiply(matrix[index+x], tmp, w), w);
+ }
+ if (tno < bno) {
+ bno = tno;
+ bno_index = j;
+ }
+ }
+ }
+ /* Apply the winning divisor, if any candidate improved on the row. */
+ if (bno_index != -1) {
+ tmp = galois_single_divide(1, matrix[index+bno_index], w);
+ for (j = 0; j < k; j++) {
+ matrix[index+j] = galois_single_multiply(matrix[index+j], tmp, w);
+ }
+ }
+ }
+}
+
+/* Build an m-row by k-column coding matrix (row-major, k*m ints) for word
+ * size w.
+ *
+ * When m == 2 and k <= cbest_max_k[w], row 0 is all ones and row 1 is the
+ * first k entries of the precomputed cbest table for w. Otherwise the matrix
+ * comes from cauchy_original_coding_matrix() and is then passed through
+ * cauchy_improve_coding_matrix().
+ *
+ * Returns a malloc'ed array owned by the caller, or NULL when w lies outside
+ * the per-w tables, when the allocation fails, or when
+ * cauchy_original_coding_matrix() rejects the parameters.
+ */
+int *cauchy_good_general_coding_matrix(int k, int m, int w)
+{
+  int *matrix, i;
+
+  /* w indexes cbest_max_k[] and cbest_all[]; reject values outside those
+     tables instead of reading past their ends. */
+  if (w < 0 || w >= (int) (sizeof(cbest_max_k) / sizeof(cbest_max_k[0]))) return NULL;
+
+  if (m == 2 && k <= cbest_max_k[w]) {
+    matrix = talloc(int, k*m);
+    if (matrix == NULL) return NULL;
+    if (!cbest_init) {
+      /* One-time setup of the w -> table lookup. */
+      cbest_init = 1;
+      cbest_all[0] = cbest_0;   cbest_all[1] = cbest_1;   cbest_all[2] = cbest_2;   cbest_all[3] = cbest_3;
+      cbest_all[4] = cbest_4;   cbest_all[5] = cbest_5;   cbest_all[6] = cbest_6;   cbest_all[7] = cbest_7;
+      cbest_all[8] = cbest_8;   cbest_all[9] = cbest_9;   cbest_all[10] = cbest_10; cbest_all[11] = cbest_11;
+      cbest_all[12] = cbest_12; cbest_all[13] = cbest_13; cbest_all[14] = cbest_14; cbest_all[15] = cbest_15;
+      cbest_all[16] = cbest_16; cbest_all[17] = cbest_17; cbest_all[18] = cbest_18; cbest_all[19] = cbest_19;
+      cbest_all[20] = cbest_20; cbest_all[21] = cbest_21; cbest_all[22] = cbest_22; cbest_all[23] = cbest_23;
+      cbest_all[24] = cbest_24; cbest_all[25] = cbest_25; cbest_all[26] = cbest_26; cbest_all[27] = cbest_27;
+      cbest_all[28] = cbest_28; cbest_all[29] = cbest_29; cbest_all[30] = cbest_30; cbest_all[31] = cbest_31;
+      /* cbest_32 is declared unsigned int[], hence the cast. */
+      cbest_all[32] = (int *) cbest_32;
+    }
+    /* Row 0 is all ones; row 1 comes straight from the table for w. */
+    for (i = 0; i < k; i++) {
+      matrix[i] = 1;
+      matrix[i+k] = cbest_all[w][i];
+    }
+    return matrix;
+  } else {
+    matrix = cauchy_original_coding_matrix(k, m, w);
+    if (matrix == NULL) return NULL;
+    cauchy_improve_coding_matrix(k, m, w, matrix);
+    return matrix;
+  }
+}
+
+ static int cbest_2[3] = { 1, 2, 3 }; /* w = 2 table (3 entries): entry i is copied to matrix[k+i] (second row) by cauchy_good_general_coding_matrix() when m == 2 */
+ static int cbest_3[7] = { 1, 2, 5, 4, 7, 3, 6 }; /* w = 3 table (7 entries): entry i is copied to matrix[k+i] (second row) by cauchy_good_general_coding_matrix() when m == 2 */
+
+ static int cbest_4[15] = { 1, 2, 9, 4, 8, 13, 3, 6, 12, 5, 11, 15, 10, 14, 7 }; /* w = 4 table (15 entries): second-row source for cauchy_good_general_coding_matrix() when m == 2 */
+
+ static int cbest_5[31] = { 1, 2, 18, 4, 9, 8, 22, 16, 3, 11, 19, 5, 10, 6, 20, 27, 13, 23, 26, 12,
+ 17, 25, 24, 31, 30, 7, 15, 21, 29, 14, 28 }; /* w = 5 table (31 entries): second-row source for cauchy_good_general_coding_matrix() when m == 2 */
+
+ static int cbest_6[63] = { 1, 2, 33, 4, 8, 49, 16, 32, 57, 3, 6, 12, 24, 48, 5, 35, 9, 37, 10, 17,
+ 41, 51, 56, 61, 18, 28, 53, 14, 20, 34, 7, 13, 25, 36, 59, 26, 39, 40, 45, 50, 60, 52, 63,
+ 11, 30, 55, 19, 22, 29, 43, 58, 15, 21, 38, 44, 47, 62, 27, 54, 42, 31, 23, 46 }; /* w = 6 table (63 entries): second-row source for cauchy_good_general_coding_matrix() when m == 2 */
+
+static int cbest_7[127] = { 1, 2, 68, 4, 34, 8, 17, 16, 76, 32, 38, 3, 64, 69, 5, 19, 35, 70, 6, 9,
+ 18, 102, 10, 36, 85, 12, 21, 42, 51, 72, 77, 84, 20, 25, 33, 50, 78, 98, 24, 39, 49, 100, 110
+ , 48, 65, 93, 40, 66, 71, 92, 7, 46, 55, 87, 96, 103, 106, 11, 23, 37, 54, 81, 86, 108, 13,
+ 22, 27, 43, 53, 73, 80, 14, 26, 52, 74, 79, 99, 119, 44, 95, 101, 104, 111, 118, 29, 59, 89,
+ 94, 117, 28, 41, 58, 67, 88, 115, 116, 47, 57, 83, 97, 107, 114, 127, 56, 82, 109, 113, 126,
+ 112, 125, 15, 63, 75, 123, 124, 31, 45, 62, 91, 105, 122, 30, 61, 90, 121, 60, 120 };
+
+static int cbest_8[255] = { 1, 2, 142, 4, 71, 8, 70, 173, 3, 35, 143, 16, 17, 67, 134, 140, 172, 6, 34
+ , 69, 201, 216, 5, 33, 86, 12, 65, 138, 158, 159, 175, 10, 32, 43, 66, 108, 130, 193, 234, 9,
+ 24, 25, 50, 68, 79, 100, 132, 174, 200, 217, 20, 21, 42, 48, 87, 169, 41, 54, 64, 84, 96, 117
+ , 154, 155, 165, 226, 77, 82, 135, 136, 141, 168, 192, 218, 238, 7, 18, 19, 39, 40, 78, 113,
+ 116, 128, 164, 180, 195, 205, 220, 232, 14, 26, 27, 58, 109, 156, 157, 203, 235, 13, 28, 29, 38
+ , 51, 56, 75, 85, 90, 101, 110, 112, 139, 171, 11, 37, 49, 52, 76, 83, 102, 119, 131, 150, 151
+ , 167, 182, 184, 188, 197, 219, 224, 45, 55, 80, 94, 97, 133, 170, 194, 204, 221, 227, 236, 36,
+ 47, 73, 92, 98, 104, 118, 152, 153, 166, 202, 207, 239, 251, 22, 23, 44, 74, 91, 148, 149, 161
+ , 181, 190, 233, 46, 59, 88, 137, 146, 147, 163, 196, 208, 212, 222, 250, 57, 81, 95, 106, 111,
+ 129, 160, 176, 199, 243, 249, 15, 53, 72, 93, 103, 115, 125, 162, 183, 185, 189, 206, 225, 255,
+ 186, 210, 230, 237, 242, 248, 30, 31, 62, 89, 99, 105, 114, 121, 124, 178, 209, 213, 223, 228,
+ 241, 254, 60, 191, 198, 247, 120, 240, 107, 127, 144, 145, 177, 211, 214, 246, 245, 123, 126,
+ 187, 231, 253, 63, 179, 229, 244, 61, 122, 215, 252 };
+
+static int cbest_9[511] = { 1, 2, 264, 4, 132, 8, 66, 16, 33, 32, 280, 64, 140, 128, 3, 70, 265, 5,
+ 133, 256, 266, 6, 9, 35, 67, 134, 268, 396, 10, 17, 34, 330, 12, 18, 68, 198, 297, 20, 37, 74
+ , 136, 148, 165, 281, 296, 24, 36, 41, 65, 82, 99, 164, 272, 282, 388, 40, 49, 98, 141, 194,
+ 284, 328, 412, 48, 97, 129, 142, 196, 346, 71, 72, 96, 130, 313, 392, 80, 206, 257, 267, 312,
+ 334, 7, 135, 156, 173, 192, 258, 269, 397, 404, 11, 78, 144, 161, 172, 260, 270, 299, 331, 344,
+ 398, 13, 19, 39, 69, 86, 103, 160, 167, 199, 202, 298, 322, 384, 14, 21, 38, 43, 75, 102, 137,
+ 149, 166, 204, 289, 332, 408, 462, 22, 25, 42, 51, 83, 101, 138, 150, 273, 283, 288, 301, 350,
+ 389, 429, 26, 50, 76, 100, 195, 274, 285, 300, 329, 363, 390, 413, 428, 28, 45, 84, 143, 197,
+ 200, 214, 231, 276, 286, 315, 320, 347, 362, 414, 458, 44, 53, 73, 90, 107, 131, 152, 169, 181,
+ 230, 314, 338, 361, 393, 400, 454, 460, 52, 57, 81, 106, 115, 168, 175, 180, 207, 229, 305, 335
+ , 348, 360, 394, 421, 478, 56, 105, 114, 157, 163, 174, 193, 210, 227, 228, 259, 304, 317, 326,
+ 405, 420, 445, 79, 104, 113, 145, 158, 162, 212, 226, 261, 271, 316, 345, 379, 399, 406, 444,
+ 450, 456, 87, 88, 112, 146, 203, 225, 262, 291, 323, 336, 378, 385, 425, 452, 474, 15, 205, 222
+ , 224, 239, 290, 303, 333, 367, 377, 386, 409, 424, 431, 463, 470, 476, 23, 139, 151, 189, 208,
+ 238, 302, 324, 351, 366, 376, 410, 430, 437, 27, 47, 77, 94, 111, 177, 188, 237, 275, 293, 342,
+ 365, 391, 436, 448, 29, 46, 55, 85, 110, 119, 171, 176, 183, 201, 215, 218, 235, 236, 277, 287,
+ 292, 321, 355, 364, 415, 417, 459, 466, 472, 30, 54, 59, 91, 109, 118, 153, 170, 182, 220, 234,
+ 278, 307, 339, 354, 401, 416, 423, 441, 455, 461, 468, 495, 58, 108, 117, 154, 233, 306, 319,
+ 349, 353, 383, 395, 402, 422, 440, 447, 479, 494, 92, 116, 211, 232, 318, 327, 340, 352, 382,
+ 446, 493, 61, 159, 213, 216, 247, 309, 381, 407, 427, 451, 457, 464, 491, 492, 60, 89, 123, 147
+ , 185, 246, 263, 308, 337, 371, 380, 426, 433, 453, 475, 487, 490, 122, 184, 191, 223, 245, 370,
+ 387, 432, 439, 471, 477, 486, 489, 511, 121, 179, 190, 209, 243, 244, 295, 325, 359, 369, 411,
+ 438, 485, 488, 510, 95, 120, 178, 242, 294, 343, 358, 368, 419, 449, 483, 484, 509, 219, 241,
+ 357, 418, 443, 467, 473, 482, 507, 508, 31, 221, 240, 255, 279, 356, 442, 469, 481, 503, 506,
+ 155, 254, 403, 480, 502, 505, 63, 93, 127, 253, 311, 341, 375, 501, 504, 62, 126, 187, 217, 251
+ , 252, 310, 374, 435, 465, 499, 500, 125, 186, 250, 373, 434, 498, 124, 249, 372, 497, 248, 496
+ };
+
+static int cbest_10[1023] = { 1, 2, 516, 4, 258, 8, 129, 16, 32, 580, 64, 128, 290, 145, 256, 3, 512,
+ 517, 5, 259, 518, 588, 6, 9, 18, 36, 72, 144, 774, 10, 17, 131, 262, 288, 524, 645, 12, 33,
+ 133, 266, 294, 387, 532, 576, 581, 20, 34, 65, 137, 274, 548, 582, 24, 66, 291, 838, 40, 68,
+ 130, 147, 161, 322, 644, 709, 806, 48, 132, 193, 257, 386, 596, 80, 136, 298, 419, 612, 661, 772
+ , 96, 149, 260, 272, 306, 403, 513, 146, 153, 160, 264, 292, 385, 514, 519, 544, 584, 589, 708,
+ 870, 7, 19, 37, 73, 192, 354, 590, 770, 775, 11, 38, 74, 177, 263, 289, 418, 520, 525, 534, 641
+ , 660, 725, 802, 836, 846, 13, 22, 76, 148, 209, 267, 295, 320, 330, 402, 526, 528, 533, 577,
+ 647, 717, 804, 14, 21, 26, 35, 44, 135, 152, 165, 201, 275, 304, 384, 401, 435, 549, 578, 583,
+ 604, 608, 782, 903, 25, 52, 67, 88, 139, 270, 296, 391, 417, 550, 620, 653, 790, 834, 839, 41,
+ 50, 69, 104, 141, 176, 278, 302, 323, 395, 423, 540, 598, 640, 705, 724, 807, 866, 28, 42, 49,
+ 70, 82, 100, 163, 208, 282, 310, 556, 592, 597, 646, 663, 677, 711, 716, 868, 878, 81, 134, 151
+ , 164, 195, 200, 299, 326, 352, 362, 400, 434, 564, 613, 657, 768, 773, 902, 967, 97, 138, 155,
+ 169, 197, 261, 273, 307, 358, 390, 416, 433, 451, 614, 652, 733, 800, 814, 844, 854, 935, 56, 84
+ , 98, 140, 181, 217, 265, 293, 328, 338, 394, 422, 515, 545, 585, 704, 788, 822, 871, 919, 162,
+ 179, 276, 355, 407, 427, 546, 586, 591, 616, 662, 669, 676, 710, 727, 741, 771, 780, 901, 39, 75
+ , 150, 157, 194, 211, 225, 268, 280, 308, 314, 389, 411, 439, 521, 530, 535, 628, 656, 721, 803,
+ 832, 837, 842, 847, 966, 23, 77, 112, 154, 168, 196, 300, 321, 331, 393, 421, 432, 450, 522, 527
+ , 529, 552, 606, 643, 673, 693, 713, 732, 805, 864, 874, 934, 999, 15, 27, 45, 54, 78, 90, 108,
+ 180, 216, 305, 483, 560, 579, 600, 605, 609, 719, 778, 783, 852, 876, 886, 899, 918, 983, 46, 53
+ , 89, 167, 178, 185, 203, 213, 271, 297, 324, 334, 336, 360, 370, 406, 426, 467, 542, 551, 610,
+ 621, 649, 668, 726, 740, 786, 791, 810, 820, 835, 900, 917, 931, 951, 965, 975, 30, 51, 105, 156
+ , 205, 210, 224, 279, 303, 356, 366, 388, 405, 410, 438, 449, 459, 536, 541, 594, 599, 622, 655,
+ 720, 812, 818, 862, 867, 933, 29, 43, 71, 83, 92, 101, 106, 143, 173, 283, 311, 312, 346, 392,
+ 409, 420, 437, 443, 557, 566, 593, 642, 659, 672, 692, 707, 712, 737, 757, 869, 879, 911, 998,
+ 60, 102, 241, 327, 353, 363, 399, 425, 482, 558, 565, 624, 679, 718, 735, 749, 769, 798, 898,
+ 963, 982, 58, 86, 166, 183, 184, 202, 212, 219, 233, 286, 359, 431, 466, 615, 636, 648, 689, 729
+ , 801, 815, 840, 845, 850, 855, 884, 916, 930, 950, 964, 974, 981, 995, 1015, 57, 85, 99, 120,
+ 171, 199, 204, 229, 318, 329, 339, 368, 404, 448, 458, 465, 499, 654, 671, 685, 784, 789, 823,
+ 872, 882, 915, 932, 949, 997, 1007, 116, 142, 159, 172, 277, 408, 436, 442, 455, 481, 491, 547,
+ 572, 587, 617, 630, 658, 665, 706, 723, 736, 756, 776, 781, 816, 860, 894, 897, 910, 947, 991,
+ 114, 221, 240, 269, 281, 309, 315, 332, 342, 344, 378, 398, 424, 441, 475, 487, 531, 618, 629,
+ 678, 695, 734, 743, 748, 808, 833, 843, 929, 943, 962, 973, 113, 182, 189, 218, 227, 232, 301,
+ 364, 374, 430, 457, 523, 553, 562, 602, 607, 688, 728, 753, 796, 830, 865, 875, 927, 980, 994,
+ 1014, 55, 79, 91, 109, 170, 187, 198, 215, 228, 284, 415, 464, 498, 554, 561, 601, 670, 675, 684
+ , 715, 745, 765, 779, 848, 853, 877, 887, 909, 914, 948, 979, 996, 1006, 1013, 47, 110, 158, 249
+ , 316, 325, 335, 337, 361, 371, 397, 447, 454, 480, 490, 497, 538, 543, 611, 632, 664, 722, 787,
+ 811, 821, 880, 896, 913, 946, 961, 971, 990, 1011, 31, 94, 220, 245, 357, 367, 429, 440, 474,
+ 486, 537, 595, 623, 651, 681, 694, 701, 742, 759, 813, 819, 858, 863, 892, 928, 942, 945, 972,
+ 989, 993, 1003, 1023, 62, 93, 107, 188, 207, 226, 237, 243, 313, 340, 347, 376, 456, 471, 473,
+ 507, 567, 568, 626, 752, 890, 907, 926, 1005, 61, 103, 124, 175, 186, 214, 372, 414, 453, 463,
+ 489, 503, 559, 625, 638, 674, 691, 714, 731, 739, 744, 764, 794, 799, 828, 908, 925, 939, 959,
+ 978, 1012, 59, 87, 122, 248, 287, 350, 396, 413, 446, 485, 495, 496, 637, 751, 826, 841, 851,
+ 885, 912, 941, 960, 970, 977, 1010, 118, 121, 235, 244, 319, 369, 382, 428, 445, 574, 650, 667,
+ 680, 700, 758, 761, 785, 873, 883, 944, 988, 992, 1002, 1009, 1022, 117, 206, 223, 231, 236, 242
+ , 470, 472, 506, 573, 631, 687, 777, 817, 856, 861, 895, 906, 987, 1004, 1021, 115, 174, 191, 333
+ , 343, 345, 379, 452, 462, 469, 488, 502, 505, 619, 690, 697, 730, 738, 755, 809, 888, 924, 938,
+ 958, 969, 1019, 253, 365, 375, 412, 484, 494, 501, 563, 603, 750, 767, 792, 797, 831, 923, 940,
+ 957, 976, 1001, 234, 251, 285, 348, 444, 479, 555, 634, 666, 760, 824, 849, 905, 955, 1008, 111,
+ 222, 230, 247, 317, 380, 461, 511, 539, 633, 686, 703, 747, 881, 937, 986, 1020, 95, 190, 468,
+ 493, 504, 570, 696, 754, 859, 893, 968, 985, 1018, 63, 126, 252, 341, 377, 500, 569, 627, 683,
+ 766, 891, 922, 956, 1000, 1017, 125, 239, 250, 373, 478, 639, 795, 829, 904, 921, 954, 123, 246,
+ 351, 460, 477, 510, 702, 746, 763, 827, 936, 953, 119, 383, 492, 509, 575, 984, 682, 699, 857,
+ 1016, 238, 255, 889, 920, 476, 762, 793, 952, 349, 508, 635, 825, 381, 698, 254, 571, 127 };
+
+static int cbest_11[1023] = { 1,
+ 2, 1026, 4, 513, 8, 16, 1282, 32, 64, 641, 128, 256, 512, 1346, 1024, 3, 673, 1027, 5, 10, 20, 40, 80, 160, 320,
+ 640, 6, 9, 515, 1030, 1280, 1539, 17, 517, 1034, 1283, 12, 18, 33, 521, 1042, 1362, 34, 65, 529, 1058, 1286, 1795,
+ 24, 36, 66, 129, 545, 643, 1090, 1290, 1667, 68, 130, 257, 577, 645, 672, 1154, 1298, 1344, 48, 72, 132, 258, 336,
+ 649, 681, 1314, 1347, 136, 168, 260, 514, 657, 769, 1538, 1923, 84, 96, 144, 264, 516, 1025, 1350, 1410, 1859, 42,
+ 272, 520, 705, 1032, 1354, 11, 21, 41, 81, 161, 192, 288, 321, 528, 675, 1028, 1537, 1699, 1794, 7, 22, 82, 162,
+ 322, 544, 642, 677, 897, 1031, 1046, 1066, 1106, 1186, 1281, 1366, 1378, 1666, 14, 44, 164, 324, 384, 523, 533,
+ 553, 576, 593, 644, 833, 1035, 1040, 1288, 1360, 1987, 13, 19, 28, 88, 328, 519, 648, 680, 689, 1043, 1056, 1284,
+ 1363, 1474, 1543, 1793, 1955, 26, 35, 56, 176, 656, 768, 1038, 1059, 1088, 1287, 1302, 1322, 1442, 1547, 1665,
+ 1922, 25, 37, 52, 67, 112, 340, 352, 525, 531, 737, 1091, 1152, 1291, 1296, 1555, 1858, 1875, 38, 69, 74, 104, 131,
+ 224, 547, 651, 661, 683, 704, 721, 961, 1050, 1062, 1155, 1299, 1312, 1345, 1370, 1571, 1799, 49, 70, 73, 133, 138,
+ 148, 170, 208, 259, 337, 448, 537, 549, 579, 647, 674, 929, 1094, 1294, 1315, 1352, 1536, 1603, 1671, 1698, 1803,
+ 1921, 50, 134, 137, 169, 261, 266, 276, 296, 338, 416, 581, 676, 896, 1074, 1098, 1158, 1348, 1394, 1408, 1675,
+ 1707, 1811, 1857, 2019, 76, 85, 97, 145, 262, 265, 522, 532, 552, 561, 585, 592, 653, 659, 685, 771, 832, 849,
+ 1064, 1162, 1194, 1306, 1318, 1351, 1386, 1411, 1506, 1683, 1827, 1986, 2003, 43, 86, 98, 140, 146, 172, 273, 344,
+ 518, 688, 773, 1033, 1110, 1122, 1170, 1355, 1490, 1542, 1697, 1792, 1927, 1954, 100, 193, 268, 274, 289, 597, 609,
+ 665, 697, 707, 777, 1029, 1044, 1104, 1184, 1330, 1364, 1376, 1414, 1546, 1664, 1731, 1863, 1931, 1963, 23, 46, 83,
+ 92, 152, 163, 184, 194, 290, 323, 368, 524, 530, 555, 693, 709, 736, 753, 785, 993, 1036, 1047, 1067, 1107, 1187,
+ 1218, 1320, 1358, 1367, 1379, 1418, 1450, 1545, 1554, 1867, 1874, 1939, 1985, 15, 30, 45, 60, 90, 120, 165, 180,
+ 196, 240, 280, 292, 325, 330, 360, 385, 480, 546, 650, 660, 679, 682, 713, 720, 745, 801, 899, 960, 977, 1041,
+ 1289, 1361, 1426, 1472, 1541, 1570, 1703, 1798, 1953, 29, 58, 89, 116, 166, 200, 232, 326, 329, 386, 464, 535, 536,
+ 548, 578, 595, 646, 835, 901, 928, 1048, 1057, 1070, 1190, 1285, 1300, 1368, 1382, 1440, 1475, 1559, 1579, 1602,
+ 1619, 1670, 1802, 1879, 1891, 1920, 27, 57, 177, 304, 388, 527, 557, 580, 691, 725, 837, 905, 937, 1039, 1054,
+ 1089, 1114, 1292, 1303, 1323, 1374, 1443, 1553, 1674, 1706, 1715, 1801, 1810, 1856, 1873, 1991, 2018, 2035, 53,
+ 106, 113, 178, 212, 332, 341, 353, 392, 424, 541, 560, 584, 601, 652, 658, 684, 770, 841, 848, 913, 1060, 1082,
+ 1096, 1153, 1202, 1297, 1402, 1478, 1522, 1569, 1673, 1682, 1705, 1797, 1826, 1959, 1995, 2002, 2027, 39, 54, 75,
+ 105, 114, 225, 342, 354, 400, 539, 569, 739, 772, 1051, 1063, 1078, 1092, 1138, 1160, 1192, 1304, 1313, 1326, 1371,
+ 1384, 1398, 1446, 1482, 1514, 1551, 1601, 1669, 1696, 1763, 1815, 1835, 1926, 71, 139, 149, 171, 209, 226, 298,
+ 356, 449, 565, 596, 608, 625, 663, 664, 696, 706, 723, 741, 776, 853, 865, 963, 1072, 1095, 1130, 1156, 1250, 1295,
+ 1310, 1353, 1392, 1687, 1730, 1747, 1809, 1862, 1930, 1962, 1971, 2007, 2017, 51, 78, 108, 135, 150, 210, 228, 267,
+ 277, 297, 339, 348, 417, 450, 551, 554, 587, 617, 655, 687, 692, 708, 752, 784, 931, 965, 992, 1009, 1075, 1099,
+ 1159, 1174, 1234, 1316, 1338, 1349, 1395, 1409, 1458, 1494, 1504, 1544, 1563, 1575, 1681, 1825, 1866, 1883, 1929,
+ 1938, 1961, 1984, 2001, 77, 142, 174, 263, 278, 346, 376, 418, 452, 496, 583, 669, 678, 701, 712, 729, 744, 761,
+ 800, 898, 933, 969, 976, 1001, 1065, 1108, 1120, 1163, 1168, 1195, 1307, 1319, 1334, 1356, 1387, 1416, 1448, 1488,
+ 1507, 1540, 1607, 1702, 1807, 1865, 1925, 1952, 87, 99, 141, 147, 156, 173, 188, 216, 248, 270, 300, 345, 372, 420,
+ 456, 488, 534, 563, 594, 667, 699, 757, 779, 789, 809, 834, 851, 900, 1102, 1111, 1123, 1171, 1328, 1412, 1491,
+ 1558, 1578, 1587, 1611, 1618, 1679, 1711, 1729, 1861, 1878, 1890, 1907, 1943, 2023, 94, 101, 124, 154, 186, 244,
+ 269, 275, 284, 526, 556, 589, 690, 724, 775, 836, 904, 936, 945, 981, 1045, 1068, 1105, 1166, 1185, 1198, 1216,
+ 1331, 1365, 1377, 1390, 1415, 1430, 1510, 1552, 1577, 1714, 1800, 1819, 1831, 1872, 1899, 1937, 1990, 2034, 47, 62,
+ 93, 102, 122, 153, 185, 195, 282, 291, 312, 362, 369, 432, 468, 540, 599, 600, 611, 715, 747, 840, 857, 912, 1037,
+ 1052, 1112, 1126, 1219, 1321, 1359, 1372, 1419, 1424, 1451, 1568, 1623, 1635, 1672, 1691, 1701, 1704, 1723, 1796,
+ 1958, 1994, 2011, 2026, 2043, 31, 61, 91, 121, 181, 197, 202, 234, 241, 281, 293, 308, 331, 361, 370, 481, 538,
+ 568, 613, 695, 711, 738, 755, 781, 787, 995, 1080, 1118, 1178, 1188, 1210, 1380, 1400, 1427, 1473, 1498, 1530,
+ 1550, 1557, 1600, 1617, 1668, 1719, 1735, 1762, 1779, 1814, 1834, 1843, 1877, 1889, 1935, 1967, 1993, 2025, 2039,
+ 59, 117, 167, 182, 198, 201, 233, 242, 294, 327, 387, 465, 482, 559, 564, 605, 624, 662, 722, 740, 803, 852, 864,
+ 881, 907, 917, 939, 962, 979, 997, 1049, 1071, 1086, 1146, 1191, 1206, 1222, 1266, 1301, 1324, 1369, 1383, 1406,
+ 1422, 1441, 1454, 1480, 1512, 1526, 1549, 1686, 1713, 1739, 1746, 1771, 1808, 1833, 1871, 1970, 1989, 2006, 2016,
+ 2033, 118, 305, 334, 364, 389, 394, 404, 426, 466, 484, 543, 550, 573, 586, 603, 616, 633, 654, 686, 717, 749, 793,
+ 805, 843, 873, 903, 930, 964, 1008, 1055, 1115, 1128, 1142, 1200, 1226, 1258, 1293, 1308, 1375, 1476, 1520, 1562,
+ 1574, 1680, 1824 };
+
+static int cbest_12[1023] = {
+ 1, 2, 2089, 4, 8, 3133, 16, 2088, 1044, 3, 32, 522, 261, 3639, 5, 64, 65, 2057, 2091, 6, 130, 3132, 260,
+ 2219, 9, 2093, 3117, 10, 128, 257, 2081, 3890, 12, 1566, 2217, 3129, 17, 514, 520, 3135, 18, 3196, 3637, 20,
+ 256, 1028, 1045, 3638, 24, 33, 523, 783, 2105, 3197, 3647, 34, 1040, 1945, 2056, 2090, 2595, 3125, 3891, 36,
+ 1046, 2348, 40, 1598, 2218, 3635, 7, 48, 66, 67, 131, 263, 269, 512, 538, 782, 1076, 1174, 1819, 2049, 2059,
+ 2092, 2152, 3113, 3116, 3384, 3607, 3888, 68, 69, 277, 554, 1108, 1109, 1558, 1564, 1944, 2061, 2080, 2478,
+ 3894, 72, 73, 134, 265, 293, 391, 526, 586, 587, 779, 2153, 2211, 2216, 2563, 3128, 4016, 11, 14, 80, 81,
+ 129, 132, 138, 530, 1172, 1692, 2073, 2095, 2223, 2316, 3101, 3109, 3119, 3134, 3645, 3874, 13, 96, 97, 146,
+ 259, 262, 268, 289, 799, 1052, 1060, 1158, 1567, 1818, 2083, 2221, 2344, 2349, 2476, 2980, 3045, 3192, 3368,
+ 3636, 144, 162, 276, 389, 515, 521, 578, 579, 781, 972, 1024, 1156, 2085, 2120, 2235, 2282, 2312, 3131, 3623,
+ 19, 22, 28, 160, 194, 195, 264, 292, 390, 778, 846, 1238, 1239, 1562, 1596, 1684, 1937, 1947, 2104, 2233,
+ 2593, 3193, 3385, 3633, 3643, 3646, 3889, 21, 26, 324, 518, 536, 909, 1029, 1297, 2008, 2121, 2209, 2283,
+ 2479, 2594, 2854, 3085, 3121, 3124, 3188, 3198, 3605, 3895, 25, 136, 258, 288, 528, 552, 798, 842, 1030,
+ 1140, 1141, 1490, 1556, 2470, 2603, 2626, 2721, 3892, 3898, 4017, 35, 38, 44, 56, 320, 388, 423, 486, 524,
+ 570, 584, 585, 618, 619, 648, 777, 780, 1041, 1594, 1817, 2097, 2107, 2317, 2561, 3105, 3164, 3189, 3199,
+ 3263, 3634, 3875, 37, 42, 52, 285, 309, 421, 1036, 1042, 1047, 1056, 1072, 1170, 1313, 1427, 1823, 1936,
+ 1946, 2048, 2053, 2058, 2065, 2109, 2280, 2345, 2477, 2579, 2627, 2981, 3041, 3044, 3112, 3127, 3369, 3388,
+ 3603, 3606, 3631, 3765, 3872, 41, 192, 193, 273, 399, 775, 797, 908, 968, 1104, 1105, 1168, 1236, 1237, 1281,
+ 1296, 1554, 1582, 1599, 1949, 2051, 2060, 2313, 2332, 2340, 2474, 3115, 3165, 3336, 3360, 3641, 4018, 49, 70,
+ 71, 76, 77, 88, 89, 112, 113, 387, 454, 484, 513, 516, 539, 640, 744, 745, 1004, 1077, 1175, 1234, 1235,
+ 1550, 1668, 1680, 1694, 1803, 2063, 2112, 2210, 2215, 2227, 2281, 2472, 2562, 2784, 3261, 3449, 3547, 3591,
+ 3621, 84, 85, 104, 105, 142, 154, 178, 226, 227, 242, 243, 267, 271, 385, 422, 534, 555, 771, 776, 791, 834,
+ 840, 844, 973, 1068, 1078, 1166, 1425, 1488, 1559, 1560, 1565, 1688, 1816, 2072, 2077, 2094, 2136, 2144,
+ 2154, 2222, 2298, 2336, 2350, 2468, 2543, 2611, 2624, 2658, 2729, 2850, 2855, 2982, 3081, 3097, 3100, 3108,
+ 3118, 3184, 3247, 3326, 3376, 3644, 3702, 4024, 82, 83, 135, 164, 170, 210, 211, 279, 284, 308, 328, 356,
+ 372, 417, 420, 527, 542, 576, 577, 616, 617, 656, 795, 847, 1048, 1110, 1111, 1312, 1329, 1392, 1426, 1590,
+ 1592, 1822, 2082, 2113, 2128, 2156, 2187, 2220, 2364, 2466, 2471, 2785, 2976, 3077, 3111, 3180, 3245, 3882,
+ 3893, 3899, 3902, 4081, 15, 30, 50, 60, 98, 99, 120, 121, 133, 139, 166, 272, 291, 295, 325, 340, 398, 419,
+ 531, 558, 582, 583, 712, 713, 774, 796, 838, 1064, 1084, 1173, 1182, 1232, 1233, 1280, 1522, 1542, 1676,
+ 1693, 1811, 1948, 2009, 2084, 2137, 2145, 2155, 2185, 2213, 2225, 2234, 2286, 2299, 2599, 2601, 2625, 2659,
+ 2745, 2870, 2988, 3047, 3130, 3185, 3194, 3327, 3352, 3386, 3545, 3601, 3619, 3622, 3629, 3703, 3733, 3773,
+ 3896, 3955, 140, 147, 152, 176, 186, 198, 199, 224, 225, 281, 305, 332, 386, 397, 532, 546, 590, 591, 680,
+ 696, 843, 901, 911, 974, 1026, 1032, 1053, 1061, 1116, 1117, 1159, 1164, 1491, 1552, 1802, 1941, 2069, 2087,
+ 2122, 2129, 2157, 2168, 2232, 2296, 2314, 2318, 2446, 2567, 2592, 2737, 2852, 3040, 3073, 3093, 3103, 3156,
+ 3181, 3389, 3627, 3632, 3642, 3858, 3873, 4020, 74, 75, 145, 163, 168, 208, 209, 266, 270, 321, 384, 395,
+ 452, 487, 502, 664, 770, 790, 905, 1025, 1054, 1148, 1149, 1152, 1157, 1301, 1360, 1424, 1578, 1686, 1821,
+ 1851, 1929, 1939, 2012, 2208, 2287, 2328, 2333, 2341, 2346, 2381, 2397, 2444, 2464, 2475, 2723, 2752, 3084,
+ 3120, 3195, 3337, 3361, 3370, 3417, 3514, 3579, 3604, 3615, 3780, 3878, 3984, 4019, 23, 29, 46, 92, 93, 161,
+ 240, 241, 278, 348, 416, 450, 562, 568, 610, 611, 650, 769, 773, 789, 794, 1154, 1222, 1223, 1328, 1376,
+ 1494, 1520, 1563, 1597, 1685, 1951, 2010, 2075, 2123, 2169, 2284, 2297, 2473, 2511, 2541, 2602, 2619, 2666,
+ 2720, 3107, 3123, 3157, 3294, 3324, 3372, 3448, 3453, 3546, 3587, 3670, 3710, 27, 196, 197, 290, 294, 418,
+ 482, 519, 537, 540, 574, 688, 787, 793, 969, 1062, 1142, 1143, 1435, 1548, 1588, 1662, 1663, 1801, 1810,
+ 1992, 2055, 2096, 2101, 2106, 2337, 2351, 2469, 2539, 2542, 2560, 2571, 2577, 2634, 2753, 2851, 2919, 2983,
+ 3013, 3043, 3104, 3259, 3262, 3322, 3377, 3380, 3441, 3625, 3717, 3767, 3781, 3880, 3900, 3939, 4025, 4049,
+ 137, 174, 280, 304, 322, 326, 344, 352, 393, 396, 455, 480, 485, 529, 553, 556, 760, 761, 832, 900, 910, 964,
+ 970, 1005, 1031, 1124, 1125, 1180, 1289, 1317, 1333, 1540, 1557, 1574, 1580, 1586, 1690, 1724, 1921, 1940,
+ 2052, 2064, 2099, 2108, 2250, 2266, 2285, 2304, 2365, 2467, 2578, 2583, 2609, 2656, 2667, 2848, 2977, 2996,
+ 3126, 3166, 3176, 3215, 3239, 3257, 3295, 3325, 3512, 3602, 3617, 3630, 3671, 3700, 3711, 3761, 3764, 3842,
+ 3883, 3903, 4080, 39, 45, 57, 148, 172, 184, 287, 297, 336, 394, 448, 525, 544, 571, 588, 589, 649, 746, 747,
+ 835, 841, 845, 904, 996, 1006, 1038, 1074, 1080, 1092, 1093, 1178, 1220, 1221, 1300, 1489, 1492, 1538, 1595,
+ 1670, 1682, 1756, 1757, 1795, 1815, 1820, 1835, 1850, 1928, 1938, 2050, 2111, 2179, 2356, 2440, 2509, 2565,
+ 2587, 2635, 2871, 2915, 2978, 2984, 2989, 3046, 3053, 3114, 3148, 3172, 3207, 3292, 3323, 3340, 3353, 3356,
+ 3364, 3387, 3401, 3445, 3544, 3589, 3640, 3729, 3757, 3769, 3796, 3897, 3954, 3959, 251, 373, 581, 1057,
+ 1107, 1137, 1299, 1630, 1882, 2214, 2308, 2315, 2319, 2597, 2853, 3167, 3260, 3482, 3590, 3862 };
+
+static int cbest_13[1023] = {
+ 1, 2, 4109, 4, 6155, 8, 7176, 16, 3588, 4108, 32, 2054, 3, 1027, 1794, 6, 64, 4620, 6154, 5, 897, 4111, 6153,
+ 7177, 12, 2310, 3077, 4105, 128, 9, 24, 513, 1155, 10, 1026, 4101, 7689, 18, 48, 256, 896, 3589, 4365, 6159,
+ 7178, 17, 448, 4557, 4621, 4684, 5647, 36, 96, 224, 1025, 1792, 20, 112, 512, 769, 1538, 1795, 2052, 3076,
+ 3590, 6147, 6283, 7180, 33, 56, 72, 192, 2055, 2342, 3584, 4125, 6411, 7945, 28, 34, 577, 1154, 6922, 7240,
+ 14, 144, 384, 2050, 4110, 4493, 5134, 5903, 6152, 7168, 7304, 7, 40, 65, 1171, 3620, 4104, 4141, 5004, 288,
+ 768, 1153, 2308, 2567, 3652, 4097, 4107, 4397, 4685, 6157, 6171, 6379, 7688, 13, 68, 1024, 1810, 2311, 3461,
+ 4676, 7179, 66, 129, 576, 1826, 3079, 3844, 4100, 4173, 4622, 5646, 6347, 8073, 25, 80, 899, 905, 1031, 1091,
+ 1798, 2182, 2306, 2338, 2502, 3073, 3596, 4364, 4616, 5390, 5645, 6158, 6187, 7050, 7192, 11, 26, 913, 1922,
+ 4117, 4556, 4652, 5262, 6151, 6299, 6443, 6923, 7181, 19, 49, 130, 136, 257, 1169, 1283, 1536, 1793, 2062,
+ 2326, 2566, 2823, 4103, 4237, 6145, 7182, 7232, 7272, 7691, 7944, 50, 449, 515, 901, 961, 1035, 1163, 1170,
+ 1539, 1802, 2048, 2340, 2695, 3586, 3591, 3604, 3616, 4553, 4612, 4680, 4748, 5005, 6146, 6219, 6282, 7208,
+ 7241, 7288, 22, 37, 52, 97, 160, 225, 585, 1152, 1251, 1808, 2374, 2631, 3141, 3585, 4124, 4549, 4876, 5135,
+ 5902, 6281, 6410, 7169, 7172, 7305, 7320, 7560, 21, 38, 98, 113, 132, 226, 258, 452, 545, 898, 904, 1187,
+ 2053, 2070, 2278, 2438, 3205, 3460, 3525, 3636, 3972, 4127, 4137, 4157, 4367, 4677, 5132, 5839, 5901, 6163,
+ 6169, 6409, 6666, 7693, 57, 73, 100, 114, 193, 228, 272, 450, 456, 517, 912, 1029, 1043, 1159, 1219, 1796,
+ 2343, 3078, 3333, 4113, 4121, 4361, 4381, 4393, 4492, 4559, 4623, 4686, 5518, 6275, 6920, 7721, 8137, 29, 35,
+ 44, 74, 104, 146, 641, 1030, 1090, 1730, 1824, 2246, 2318, 2822, 3072, 3085, 3592, 3621, 3660, 3780, 4099,
+ 4140, 4149, 4165, 4205, 4333, 4617, 4700, 5454, 5643, 6149, 6377, 7051, 7170, 7244, 7753, 7947, 15, 30, 42,
+ 60, 76, 120, 145, 196, 240, 260, 292, 320, 385, 480, 514, 771, 900, 960, 1123, 1411, 1818, 1986, 2051, 2086,
+ 2304, 2694, 2951, 3622, 3648, 3653, 4096, 4106, 4221, 4396, 4429, 4589, 4636, 4653, 4716, 5422, 5679, 6156,
+ 6170, 6378, 6603, 7184, 7242, 7681, 8072, 41, 58, 194, 200, 521, 581, 584, 785, 865, 1059, 1139, 1168, 1282,
+ 1347, 1811, 2058, 2198, 2350, 2565, 2630, 3644, 4301, 4357, 4485, 4732, 5391, 5644, 5711, 6191, 6203, 6287,
+ 6297, 7193, 7296, 7306, 70, 88, 148, 208, 289, 544, 579, 801, 1034, 1089, 1099, 1162, 1315, 1570, 1827, 1830,
+ 1890, 2309, 2324, 2336, 2358, 2759, 3075, 3093, 3845, 4133, 4143, 4172, 4613, 4648, 4668, 4681, 4749, 4812,
+ 4996, 5263, 5278, 5388, 5639, 5775, 6167, 6175, 6185, 6315, 6346, 6383, 6415, 6427, 6447, 7194, 7949, 7961,
+ 69, 84, 116, 152, 264, 290, 386, 392, 516, 593, 773, 833, 909, 993, 1033, 1161, 1175, 1250, 1542, 1602, 1799,
+ 1800, 1920, 2118, 2498, 2727, 3140, 3597, 3654, 4253, 4389, 4399, 4495, 4672, 4780, 4877, 5260, 6186, 6195,
+ 6211, 6251, 6345, 6441, 6794, 6926, 6938, 7018, 7183, 7233, 7256, 7273, 7624, 7690, 7977, 67, 82, 400, 529,
+ 609, 640, 929, 1041, 1157, 1179, 1186, 1249, 1281, 1666, 1858, 1923, 2060, 2180, 2334, 2406, 2500, 2563,
+ 2711, 3173, 3204, 3463, 3524, 3598, 3628, 3716, 4036, 4116, 4169, 4489, 4509, 4608, 4618, 4660, 4678, 4764,
+ 5006, 5130, 5899, 5919, 6150, 6267, 6298, 6351, 6442, 6475, 6667, 6730, 6954, 7048, 7174, 7209, 7212, 7224,
+ 7289, 7308, 7400, 7432, 81, 140, 176, 232, 296, 416, 625, 770, 915, 945, 1028, 1042, 1158, 1185, 1218, 1537,
+ 1814, 2068, 2183, 2307, 2322, 2339, 2390, 2503, 2950, 3109, 3189, 3332, 3397, 3457, 3606, 3612, 3846, 4102,
+ 4236, 4525, 4687, 5000, 5020, 5582, 5641, 5871, 5935, 6144, 6339, 6403, 6539, 6921, 6986, 7173, 7188, 7196,
+ 7321, 7336, 7561, 7705, 7937, 8065, 8075, 27, 54, 108, 168, 216, 304, 388, 432, 520, 561, 580, 705, 777, 784,
+ 864, 907, 1039, 1167, 1203, 1546, 1803, 1806, 2314, 2366, 2382, 2564, 2639, 2821, 3081, 3084, 3149, 3221,
+ 3365, 3469, 3587, 3600, 3605, 3617, 3668, 3840, 4373, 4413, 4541, 4552, 4701, 5133, 5198, 5838, 5900, 6179,
+ 6218, 6303, 6395, 6914, 7042, 7216, 7234, 7274, 7322, 7368, 7692, 8169, 131, 137, 164, 464, 578, 673, 800,
+ 903, 1095, 1122, 1195, 1410, 1475, 1762, 1809, 1822, 1834, 2056, 2063, 2078, 2082, 2178, 2276, 2327, 2330,
+ 2370, 2599, 2627, 2693, 2758, 3137, 3477, 3594, 3618, 3684, 3812, 4119, 4175, 4548, 4555, 4573, 4637, 4644,
+ 4654, 4682, 4692, 4717, 5068, 5126, 5254, 5446, 5486, 5519, 5663, 5837, 5895, 6031, 6161, 6217, 6279, 6280,
+ 6291, 6435, 7210, 7236, 7276, 7290, 7695, 7720, 7725, 7737, 7745, 8136, 51, 138, 280, 352, 549, 592, 657,
+ 772, 832, 908, 917, 992, 1051, 1058, 1121, 1138, 1191, 1217, 1346, 1816, 1842, 2018, 2049, 2066, 2190, 2274,
+ 2341, 2346, 2372, 2510, 2629, 2726, 3493, 3521, 3608, 3632, 3637, 3700, 3860, 3973, 4126, 4136, 4156, 4189,
+ 4229, 4366, 4405, 4421, 4461, 4628, 4696, 4733, 5276, 5420, 5455, 5470, 5516, 5642, 5677, 6162, 6168, 6223,
+ 6273, 6363, 6408, 7171, 7245, 7264, 7752, 7946, 8077, 8089, 23, 53, 134, 161, 336, 454, 496, 528, 608, 921,
+ 928, 1057, 1088, 1098, 1107, 1137, 1173, 1314, 1379, 1540, 1554, 1728, 1797, 1930, 2102, 2196, 2316, 2348,
+ 2375, 2436, 2562, 2710, 2819, 3074, 3092, 3509, 3638, 3876, 4112, 4120, 4360, 4380, 4392, 4445, 4545, 4558,
+ 4565, 4605, 4614, 4649, 4669, 4740, 4813, 4997, 5386, 5406, 5418, 5423, 5452, 5675, 5678, 5695, 5703, 6274,
+ 6285, 6371, 6664, 7054, 7066, 7185, 7200, 7243, 7246, 7280, 7324, 7680, 7685, 7729, 7881, 39, 78, 133, 156,
+ 162, 227, 274, 312, 328, 453, 519, 569, 624, 914, 965, 1032, 1037, 1075, 1097, 1165, 1313, 1363, 1409, 1825,
+ 1828, 1938, 2074, 2084, 2214, 2242, 2279, 2439, 2534, 2691, 3143, 3593, 3650, 3656, 3661, 3781, 4123, 4148,
+ 4153, 4239, 4353, 4363, 4481, 4551, 4673, 4781, 5128, 5258, 5358, 5637, 5710, 5807, 5835, 5897, 5917, 6173,
+ 6355, 6413, 6419, 6425, 6459, 6795, 6858, 6927, 7019, 7297, 7300, 7307, 7312, 7562, 7723, 7953 };
+
+static int cbest_14[1023] = {
+ 1, 2, 8737, 4, 8, 13105, 16, 8736, 32, 4368, 3, 2184, 17, 1092, 15289, 34, 546, 5, 64, 68, 273, 8739, 8745,
+ 6, 136, 13104, 272, 9, 8741, 128, 8873, 12, 6552, 13109, 10, 544, 13107, 256, 18, 24, 3276, 8753, 33, 1088,
+ 4369, 15288, 16381, 2185, 8705, 13089, 13173, 20, 512, 1093, 2186, 4372, 8195, 8738, 8744, 13113, 35, 66,
+ 547, 1094, 1638, 2188, 4370, 4376, 8752, 8865, 12832, 15291, 36, 48, 65, 69, 257, 2176, 6416, 8707, 10921, 7,
+ 19, 25, 70, 137, 3208, 8740, 9008, 9829, 13088, 13169, 15281, 96, 132, 138, 277, 554, 1024, 1108, 1604, 6544,
+ 8749, 8872, 50, 140, 275, 281, 562, 802, 819, 3272, 4504, 7644, 8747, 8801, 8805, 12833, 13073, 13108, 21,
+ 100, 129, 192, 276, 401, 1636, 4352, 4436, 8743, 12563, 13106, 15257, 15293, 13, 38, 40, 72, 130, 200, 264,
+ 274, 280, 514, 550, 818, 2216, 2252, 6553, 8193, 8875, 9009, 15259, 11, 14, 49, 384, 400, 545, 1124, 2048,
+ 2218, 6554, 8761, 8877, 13075, 13111, 15273, 409, 1100, 1126, 14197, 15153, 15839, 42, 528, 768, 1109, 4097,
+ 4432, 8937, 9144, 15016, 16377, 16380, 51, 76, 98, 102, 204, 408, 552, 563, 803, 1090, 2200, 2248, 3277,
+ 8203, 8704, 10785, 13172, 144, 260, 548, 560, 800, 817, 1028, 1089, 1536, 3822, 4096, 4353, 8194, 8760,
+ 13112, 13117, 13241, 15272, 16365, 28, 258, 265, 1606, 1634, 4380, 6556, 8864, 8869, 8993, 9136, 10913,
+ 12561, 12836, 15290, 26, 80, 84, 196, 385, 513, 816, 1056, 2180, 2187, 2190, 3072, 3268, 3278, 4373, 4400,
+ 4496, 6145, 7508, 8706, 8755, 8941, 9016, 9145, 9287, 10920, 12013, 12290, 13175, 15017, 22, 67, 152, 1095,
+ 1639, 2189, 3212, 3292, 4371, 4374, 4377, 4568, 4572, 6418, 6536, 6584, 8225, 8929, 9825, 9828, 12849, 13093,
+ 13141, 13168, 13233, 14129, 14747, 15280, 16373, 16383, 37, 529, 570, 769, 1104, 1600, 1632, 2177, 5460,
+ 6144, 6417, 6586, 7636, 8466, 8713, 8748, 8867, 13091, 13097, 13171, 15837, 16313, 16364, 71, 134, 168, 285,
+ 392, 520, 530, 770, 1026, 1058, 1096, 2056, 2178, 2201, 2284, 3209, 4233, 4360, 4402, 4508, 4914, 6424, 7640,
+ 8709, 8721, 8746, 8757, 8800, 8804, 8992, 9121, 9137, 10923, 11809, 12837, 13072, 13115, 13137, 14193, 15566,
+ 97, 133, 139, 142, 145, 261, 288, 305, 337, 555, 561, 785, 801, 1025, 1120, 1140, 1605, 1646, 2112, 2116,
+ 2730, 3210, 3264, 3754, 4378, 6420, 6545, 8201, 8211, 8451, 8742, 8754, 8803, 8809, 8841, 8889, 9001, 9017,
+ 9831, 10925, 12291, 12562, 12834, 12840, 15161, 15256, 15283, 15292, 29, 56, 141, 259, 268, 279, 284, 674,
+ 1110, 1142, 1538, 1570, 1911, 2280, 2457, 3273, 3820, 4232, 4505, 6281, 6546, 7628, 7645, 8192, 8715, 8751,
+ 8769, 8874, 9012, 9285, 9317, 12848, 12960, 13057, 13077, 13092, 14880, 15258, 15265, 15285, 15295, 27, 52,
+ 74, 81, 85, 101, 160, 193, 304, 336, 516, 558, 784, 1060, 1348, 1365, 1540, 1637, 2052, 2254, 2286, 3140,
+ 3200, 3274, 3818, 4225, 4437, 4506, 4560, 6480, 6528, 6548, 7440, 8190, 8199, 8467, 8723, 8807, 8876, 9010,
+ 9129, 9837, 11945, 12307, 12547, 13074, 13090, 13096, 13110, 13143, 13655, 14745, 14777, 15241, 15353, 15357,
+ 15831, 23, 39, 41, 44, 73, 131, 153, 194, 201, 278, 290, 307, 403, 515, 536, 551, 610, 614, 806, 823, 1116,
+ 1122, 1228, 1612, 1910, 2050, 2217, 2220, 2253, 2456, 2696, 3224, 3240, 3293, 3720, 4105, 4438, 6280, 6537,
+ 8331, 8720, 8756, 8773, 8879, 8881, 9120, 9761, 9845, 10849, 10853, 12288, 12593, 13157, 13297, 13617, 14196,
+ 15008, 15152, 15243, 15567, 15835, 15838, 16245, 15, 58, 112, 170, 202, 283, 341, 405, 571, 682, 810, 1040,
+ 1105, 1125, 1364, 1568, 1620, 1860, 1877, 2049, 2192, 2219, 2250, 3076, 4101, 4112, 4224, 4356, 4500, 5392,
+ 6448, 6555, 7504, 7646, 8210, 8227, 8450, 8711, 8813, 8888, 8936, 9000, 10889, 12009, 12567, 12835, 12841,
+ 12968, 13059, 13177, 13181, 13305, 13309, 14888, 15155, 16108, 16376, 104, 306, 386, 393, 402, 531, 568, 771,
+ 822, 930, 1030, 1101, 1127, 1602, 1644, 2120, 2728, 3080, 3288, 3752, 3814, 4104, 4354, 4361, 4434, 4440,
+ 4444, 6153, 7576, 8202, 8219, 8464, 8602, 8843, 8933, 9013, 9797, 10784, 10789, 10891, 11877, 12571, 12896,
+ 12900, 12961, 13056, 13237, 14097, 14181, 14474, 14881, 15249, 15264, 15275, 16357, 16369, 43, 116, 148, 224,
+ 266, 282, 289, 340, 388, 404, 411, 465, 522, 576, 580, 608, 672, 1072, 1102, 1141, 1220, 1876, 1909, 2208,
+ 2222, 2240, 2244, 2282, 3136, 4100, 4433, 4468, 4564, 4912, 5456, 6273, 6400, 6484, 7444, 7620, 7632, 8323,
+ 8722, 8765, 9011, 9128, 9799, 9965, 10787, 10989, 12306, 12546, 13079, 13116, 13139, 13240, 13245, 14199,
+ 15033, 15240, 15251, 15261, 15564, 15869, 16317, 16361, 54, 57, 77, 99, 103, 162, 205, 232, 269, 518, 553,
+ 556, 566, 787, 1062, 1091, 1111, 1143, 1542, 2060, 2210, 2232, 2249, 3788, 3816, 6152, 6450, 7629, 7783,
+ 8182, 8197, 8209, 8241, 8449, 8481, 8587, 8763, 8868, 8880, 8939, 9146, 9295, 9844, 10912, 10917, 11471,
+ 11813, 12005, 12289, 12560, 13153, 13156, 13301, 13653, 13685, 15009, 15018, 15157, 15242, 15562, 16379, 46,
+ 53, 78, 208, 320, 410, 448, 464, 549, 612, 772, 804, 821, 938, 955, 1029, 1349, 1537, 1642, 1908, 2202, 2234,
+ 3242, 3284, 3722, 3823, 4240, 4464, 6160, 6272, 6529, 6558, 6576, 7098, 7237, 8233, 8329, 8771, 8775, 8871,
+ 8905, 8940, 9020, 9283, 9286, 10793, 12012, 12294, 12844, 12857, 12969, 13058, 13174, 13587, 14099, 14521,
+ 14739, 14761, 14889, 15020, 15321, 15430, 15833, 16109, 16349, 82, 88, 146, 262, 291, 537, 578, 680, 776,
+ 786, 807, 1032, 1074, 1117, 1160, 1574, 1607, 1635, 1654, 1894, 1907, 2124, 2182, 2204, 2440, 3084, 3225,
+ 3308, 3810, 4099, 4120, 4301, 4381, 4488, 4643, 6147, 6557, 6578, 6616, 6620, 7782, 8188, 8207, 8218, 8224,
+ 8465, 8603, 8833, 8928, 9249, 9824, 11873, 12305, 12337, 12545, 12565, 12577, 12853, 12897, 12901, 12977,
+ 13119, 13140, 13232, 13243, 14128, 14133, 14180, 14475, 14746, 15121, 15217, 15263, 15274, 15771, 15829,
+ 16177, 16356, 16372, 16382, 30, 59, 113, 197, 206, 469, 811, 820, 827, 896, 928, 954, 1041, 1057, 1136, 1216,
+ 1344, 1361, 1572, 1621, 1861, 2080, 2144, 2181, 2193, 2321, 3073, 3204, 3214, 3269, 3279, 4113, 4357, 4384,
+ 4404, 4420, 4497, 4922, 7236, 7492, 7509, 8579, 8600, 8712, 8729, 8866, 9138, 9319, 9827, 9833, 10915, 10985,
+ 11267, 12595, 12631, 12838, 13101, 13170, 13382, 13619, 13651, 14131, 14472, 14504, 15032, 15325, 15565,
+ 15827, 16125, 16312 };
+
+static int cbest_15[1023] = {
+ 1, 2, 16385, 4, 8, 24577, 16, 32, 64, 28673, 128, 256, 512, 1024, 30721, 2048, 4096, 8192, 16384, 31745, 3,
+ 6, 12, 24, 48, 96, 192, 384, 768, 1536, 3072, 6144, 12288, 24576, 5, 16387, 9, 16389, 10, 17, 16393, 24579,
+ 28672, 18, 33, 14336, 16401, 24581, 20, 34, 65, 7168, 16417, 24585, 32257, 36, 66, 129, 3584, 16449, 24593,
+ 28675, 40, 68, 130, 257, 1792, 16513, 24609, 28677, 30720, 72, 132, 258, 513, 896, 16641, 24641, 28681, 80,
+ 136, 260, 448, 514, 1025, 15360, 16897, 24705, 28689, 144, 224, 264, 516, 1026, 2049, 17409, 24833, 28705,
+ 30723, 112, 160, 272, 520, 1028, 2050, 4097, 7680, 18433, 25089, 28737, 30725, 56, 288, 528, 1032, 2052,
+ 4098, 8193, 20481, 25601, 28801, 30729, 31744, 32513, 28, 320, 544, 1040, 2056, 3840, 4100, 8194, 26625,
+ 28929, 30737, 14, 576, 1056, 2064, 4104, 8196, 16386, 29185, 30753, 7, 13, 25, 49, 97, 193, 385, 640, 769,
+ 1088, 1537, 1920, 2080, 3073, 4112, 6145, 8200, 12289, 15872, 16388, 29697, 30785, 31747, 26, 50, 98, 194,
+ 386, 770, 1152, 1538, 2112, 3074, 4128, 6146, 8208, 12290, 16391, 16392, 16397, 16409, 16433, 16481, 16577,
+ 16769, 17153, 17921, 19457, 22529, 24578, 30849, 31749, 52, 100, 196, 388, 772, 960, 1280, 1540, 2176, 3076,
+ 4160, 6148, 8224, 12292, 16400, 24580, 30977, 31753, 11, 104, 200, 392, 776, 1544, 2304, 3080, 4224, 6152,
+ 7936, 8256, 12296, 16416, 24583, 24584, 24589, 24601, 24625, 24673, 24769, 24961, 25345, 26113, 27649, 31233,
+ 31761, 32256, 19, 22, 208, 400, 480, 784, 1552, 2560, 3088, 4352, 6160, 8320, 12304, 14337, 16395, 16448,
+ 24592, 28674, 31777, 21, 35, 38, 44, 416, 800, 1568, 3104, 4608, 6176, 7169, 8448, 12320, 14338, 16403,
+ 16512, 24608, 28676, 31809, 32641, 37, 67, 70, 76, 88, 240, 832, 1600, 3136, 3585, 3968, 5120, 6208, 7170,
+ 8704, 12352, 14340, 16405, 16419, 16640, 23553, 24587, 24640, 28679, 28680, 28685, 28697, 28721, 28769,
+ 28865, 29057, 29441, 30209, 31873, 32259, 41, 69, 131, 134, 140, 152, 176, 1664, 1793, 3200, 3586, 6272,
+ 7172, 9216, 12416, 14344, 16128, 16421, 16451, 16896, 19969, 24595, 24704, 28688, 32001, 32261, 42, 73, 120,
+ 133, 259, 262, 268, 280, 304, 352, 897, 1794, 3328, 3588, 6400, 7176, 10240, 12544, 14352, 16425, 16453,
+ 16515, 17408, 18177, 24597, 24611, 24832, 28704, 30722, 32265, 74, 81, 137, 261, 449, 515, 518, 524, 536,
+ 560, 608, 704, 898, 1796, 1984, 3592, 6656, 7184, 12800, 14368, 15361, 16457, 16517, 16643, 17281, 18432,
+ 24613, 24643, 25088, 28161, 28683, 28736, 30724, 32273, 60, 82, 138, 145, 225, 265, 450, 517, 900, 1027,
+ 1030, 1036, 1048, 1072, 1120, 1216, 1408, 1800, 3600, 7200, 13312, 14400, 15362, 16465, 16521, 16645, 16833,
+ 16899, 20480, 24617, 24645, 24707, 25600, 26369, 28691, 28800, 30727, 30728, 30733, 30745, 30769, 30817,
+ 30913, 31105, 31489, 32289, 32512, 84, 113, 146, 161, 226, 266, 273, 452, 521, 904, 1029, 1808, 2051, 2054,
+ 2060, 2072, 2096, 2144, 2240, 2432, 2816, 3616, 7232, 7681, 8064, 14464, 15364, 16529, 16609, 16649, 16901,
+ 17411, 24649, 24709, 24835, 25473, 26624, 28693, 28707, 28928, 30736, 32321, 30, 57, 114, 148, 162, 228, 274,
+ 289, 456, 522, 529, 912, 992, 1033, 1824, 2053, 3648, 4099, 4102, 4108, 4120, 4144, 4192, 4288, 4480, 4864,
+ 5632, 7296, 7682, 14592, 15368, 16497, 16545, 16657, 16905, 17413, 18435, 24065, 24657, 24713, 24837, 25025,
+ 25091, 28709, 28739, 29184, 30752, 32385, 29, 58, 116, 164, 232, 276, 290, 321, 464, 530, 545, 928, 1034,
+ 1041, 1856, 2057, 3712, 3841, 4101, 7424, 7684, 8195, 8198, 8204, 8216, 8240, 8288, 8384, 8576, 8960, 9728,
+ 11264, 14848, 15376, 16441, 16673, 16913, 17417, 18437, 20483, 24721, 24801, 24841, 25093, 25603, 28713,
+ 28741, 28803, 29696, 30465, 30731, 30784, 31746, 32515, 15, 168, 292, 322, 532, 546, 577, 1042, 1057, 2058,
+ 2065, 3842, 4105, 7688, 8197, 15392, 16390, 16396, 16408, 16413, 16432, 16480, 16576, 16705, 16768, 16929,
+ 17152, 17425, 17920, 18441, 19456, 20225, 20485, 22528, 24689, 24737, 24849, 25097, 25605, 26627, 28745,
+ 28805, 28931, 29569, 30739, 30848, 31748, 32517, 32705, 296, 324, 496, 548, 578, 641, 1044, 1058, 1089, 1921,
+ 2066, 2081, 3844, 4032, 4106, 4113, 7696, 8201, 15424, 15873, 16256, 16399, 16961, 17441, 18449, 20489,
+ 24633, 24865, 25105, 25609, 26629, 28753, 28809, 28933, 29121, 29187, 30741, 30755, 30976, 31751, 31752,
+ 31757, 31769, 31793, 31841, 31937, 32129, 32521, 27, 51, 54, 99, 102, 108, 195, 198, 204, 216, 328, 387, 390,
+ 396, 408, 432, 552, 580, 642, 771, 774, 780, 792, 816, 864, 1060, 1090, 1153, 1539, 1542, 1548, 1560, 1584,
+ 1632, 1728, 1922, 2068, 2082, 2113, 3075, 3078, 3084, 3096, 3120, 3168, 3264, 3456, 3848, 4114, 4129, 6147,
+ 6150, 6156, 6168, 6192, 6240, 6336, 6528, 6912, 7712, 8202, 8209, 12291, 12294, 12300, 12312, 12336, 12384,
+ 12480, 12672, 13056, 13824, 15488, 15874, 17025, 17473, 18305, 18465, 20497, 24582, 24588, 24600, 24605,
+ 24624, 24672, 24768, 24897, 24960, 25121, 25344, 25617, 26112, 26633, 27648, 28417, 28817, 28897, 28937,
+ 29189, 29699, 30757, 30787, 31232, 31760, 32529, 53, 101, 197, 336, 389, 584, 644, 773, 961, 1064, 1092,
+ 1154, 1281, 1541, 1924, 2084, 2114, 2177, 3077, 3856, 4116, 4130, 4161, 6149, 7744, 8210, 8225, 12293, 15616,
+ 15876, 16394, 16411, 16435, 16483, 16579, 16771, 17155, 17537, 17923, 18497, 19459, 20513, 22531, 24591,
+ 25153, 25633, 26641, 28785, 28833, 28945, 29193, 29701, 30761, 30789, 30851, 31776, 32545, 105, 201, 248,
+ 393, 592, 648, 777, 962, 1096, 1156, 1282, 1545, 1928, 2088, 2116, 2178, 2305, 3081, 3872, 4132, 4162, 4225,
+ 6153, 7808, 7937, 8212, 8226, 8257, 12297, 15880, 16402, 16437, 16485, 16581, 16773, 17157, 17345, 17665,
+ 17925, 18561, 19461, 20545, 22533, 25217, 25665, 26497, 26657, 28729, 28961, 29201, 29705, 30793, 30853,
+ 30979, 31617, 31755, 31808, 32577, 32640, 23, 46, 92, 106, 184, 202, 209, 368, 394, 401, 481, 656, 736, 778,
+ 785, 964, 1104, 1160, 1284, 1472, 1546, 1553, 1936, 2016, 2120, 2180, 2306, 2561, 2944, 3082, 3089, 3904,
+ 4136, 4164, 4226, 4353, 5888, 6154, 6161, 7938, 8228, 8258, 8321, 11776, 12298, 12305, 15888, 16404, 16418,
+ 16489, 16585, 16777, 17161, 17929, 18689, 19465, 20609, 22537, 23552, 24321, 24586, 24603, 24627, 24675,
+ 24771, 24963, 25347, 25729, 26115, 26689, 27651, 28678, 28684, 28696, 28701, 28720, 28768, 28864, 28993,
+ 29056, 29217, 29440, 29713, 30208, 30801, 30857, 30981, 31169, 31235, 31763, 31872, 32258, 45, 78, 156, 312,
+ 624, 672, 801, 1554, 2562, 14384, 16593, 16785, 17937, 24677, 31237, 31765 };
+
+static int cbest_16[1023] = {
+ 1, 2, 34821, 4, 8, 52231, 16, 60934, 34820, 32, 17410, 3, 8705, 17, 34, 5, 68, 34823, 34829, 6, 136, 30467,
+ 34817, 39173, 52230, 64, 272, 52229, 9, 544, 52227, 10, 1088, 12, 2176, 26115, 4352, 50311, 60935, 18, 8704,
+ 45956, 60932, 20, 128, 24, 34837, 47876, 17408, 30466, 33, 36, 17411, 52239, 59974, 40, 22978, 48, 256, 8707,
+ 17414, 34822, 34828, 43524, 60930, 65286, 19, 35, 8709, 17418, 23938, 26114, 34816, 34836, 34853, 39172,
+ 52247, 21, 69, 15233, 21762, 30465, 34825, 52225, 52228, 7, 25, 38, 72, 137, 13057, 19586, 34831, 34881,
+ 52226, 42, 65, 80, 273, 10881, 26113, 29987, 34819, 34855, 39175, 50, 66, 70, 76, 545, 9793, 11489, 45957,
+ 60933, 60942, 11, 14, 84, 96, 138, 1089, 11969, 34957, 37253, 50310, 52261, 13, 152, 274, 1090, 2177, 34885,
+ 34949, 35093, 39169, 50309, 52246, 52263, 49, 144, 257, 546, 4353, 8713, 32643, 32901, 34845, 35365, 38341,
+ 47877, 60950, 22, 28, 140, 512, 2180, 4354, 25155, 30471, 35909, 40261, 45716, 52291, 52295, 100, 129, 160,
+ 276, 2178, 8706, 17426, 26123, 39717, 49351, 52235, 59975, 44, 98, 130, 168, 548, 4360, 8708, 8721, 34833,
+ 51271, 52237, 60928, 81, 132, 192, 304, 514, 1092, 15232, 30464, 30475, 34839, 34861, 39181, 43525, 45958,
+ 47872, 50307, 52238, 52367, 53703, 60931, 60951, 60966, 65287, 51, 88, 102, 204, 280, 408, 816, 1632, 3264,
+ 6528, 8720, 13056, 17409, 17412, 26119, 45952, 47396, 52243, 52503, 59972, 60964, 26, 37, 85, 170, 196, 288,
+ 340, 552, 680, 1360, 2720, 4356, 5440, 10880, 17416, 22858, 22976, 23936, 26112, 29986, 34844, 40565, 40805,
+ 52245, 52775, 54663, 41, 56, 153, 162, 200, 306, 612, 1096, 1224, 2448, 4896, 7616, 8737, 9792, 11488, 17422,
+ 17440, 21760, 22979, 30483, 34849, 34852, 39189, 47044, 47878, 49895, 50855, 59494, 60454, 65284, 145, 320,
+ 336, 1028, 2184, 8711, 11968, 17415, 17442, 34824, 34889, 34893, 43520, 50583, 52224, 60940, 60994, 23, 29,
+ 258, 392, 560, 608, 17419, 23698, 23939, 30482, 34827, 34830, 34832, 34863, 34880, 43526, 50319, 52259,
+ 56583, 60943, 64806, 46, 58, 74, 176, 260, 324, 384, 1024, 1104, 3808, 8225, 8712, 14993, 17427, 17474,
+ 19584, 21763, 32642, 34818, 34838, 34854, 39174, 45964, 52257, 57574, 59970, 39, 73, 82, 92, 264, 290, 2192,
+ 5744, 5984, 8769, 11429, 19587, 25154, 26131, 29985, 30227, 30470, 34869, 35077, 38933, 39205, 41604, 43532,
+ 52242, 52359, 58054, 60948, 61070, 63366, 65294, 43, 52, 112, 116, 400, 2056, 4368, 8739, 12577, 16450,
+ 17478, 20802, 21766, 26122, 29747, 30497, 34841, 34851, 34956, 37252, 42212, 50327, 52244, 52255, 52260,
+ 60998, 61206, 65282, 67, 71, 77, 89, 178, 184, 576, 672, 1904, 8773, 10401, 10883, 11425, 11849, 13061,
+ 15235, 15241, 17546, 18626, 22850, 22982, 26121, 26130, 30499, 34884, 34948, 34965, 35092, 39168, 39188,
+ 42692, 45700, 45717, 45972, 47884, 50191, 50308, 52233, 52262, 59014, 60938, 15, 27, 78, 97, 139, 148, 164,
+ 289, 352, 356, 1120, 1216, 2992, 8717, 8841, 9313, 13059, 13065, 17538, 17682, 19594, 23522, 30469, 30474,
+ 32900, 34857, 34883, 35364, 37013, 37249, 38340, 39171, 44612, 45836, 52293, 59906, 60246, 57, 86, 232, 275,
+ 640, 784, 1091, 2872, 8715, 8977, 9797, 17424, 17434, 17954, 23942, 26118, 26147, 29953, 32641, 34897, 34901,
+ 34945, 34981, 35908, 37255, 38221, 39233, 40260, 49349, 51127, 52290, 52294, 59766, 59982, 60110, 60929,
+ 60949, 154, 224, 321, 368, 547, 648, 712, 952, 4112, 4384, 9795, 11491, 15237, 16321, 17430, 22986, 25153,
+ 26117, 26145, 28787, 29027, 29955, 32403, 32897, 34871, 34887, 34951, 38337, 39207, 39716, 40533, 45596,
+ 45959, 47873, 47892, 49350, 51007, 51269, 52234, 52241, 52271, 52289, 52303, 60946, 60967, 104, 118, 141,
+ 156, 259, 328, 393, 513, 516, 561, 580, 704, 768, 800, 1496, 2181, 2208, 4355, 8725, 8736, 11393, 11457,
+ 11971, 19590, 21106, 21346, 22306, 22918, 29507, 30473, 30535, 34868, 34917, 34959, 35009, 35013, 35076,
+ 40021, 45701, 45953, 45973, 47397, 49231, 50305, 50341, 51270, 52236, 52269, 52279, 52311, 52487, 59973,
+ 60965, 30, 59, 101, 142, 161, 177, 186, 261, 277, 325, 385, 464, 520, 578, 786, 1105, 1344, 1424, 2179, 2182,
+ 4362, 8710, 8723, 10885, 17420, 19170, 20130, 23946, 24675, 25635, 29991, 30603, 31683, 32647, 34835, 34840,
+ 34860, 34909, 34953, 34973, 35095, 35101, 35911, 39180, 39309, 39477, 40517, 40549, 40737, 50306, 50326,
+ 50343, 52254, 52277, 52366, 53702, 59910, 59990, 60962, 61062, 45, 54, 83, 93, 99, 114, 131, 134, 146, 169,
+ 172, 194, 236, 265, 278, 291, 358, 476, 528, 549, 650, 736, 1094, 1122, 1436, 1572, 2048, 2193, 2240, 2432,
+ 2856, 4361, 8724, 8741, 11493, 17450, 17482, 19858, 21770, 22786, 22914, 22994, 30531, 32903, 32909, 34847,
+ 34913, 34964, 35089, 35367, 35373, 38343, 39183, 39237, 39719, 40535, 45988, 47045, 47879, 49827, 51267,
+ 52327, 52365, 52502, 53583, 53701, 54661, 59426, 59495, 60455, 60958, 65285, 53, 113, 117, 133, 193, 234,
+ 305, 308, 372, 448, 468, 515, 550, 582, 642, 716, 748, 936, 1093, 1152, 1300, 1408, 1428, 2210, 2848, 3144,
+ 4369, 8224, 8722, 11153, 11459, 11841, 11973, 14992, 17446, 17448, 22798, 25163, 26127, 30481, 34877, 34983,
+ 35333, 35361, 35917, 39177, 39685, 40263, 40564, 40804, 43521, 43540, 45236, 45476, 45580, 45718, 47636,
+ 47893, 49893, 50183, 50371, 50567, 50991, 52251, 52299, 52325, 52363, 52501, 52774, 53575, 54662, 60452,
+ 60941, 60947, 60995, 65302, 103, 166, 179, 185, 205, 281, 296, 312, 354, 370, 409, 472, 714, 740, 817, 928,
+ 1164, 1432, 1480, 1568, 1633, 1872, 2244, 2600, 3265, 4358, 4364, 4386, 5712, 6288, 6529, 8768, 8833, 9729,
+ 10065, 10673, 10889, 11397, 11428, 11761, 17413, 17444, 22790, 22856, 23682, 23954, 26183, 29713, 29984,
+ 29995, 30123, 30226, 34848, 34859, 34896, 34900, 35905, 40257, 43527, 45116, 45572, 45712, 45828, 46016,
+ 47156, 47392, 47874, 49894, 50854, 50983, 52307, 52499, 52773, 60960, 60974, 60992, 64807, 90, 149, 171, 197,
+ 208, 238, 282, 341, 353, 357, 518, 553, 681, 744, 770, 1098, 1156, 1280, 1296, 1361, 1472, 1634, 2188, 2328,
+ 2721, 2864, 3266, 3744, 4357, 4420, 5200, 5441, 6530, 7496, 8729, 8738, 9585, 9801, 9929, 10553, 11395,
+ 11497, 12576, 14977, 15249, 17417, 21778, 22794, 22859, 22977, 23696, 23818, 23937, 26129, 26251, 29715,
+ 29746, 30055, 30479, 30480, 30496, 33989, 34870, 34888, 34892, 34905, 34915, 34919, 34989, 35017, 38213,
+ 39191, 39713, 40741, 45954, 45965, 45990, 46924, 47908, 49347, 50317, 50582, 50735, 50853, 52240, 52267,
+ 52771, 54423, 56581, 57506, 57510, 57575, 59492, 59971, 59991, 60102, 60936, 60982, 198, 374, 1097, 1225,
+ 7488, 15113, 17441, 17472, 26387, 29883, 30211, 34879, 35205, 50463, 50999, 52258, 61071 };
+
+
+static int cbest_17[1023] = {
+ 1, 2, 65540, 4, 32770, 8, 16385, 16, 32, 73732, 64, 128, 36866, 256, 512, 18433, 1024, 2048, 4096, 74756,
+ 8192, 16384, 37378, 32768, 3, 18689, 65536, 65541, 5, 32771, 65542, 6, 9, 18, 36, 72, 144, 288, 576, 1152,
+ 2304, 4608, 9216, 18432, 98310, 10, 17, 16387, 32774, 36864, 65548, 81925, 12, 33, 16389, 32778, 49155,
+ 65556, 73728, 73733, 74884, 20, 34, 65, 16393, 32786, 65572, 73734, 24, 66, 129, 16401, 32802, 36867, 65604,
+ 106502, 40, 68, 130, 257, 16417, 32834, 65668, 73740, 90117, 102406, 48, 132, 258, 513, 16449, 32898, 36870,
+ 37442, 65796, 73748, 80, 136, 260, 514, 1025, 16513, 18435, 33026, 36874, 53251, 66052, 73764, 83973, 96,
+ 264, 516, 1026, 2049, 16641, 18437, 33282, 36882, 51203, 66564, 73796, 160, 272, 520, 1028, 2050, 4097,
+ 16897, 18441, 18688, 33794, 36898, 37376, 67588, 73860, 74752, 74757, 110598, 192, 528, 1032, 2052, 4098,
+ 8193, 9344, 17409, 18449, 18721, 34818, 36930, 69636, 73988, 74758, 320, 544, 1040, 2056, 4100, 4672, 8194,
+ 18465, 36994, 74244, 92165, 107526, 384, 1056, 2064, 2336, 4104, 8196, 16386, 18497, 20481, 37122, 37379,
+ 40962, 74764, 81924, 91141, 640, 1088, 1168, 2080, 4112, 8200, 16388, 18561, 24577, 32769, 49154, 55299,
+ 74772, 75780, 102918, 584, 768, 2112, 4128, 8208, 16392, 37382, 37890, 74788, 77828, 98308, 292, 1280, 2176,
+ 4160, 8224, 16400, 18945, 32772, 32784, 37386, 38914, 53763, 65537, 74820, 74900, 146, 1536, 4224, 8256,
+ 16416, 18691, 19457, 32776, 37394, 49153, 65538, 65543, 65568, 84229, 90116, 111622, 7, 19, 37, 73, 145, 289,
+ 577, 1153, 2305, 2560, 4352, 4609, 8320, 9217, 16448, 18693, 37410, 45058, 51459, 75012, 98306, 98311,
+ 111110, 11, 38, 74, 290, 578, 1154, 2306, 3072, 4610, 8448, 9218, 16512, 18434, 18697, 22529, 32775, 32800,
+ 36865, 53250, 65544, 65549, 65558, 65612, 65684, 65828, 66116, 66692, 67844, 70148, 75268, 81921, 83972,
+ 93189, 106500, 13, 22, 76, 148, 580, 1156, 2308, 4612, 5120, 8704, 9220, 16640, 18436, 18705, 26625, 32779,
+ 32806, 32832, 32842, 32914, 33058, 33346, 33922, 35074, 37506, 41986, 51202, 65550, 65552, 65557, 73729,
+ 74880, 74885, 81927, 102404, 14, 21, 26, 35, 44, 152, 296, 1160, 2312, 4616, 6144, 9224, 16391, 16403, 16421,
+ 16457, 16529, 16673, 16896, 16961, 17537, 18440, 20993, 25601, 32787, 32896, 36868, 36880, 37440, 37450,
+ 37634, 65573, 73730, 73735, 73760, 74886, 76804, 92421, 98318, 114695, 25, 52, 67, 88, 304, 592, 2320, 4624,
+ 9232, 10240, 16395, 17408, 18448, 18720, 18753, 32782, 32803, 33024, 36872, 49159, 53249, 55811, 65574,
+ 65600, 65605, 78852, 81933, 98326, 106498, 106503, 107654, 41, 50, 69, 104, 131, 176, 608, 1184, 4640, 9248,
+ 12288, 16397, 18464, 18817, 32790, 32835, 33280, 38402, 49163, 51201, 55555, 65564, 65606, 65664, 65669,
+ 73736, 73741, 73750, 73804, 73876, 74020, 74308, 74892, 76036, 78340, 81941, 90113, 91269, 92164, 98342,
+ 102402, 102407, 28, 42, 49, 70, 82, 100, 133, 208, 259, 352, 1216, 2368, 9280, 9360, 16405, 16419, 18496,
+ 20480, 32794, 32899, 33792, 36871, 36896, 37443, 39426, 49171, 65580, 65670, 65792, 65797, 73742, 73744,
+ 73749, 81957, 90119, 91140, 98374, 110596, 81, 134, 137, 164, 200, 261, 416, 515, 704, 2432, 4736, 16409,
+ 16451, 18560, 19201, 24576, 32810, 32838, 33027, 34816, 36875, 36902, 36928, 36938, 37010, 37154, 38018,
+ 39170, 46082, 49187, 55298, 65588, 65798, 66048, 66053, 73765, 74916, 81989, 83969, 98438, 102982, 106510,
+ 112134, 122887, 97, 138, 262, 265, 274, 328, 400, 517, 832, 1027, 1408, 4680, 4864, 9472, 16425, 16453,
+ 16515, 18725, 19713, 32818, 32902, 33283, 36883, 36992, 37446, 45570, 49219, 65620, 65676, 66054, 66560,
+ 66565, 73766, 73792, 73797, 74948, 82053, 83975, 90125, 98566, 102414, 106518, 107524, 118791, 56, 84, 98,
+ 140, 161, 266, 273, 518, 521, 530, 548, 656, 800, 1029, 1664, 2051, 2816, 9728, 16433, 16517, 16643, 18439,
+ 18451, 18469, 18505, 18577, 18944, 19009, 19585, 23041, 27649, 32850, 32906, 33030, 33795, 36878, 36899,
+ 37120, 37377, 40960, 49283, 53255, 53762, 53827, 65636, 65804, 66566, 67584, 67589, 73756, 73798, 73856,
+ 73861, 74753, 82181, 90133, 98822, 102422, 106534, 110594, 110599, 111750, 116743, 162, 193, 268, 522, 529,
+ 1030, 1033, 1042, 1060, 1096, 1312, 1600, 2053, 2340, 3328, 4099, 5632, 9345, 16465, 16521, 16645, 16899,
+ 18443, 18690, 19456, 22785, 32866, 33034, 33286, 34819, 36886, 36931, 37458, 49152, 49411, 51207, 53259,
+ 55297, 65700, 65812, 66060, 67590, 69632, 69637, 73772, 73862, 73984, 73989, 74754, 74759, 74784, 75140,
+ 82437, 83981, 84228, 90149, 93445, 99334, 102438, 102916, 106566, 194, 276, 321, 524, 545, 1034, 1041, 2054,
+ 2057, 2066, 2084, 2120, 2192, 2624, 3200, 4101, 4673, 6656, 8195, 9346, 11264, 16481, 16649, 16901, 17411,
+ 18445, 18692, 18723, 26881, 32930, 33042, 33290, 33798, 36890, 36995, 37380, 37392, 37474, 37888, 49667,
+ 51211, 51458, 53267, 65732, 66068, 66572, 69638, 73780, 73990, 74240, 74245, 75396, 82949, 83989, 84261,
+ 90181, 92161, 93317, 100358, 102470, 106630, 107522, 107527, 111174, 112, 168, 196, 280, 322, 385, 532, 546,
+ 1036, 1057, 1170, 2058, 2065, 2337, 4102, 4105, 4114, 4132, 4168, 4240, 4384, 4674, 5248, 6400, 8197, 9348,
+ 13312, 16545, 16657, 16905, 17413, 18453, 18467, 18696, 22528, 32962, 33298, 33802, 34822, 36906, 36934,
+ 37123, 37384, 38912, 40963, 42114, 50179, 51219, 51491, 53283, 53761, 65860, 66084, 66580, 67596, 70212,
+ 73812, 73868, 74246, 74760, 74765, 74774, 74828, 75044, 75332, 77060, 79364, 81920, 84005, 90245, 91137,
+ 92167, 93188, 102534, 106758, 110606, 126983, 324, 386, 536, 641, 1044, 1058, 1089, 1169, 2060, 2081, 2338,
+ 4106, 4113, 4676, 8198, 8201, 8210, 8228, 8264, 8336, 8480, 8768, 9352, 10496, 12800, 16577, 16913, 17417,
+ 18457, 18499, 18704, 18729, 20483, 25729, 26624, 33090, 33314, 33810, 34826, 36914, 36998, 37570, 51235,
+ 53315, 56067, 65924, 66596, 67604, 67876, 69644, 73828, 73996, 74766, 74768, 74773, 75776, 75781, 76932,
+ 81926, 84037, 86021, 90373, 91143, 102662, 102914, 102919, 107014, 110614, 111620, 124935, 388, 552, 585,
+ 642, 769, 1048, 1090, 2068, 2082, 2113, 4108, 4129, 8202, 8209, 16390, 16402, 16420, 16456, 16528, 16672,
+ 16705, 16929, 16960, 17425, 17536, 18473, 18501, 18563, 18737, 20485, 20992, 21057, 24579, 25600, 32912,
+ 33154, 33826, 34834, 35106, 36946, 37002, 37126, 37383, 37408, 37698, 37891, 40966, 45056, 51267, 51457,
+ 53379, 65824, 66180, 66628, 66708, 67620, 69652, 73892, 74004, 74252, 74789, 74902, 75782, 77824, 77829,
+ 78980, 84101, 90629, 92173, 92420, 98304, 98309, 107534, 110630, 111108, 114694, 120839, 123911, 392, 644,
+ 770, 1281, 2072, 2114, 2177, 4116, 4130, 4161, 8204, 8225, 16769, 17441, 18565, 24581, 33410, 33938, 34850,
+ 36962, 37130, 38530, 38915, 40970, 46594, 49158, 53248, 55303, 55810, 69668, 73924, 74260, 74790, 74821,
+ 74896, 74901, 77830, 91149, 92181, 92453, 107542, 110662 };
+
+static int cbest_18[1023] = {
+ 1, 2, 131136, 4, 65568, 8, 32784, 16, 16392, 32, 8196, 64, 4098, 128, 2049, 256, 512, 132160, 1024, 2048,
+ 66080, 4096, 33040, 8192, 16520, 8260, 16384, 3, 4130, 32768, 131137, 5, 65569, 131138, 6, 9, 2065, 32785,
+ 65536, 65570, 131140, 196704, 10, 17, 16393, 32786, 65572, 131144, 163920, 12, 18, 33, 8197, 16394, 32788,
+ 65576, 98352, 131072, 131152, 147528, 20, 34, 65, 4099, 8198, 16396, 32792, 65584, 81960, 131168, 132168,
+ 139332, 24, 36, 66, 129, 258, 516, 1032, 2064, 49176, 73764, 135234, 40, 68, 130, 257, 2051, 4102, 4128,
+ 8204, 16408, 32816, 40980, 65632, 69666, 131264, 133185, 48, 72, 132, 513, 2053, 4106, 8212, 8256, 16424,
+ 24588, 32848, 36882, 65696, 66084, 67617, 131392, 132161, 80, 136, 260, 514, 1025, 2057, 4114, 8228, 16456,
+ 16512, 20490, 32912, 34833, 65824, 131648, 132162, 96, 144, 264, 1026, 12294, 18441, 33024, 66081, 132164,
+ 197728, 160, 272, 520, 1028, 2050, 2081, 4162, 8324, 10245, 16648, 33042, 33296, 66048, 66082, 66592, 133184,
+ 164944, 197216, 192, 288, 528, 2052, 2113, 4097, 4226, 6147, 8452, 16904, 33041, 33808, 67616, 132096,
+ 132176, 148552, 320, 544, 1040, 2056, 2177, 4354, 8708, 17416, 34832, 66088, 98864, 132192, 135232, 140356,
+ 164176, 384, 576, 1056, 2305, 4100, 4610, 8193, 9220, 16521, 18440, 33044, 66096, 69664, 82472, 98608,
+ 136258, 640, 1088, 2080, 2561, 4104, 5122, 8194, 10244, 16522, 33048, 36880, 74276, 132288, 134209, 139268,
+ 139328, 147656, 768, 1152, 2112, 3073, 4112, 6146, 8261, 16385, 16524, 20488, 49432, 66144, 69634, 70178,
+ 73760, 82088, 132416, 1280, 2176, 8200, 8262, 12292, 16386, 33072, 34817, 40976, 41236, 49304, 66208, 68129,
+ 132672, 147520, 1536, 2304, 4131, 4160, 4352, 8208, 16388, 16536, 24584, 32769, 33104, 37138, 66336, 73828,
+ 81952, 2560, 4224, 6145, 8224, 8268, 8704, 16552, 24716, 32770, 33168, 35089, 41044, 49168, 131139, 134208,
+ 135266, 148544, 163904, 198240, 7, 3072, 4134, 8276, 12290, 16400, 16584, 17408, 20618, 24652, 32772, 65537,
+ 65571, 67104, 98336, 131141, 196705, 11, 2067, 4138, 4608, 8292, 8320, 10241, 16416, 18569, 24580, 32776,
+ 32787, 33552, 34816, 36914, 65538, 65573, 68128, 74272, 131142, 131145, 133201, 136256, 163921, 165200,
+ 196672, 196706, 13, 19, 2069, 4146, 5120, 8448, 12358, 16395, 16448, 16776, 20482, 20522, 32789, 34064,
+ 49160, 65540, 65574, 65577, 67633, 98353, 131073, 131146, 131153, 147529, 163922, 196708, 14, 21, 35, 2073,
+ 6144, 8199, 8388, 10309, 12326, 16397, 17032, 18433, 32790, 32793, 32800, 35088, 37136, 40964, 65544, 65578,
+ 65585, 69632, 70176, 81961, 98320, 98354, 99120, 131074, 131148, 131154, 131169, 132169, 139333, 140292,
+ 140352, 147530, 148680, 163924, 196712, 229488, 22, 25, 37, 67, 259, 517, 1033, 4194, 8516, 9216, 16398,
+ 16640, 17544, 18457, 32794, 32832, 36866, 49177, 65552, 65580, 65586, 73765, 81928, 81962, 98356, 131076,
+ 131156, 131170, 132170, 135235, 139334, 147532, 163928, 196640, 196720, 213096, 26, 38, 41, 69, 131, 518,
+ 1034, 2066, 2097, 4103, 4129, 4258, 6179, 8205, 8772, 10240, 10261, 16409, 16896, 18568, 32796, 32817, 32896,
+ 40981, 49178, 65588, 65633, 69667, 70146, 73732, 73766, 81964, 82600, 98360, 131080, 131160, 131172, 131265,
+ 131394, 131652, 132172, 133200, 139264, 163856, 180312, 197736, 204900, 28, 42, 49, 70, 73, 133, 262, 1036,
+ 2068, 2129, 4107, 4386, 6163, 8206, 8213, 8257, 9284, 16410, 16425, 24589, 32818, 32849, 36883, 40982, 49180,
+ 65592, 65600, 65634, 65697, 65826, 66085, 66600, 67632, 114744, 131088, 131176, 131266, 131393, 133187,
+ 135238, 135264, 139340, 147464, 147544, 163952, 164952, 172116, 200802, 44, 50, 74, 81, 134, 137, 261, 266,
+ 515, 524, 2055, 2072, 2193, 4115, 4132, 4642, 8214, 8229, 8258, 10308, 12288, 16412, 16426, 16457, 16513,
+ 18432, 20491, 20616, 24590, 32820, 32850, 32913, 33026, 33280, 33300, 33816, 35073, 41232, 49560, 65636,
+ 65664, 65698, 65825, 66052, 66086, 67585, 67619, 69670, 73772, 74340, 81976, 82464, 106548, 131104, 131184,
+ 131268, 131649, 132104, 132163, 132184, 133189, 135242, 136290, 139348, 147560, 155724, 164928, 168018,
+ 196832, 197220, 198753, 52, 76, 82, 97, 138, 145, 265, 274, 532, 1027, 1048, 2059, 2321, 4110, 4136, 5154,
+ 8230, 12295, 16428, 16458, 16514, 16650, 16908, 18456, 32824, 32852, 32914, 33025, 33792, 34835, 36886,
+ 36912, 40988, 65640, 65700, 65792, 67621, 69674, 73780, 73824, 90156, 98416, 102450, 131272, 131396, 131650,
+ 132165, 132200, 133193, 135170, 135250, 139364, 140364, 147648, 151626, 164048, 165969, 196960, 197729, 56,
+ 84, 98, 140, 146, 161, 273, 290, 521, 548, 1029, 1064, 2061, 2096, 2577, 4118, 4144, 4163, 6178, 8220, 8264,
+ 8325, 8454, 9228, 10260, 12356, 16460, 16516, 16649, 18443, 20480, 20494, 20520, 24712, 32856, 32916, 33043,
+ 33297, 34837, 36890, 41040, 41300, 49208, 49424, 57372, 65648, 65704, 65828, 66049, 66083, 66092, 66593,
+ 67625, 69682, 82024, 82080, 86058, 98480, 98848, 98868, 100401, 131200, 131280, 131400, 132166, 133121,
+ 134225, 136266, 143430, 149577, 164160, 164945, 197217, 197696, 197730, 88, 100, 148, 162, 193, 268, 289,
+ 322, 522, 529, 580, 1030, 1096, 2083, 2128, 3089, 4122, 4227, 4614, 5130, 6162, 8236, 8272, 8326, 8453,
+ 10247, 12324, 16440, 16905, 18445, 24604, 24648, 32920, 33028, 33298, 33809, 34841, 41012, 49240, 49296,
+ 53274, 65712, 65832, 66050, 66100, 66560, 66594, 77862, 82476, 84009, 98592, 131296, 131328, 131408, 131656,
+ 132097, 132177, 132296, 133186, 133217, 134217, 139460, 141381, 147784, 148553, 148672, 164178, 164432,
+ 164946, 197184, 197218, 197732, 104, 152, 164, 194, 276, 321, 386, 530, 545, 644, 1041, 1160, 2054, 2085,
+ 2115, 2192, 2307, 2565, 3081, 4166, 4192, 4355, 4384, 8244, 8288, 8709, 8768, 12302, 16472, 16528, 16652,
+ 16906, 17417, 17536, 20506, 24576, 24620, 24780, 32880, 33032, 33046, 33810, 35072, 36864, 37170, 45078,
+ 51225, 65840, 66089, 66596, 67584, 67618, 68145, 69730, 73892, 75813, 82216, 98610, 98865, 131424, 131584,
+ 131664, 132098, 132178, 132193, 132424, 133188, 135233, 135362, 137283, 139588, 140357, 148040, 148554,
+ 164177, 164948, 198752, 230512, 112, 168, 196, 280, 292, 385, 546, 577, 772, 1042, 1057, 1288, 2058, 2089,
+ 2117, 2179, 2320, 4101, 4170, 4230, 4256, 4611, 6151, 6177, 8332, 8710, 9221, 10253, 12310, 12354, 16488,
+ 16544, 17418, 24708, 28686, 32944, 33045, 33050, 33304, 33812, 34834, 34865, 36946, 41108, 43029, 49416,
+ 65760, 66056, 66090, 66097, 66148, 67620, 67681, 69665, 69794, 70144, 70182, 71715, 74020, 82473, 98609,
+ 98832, 98866, 99376, 131680, 132100, 132180, 132194, 132680, 133192, 133313, 135490, 136259, 139844, 140358,
+ 148556, 165968, 197224, 197664, 197744, 214120, 230000, 176, 200, 296, 324, 536, 641, 770, 1044, 1058, 1089,
+ 1544, 2060, 2121, 2181, 4105, 4178, 4234, 4358, 8460, 9222, 10305, 16576, 17420, 18442, 18473, 20554, 20610,
+ 32976, 33049, 33056, 34836, 34897, 38931, 41220, 65888, 66098, 66608, 67624, 67745, 68133, 74277, 74336,
+ 82440, 82474, 131520, 132196, 132418, 133120, 133441, 134224, 135746, 164880, 181336, 197232, 205924, 229744 };
+
+
+static int cbest_19[1023] = {
+ 1, 2, 262163, 4, 393242, 8, 196621, 16, 32, 360469, 64, 262162, 128, 131081, 3, 442393, 262161, 6, 256,
+ 393243, 5, 327703, 12, 196620, 512, 393240, 9, 24, 98310, 262167, 426008, 10, 458782, 48, 1024, 49155,
+ 196617, 17, 213004, 262171, 393234, 483359, 18, 96, 262147, 360471, 393246, 20, 2048, 16385, 196623, 286738,
+ 360468, 442392, 33, 192, 32770, 106502, 229391, 34, 65540, 196613, 270355, 360465, 36, 384, 4096, 131080,
+ 143369, 180234, 393226, 40, 65, 53251, 221196, 262195, 66, 768, 376852, 397338, 442395, 68, 8192, 24577,
+ 49154, 90117, 503836, 72, 129, 1536, 262160, 262227, 80, 110598, 198669, 288786, 393274, 132, 3072, 16384,
+ 98308, 188426, 196637, 274451, 327702, 333847, 483358, 7, 130, 136, 257, 131083, 262291, 307217, 144, 6144,
+ 49153, 163851, 213005, 262165, 286739, 327699, 360477, 393241, 393306, 426010, 13, 160, 55299, 131073,
+ 144393, 196616, 196653, 262166, 327701, 360453, 361493, 426009, 14, 26, 258, 264, 513, 12288, 53250, 94213,
+ 131085, 196609, 262419, 399386, 442385, 458783, 25, 52, 272, 26625, 98306, 98311, 251918, 344086, 360467,
+ 393370, 429080, 442397, 11, 104, 288, 24576, 32768, 90116, 106500, 196619, 196622, 196685, 229389, 262155,
+ 262170, 262175, 311312, 368661, 393218, 393235, 393238, 393244, 405530, 458778, 22, 28, 49, 208, 260, 320,
+ 514, 1025, 45058, 155656, 180235, 229390, 241679, 262146, 262169, 262675, 360470, 376853, 393232, 393247,
+ 442394, 458780, 44, 416, 528, 22529, 77828, 172043, 180232, 196612, 199693, 213000, 221197, 262145, 262179,
+ 289810, 360501, 393498, 415771, 442905, 483355, 491548, 19, 88, 97, 544, 832, 38914, 53249, 143368, 196749,
+ 275475, 309265, 442377, 475167, 483357, 21, 38, 50, 56, 176, 516, 576, 1026, 1664, 2049, 19457, 71684,
+ 114695, 131097, 135177, 202765, 214540, 263187, 270354, 288787, 334359, 360464, 426000, 76, 193, 352, 640,
+ 3328, 32771, 35842, 49152, 106498, 106503, 139273, 213006, 229387, 262151, 278546, 286736, 327711, 352278,
+ 360533, 393227, 393754, 426012, 446489, 458774, 35, 98, 152, 194, 704, 1056, 6656, 17921, 55298, 65536,
+ 65541, 196629, 196877, 245774, 262194, 262259, 273427, 294929, 319504, 327687, 348182, 376854, 393224,
+ 393258, 450585, 37, 42, 70, 112, 304, 385, 520, 1028, 1088, 1408, 2050, 4097, 13312, 16387, 49159, 94212,
+ 110596, 125959, 131113, 144905, 188427, 196615, 212996, 262193, 264211, 270353, 315408, 329751, 360473,
+ 362005, 372757, 382996, 397339, 483351, 503838, 41, 140, 608, 1152, 2816, 26624, 27649, 98318, 159752,
+ 188424, 198668, 221192, 262355, 271891, 331799, 360597, 394266, 399898, 442425, 499740, 503837, 67, 100, 280,
+ 388, 769, 1216, 1280, 5632, 47106, 65542, 98304, 99334, 107270, 157704, 176139, 196633, 196636, 197133,
+ 262226, 363541, 376848, 393230, 406554, 409627, 425992, 427032, 442384, 69, 74, 84, 224, 386, 560, 1032,
+ 2052, 2112, 2432, 4098, 8193, 11264, 16389, 49163, 49667, 79876, 90113, 122887, 131082, 131145, 143371,
+ 163849, 174091, 229383, 262199, 262225, 262547, 266259, 271123, 286742, 344087, 360479, 393275, 393338,
+ 397336, 428056, 429336, 442396, 458766, 470046, 483615, 503832, 73, 196, 1120, 1537, 2176, 4864, 22528,
+ 23553, 32774, 55297, 78852, 81925, 98326, 163850, 213516, 221198, 262203, 274450, 307219, 311313, 327735,
+ 333846, 360455, 360725, 393266, 393272, 395290, 398874, 416795, 442399, 442457, 460830, 81, 134, 770, 776,
+ 2240, 2304, 9728, 39938, 53248, 110594, 110599, 131072, 137225, 143361, 144392, 180226, 196649, 196652,
+ 196669, 197645, 198665, 214028, 251919, 262290, 262931, 270359, 286994, 303121, 307216, 327697, 356374,
+ 360449, 393434, 434200, 442387, 483343, 133, 148, 168, 448, 1040, 2056, 2560, 3073, 4100, 4480, 8194, 16393,
+ 16897, 19456, 24579, 39426, 49171, 90119, 98309, 106758, 114694, 131084, 131209, 143373, 180238, 191498,
+ 196608, 199949, 221188, 249870, 251916, 262164, 262211, 262231, 275987, 286746, 288784, 289811, 323600,
+ 327698, 333843, 360461, 360476, 361495, 393307, 398106, 415770, 426011, 442904, 443161, 448537, 466974,
+ 483354, 485407, 131, 137, 200, 1538, 4224, 8960, 17409, 19969, 32778, 53255, 53635, 57347, 72196, 98342,
+ 106510, 107014, 131077, 143497, 153608, 155657, 196639, 196717, 203277, 213020, 217100, 262235, 263699,
+ 270611, 274449, 286722, 325136, 327700, 327767, 333845, 350230, 360452, 360981, 361492, 393278, 393298,
+ 393304, 393626, 397330, 426040, 442376, 442521, 475166, 483356, 487455, 82, 138, 145, 268, 772, 1552, 4352,
+ 6145, 17920, 18433, 19713, 33794, 45056, 53379, 65548, 106496, 166923, 180233, 196618, 196645, 196681,
+ 196684, 198671, 213001, 229385, 229388, 230415, 241677, 262289, 262418, 270363, 270867, 317456, 334615,
+ 349206, 360485, 397342, 397722, 399387, 417819, 429082, 442369, 442389, 443929, 458814, 503828, 146, 161,
+ 296, 336, 392, 896, 2064, 3074, 4104, 4608, 8196, 16386, 16401, 20481, 24581, 34818, 36098, 49158, 49187,
+ 53507, 76804, 108550, 125958, 131087, 131089, 131337, 161800, 172041, 180746, 196813, 199437, 214668, 221452,
+ 241678, 262153, 262275, 262295, 265235, 270339, 271379, 273939, 278547, 286737, 290834, 311314, 327707,
+ 330775, 352279, 360466, 360503, 361489, 368663, 393216, 393371, 394010, 397466, 413723, 429081, 446488,
+ 458770, 458776, 15, 27, 54, 108, 162, 216, 259, 265, 432, 864, 1540, 1728, 3456, 5120, 6912, 12289, 13824,
+ 27648, 32786, 36866, 38912, 49157, 53259, 67588, 94209, 98374, 106518, 131075, 172042, 178187, 196655,
+ 198661, 199692, 213036, 214541, 229407, 233487, 262154, 262174, 262299, 272403, 319505, 327831, 348183,
+ 360497, 368660, 376855, 393219, 393239, 393245, 393290, 393310, 393362, 397594, 401434, 405531, 426014,
+ 426072, 442379, 442649, 450584, 458779, 483391, 503964, 514077, 53, 266, 273, 536, 3104, 6146, 8448, 18049,
+ 24833, 38402, 40962, 54275, 65556, 69636, 86021, 90373, 98307, 114693, 124935, 145417, 158728, 188418,
+ 196611, 196677, 196745, 196748, 197005, 198733, 199053, 200717, 207885, 212992, 213007, 235023, 237583,
+ 251914, 262168, 262173, 262417, 262674, 268307, 288790, 288850, 315409, 339991, 344082, 376860, 377364,
+ 382997, 393233, 393236, 394778, 397850, 399384, 426002, 442907, 458781, 458846, 483347, 483350, 483353,
+ 491550, 30, 105, 262, 274, 289, 400, 592, 672, 1792, 2080, 3076, 4112, 8200, 8704, 16388, 16417, 24585,
+ 32769, 35840, 49162, 49219, 49666, 73732, 80900, 90112, 90125, 95749, 98314, 99846, 106501, 110726, 122886,
+ 131096, 131593, 135176, 143385, 144395, 175115, 180250, 184330, 188430, 198797, 202764, 223244, 245775,
+ 262144, 262149, 262178, 262183, 262323, 262403, 272147, 274455, 286770, 288914, 309267, 333911, 344084,
+ 360500, 360535, 360565, 362133, 368657, 376836, 388116, 393368, 393499, 397322, 398362, 400154, 407066,
+ 427544, 438296, 442424, 491549, 23, 29, 46, 92, 106, 184, 209, 261, 276, 290, 321, 368, 515, 736, 1472, 1544,
+ 2944, 5888, 9216, 11776, 12290, 19201, 23552, 32802, 45059, 53267, 55296, 81924, 94215, 98316, 98438, 106534,
+ 139272, 163843, 174603, 196687, 197389, 198861, 198925, 212997, 213002, 213068, 221212, 225292, 229386,
+ 229423, 241675, 245772, 262427, 272019, 274579, 275474, 288978, 305169, 307221, 309264, 327959, 345622,
+ 360529, 362517, 363797, 376849, 378900, 393354, 393374, 393490, 396314, 405528, 409626, 426136, 436248,
+ 461854, 470558, 471070, 483423, 503820, 504860, 417, 529, 1072, 6148, 10240, 92165, 101382, 125957, 144425,
+ 147464, 196741, 204813, 262159, 262339, 262673, 263186, 286743, 288794, 307345, 327683, 425984, 426001,
+ 432152, 442409, 458910, 505884 };
+
+static int cbest_20[1023] = {
+ 1, 2, 524292, 4, 262146, 8, 131073, 16, 32, 589828, 64, 128, 294914, 256, 512, 147457, 1024, 2048, 4096,
+ 598020, 8192, 16384, 32768, 299010, 65536, 131072, 149505, 262144, 3, 524288, 524293, 5, 262147, 524294,
+ 599044, 6, 9, 18, 36, 72, 144, 288, 576, 1152, 2304, 4608, 9216, 18432, 36864, 73728, 147456, 786438, 10, 17,
+ 131075, 262150, 294912, 524300, 655365, 12, 33, 131077, 262154, 393219, 524308, 589824, 589829, 20, 34, 65,
+ 131081, 262162, 299522, 524324, 589830, 24, 66, 129, 131089, 262178, 294915, 524356, 851974, 40, 68, 130,
+ 257, 131105, 262210, 524420, 589836, 720901, 819206, 48, 132, 258, 513, 131137, 262274, 294918, 524548,
+ 589844, 80, 136, 260, 514, 1025, 131201, 147459, 149761, 262402, 294922, 425987, 524804, 589860, 671749, 96,
+ 264, 516, 1026, 2049, 131329, 147461, 262658, 294930, 409603, 525316, 589892, 160, 272, 520, 1028, 2050,
+ 4097, 131585, 147465, 149504, 263170, 294946, 299008, 526340, 589956, 598016, 598021, 884742, 192, 528, 1032,
+ 2052, 4098, 8193, 74752, 132097, 147473, 264194, 294978, 528388, 590084, 598022, 320, 544, 1040, 2056, 4100,
+ 8194, 16385, 37376, 133121, 147489, 266242, 295042, 532484, 590340, 737285, 860166, 384, 1056, 2064, 4104,
+ 8196, 16386, 18688, 32769, 135169, 147521, 270338, 295170, 299011, 540676, 590852, 598028, 599172, 729093,
+ 640, 1088, 2080, 4112, 8200, 9344, 16388, 32770, 65537, 139265, 147585, 278530, 295426, 442371, 557060,
+ 591876, 598036, 823302, 768, 2112, 4128, 4672, 8208, 16392, 32772, 65538, 147713, 295938, 299014, 593924,
+ 598052, 1280, 2176, 2336, 4160, 8224, 16400, 32776, 65540, 131074, 147969, 163841, 296962, 299018, 327682,
+ 430083, 598084, 655364, 1168, 1536, 4224, 8256, 16416, 32784, 65544, 131076, 148481, 149507, 196609, 262145,
+ 299026, 393218, 598148, 606212, 673797, 892934, 584, 2560, 4352, 8320, 16448, 32800, 65552, 131080, 149509,
+ 299042, 299586, 303106, 411651, 598276, 622596, 786436, 888838, 292, 3072, 8448, 16512, 32832, 65568, 131088,
+ 149513, 151553, 262148, 262160, 299074, 311298, 524289, 598532, 745477, 146, 5120, 8704, 16640, 32896, 65600,
+ 131104, 149521, 155649, 262152, 299138, 393217, 524290, 524295, 524320, 599040, 599045, 720900, 7, 19, 37,
+ 73, 145, 289, 577, 1153, 2305, 4609, 6144, 9217, 16896, 18433, 33024, 36865, 65664, 73729, 131136, 149537,
+ 299266, 299520, 360450, 599046, 600068, 739333, 786434, 786439, 11, 38, 74, 290, 578, 1154, 2306, 4610, 9218,
+ 10240, 17408, 18434, 33280, 36866, 65792, 73730, 131200, 147458, 149569, 149760, 180225, 262151, 262176,
+ 294913, 425986, 446467, 524296, 524301, 524310, 524364, 524436, 524580, 524868, 525444, 526596, 528900,
+ 533508, 542724, 561156, 602116, 655361, 671748, 851972, 861190, 13, 22, 76, 148, 580, 1156, 2308, 4612, 9220,
+ 12288, 18436, 33792, 36868, 66048, 73732, 131328, 147460, 149633, 149793, 212993, 262155, 262182, 262208,
+ 262218, 262290, 262434, 262722, 263298, 264450, 266754, 271362, 280578, 300034, 335874, 409602, 444419,
+ 524302, 524304, 524309, 589825, 599052, 655367, 730117, 819204, 14, 21, 26, 35, 44, 152, 296, 1160, 2312,
+ 4616, 9224, 18440, 20480, 34816, 36872, 66560, 73736, 74880, 131079, 131091, 131109, 131145, 131217, 131361,
+ 131584, 131649, 132225, 133377, 135681, 140289, 147464, 167937, 204801, 262163, 262272, 294916, 294928,
+ 299523, 301058, 524325, 589826, 589831, 589856, 599060, 614404, 786446, 917511, 25, 52, 67, 88, 304, 592,
+ 2320, 4624, 9232, 18448, 24576, 36880, 67584, 73744, 131083, 132096, 147472, 150017, 262158, 262179, 262400,
+ 294920, 393223, 425985, 524326, 524352, 524357, 599076, 630788, 655373, 786454, 823814, 851970, 851975,
+ 897030, 41, 50, 69, 104, 131, 176, 608, 1184, 4640, 9248, 18464, 36896, 37440, 40960, 69632, 73760, 131085,
+ 133120, 147488, 150529, 262166, 262211, 262656, 299526, 307202, 393227, 409601, 524316, 524358, 524416,
+ 524421, 589832, 589837, 589846, 589900, 589972, 590116, 590404, 590980, 592132, 594436, 599108, 608260,
+ 626692, 655381, 720897, 737284, 786470, 819202, 819207, 28, 42, 49, 70, 82, 100, 133, 208, 259, 352, 1216,
+ 2368, 9280, 18496, 36928, 49152, 73792, 131093, 131107, 135168, 147520, 262170, 262275, 263168, 294919,
+ 294944, 299530, 315394, 393235, 430595, 524332, 524422, 524544, 524549, 589838, 589840, 589845, 655397,
+ 720903, 729092, 786502, 884740, 893958, 81, 134, 137, 164, 200, 261, 416, 515, 704, 2432, 4736, 18560, 18720,
+ 36992, 73856, 81920, 131097, 131139, 139264, 147584, 153601, 262186, 262214, 262403, 264192, 294923, 294950,
+ 294976, 294986, 295058, 295202, 295490, 296066, 297218, 299538, 304130, 313346, 368642, 393251, 442370,
+ 524340, 524550, 524800, 524805, 589861, 599300, 655429, 671745, 747525, 786566, 851982, 983047, 97, 138, 262,
+ 265, 274, 328, 400, 517, 832, 1027, 1408, 4864, 9472, 37120, 73984, 98304, 131113, 131141, 131203, 147712,
+ 149763, 157697, 262194, 262278, 262659, 266240, 294931, 295040, 299554, 364546, 393283, 524372, 524428,
+ 524806, 525312, 525317, 589862, 589888, 589893, 599188, 599556, 655493, 671751, 674053, 720909, 746501,
+ 786694, 819214, 851990, 860164, 889350, 950279, 56, 84, 98, 140, 161, 266, 273, 518, 521, 530, 548, 656, 800,
+ 1029, 1664, 2051, 2816, 9360, 9728, 18944, 74240, 131121, 131205, 131331, 147463, 147475, 147493, 147529,
+ 147601, 147745, 147968, 148033, 148609, 149765, 152065, 156673, 163840, 184321, 221185, 262226, 262282,
+ 262406, 263171, 270336, 294926, 294947, 295168, 299009, 393347, 411907, 425991, 430082, 524388, 524556,
+ 525318, 526336, 526341, 589852, 589894, 589952, 589957, 598017, 655621, 720917, 786950, 819222, 852006,
+ 884738, 884743, 933895, 162, 193, 268, 522, 529, 1030, 1033, 1042, 1060, 1096, 1312, 1600, 2053, 3328, 4099,
+ 5632, 19456, 37888, 74753, 131153, 131209, 131333, 131587, 147467, 148480, 149506, 149769, 182273, 196608,
+ 262242, 262410, 262662, 264195, 278528, 294934, 294979, 295424, 299650, 393475, 409607, 425995, 442369,
+ 448515, 524452, 524564, 524812, 526342, 528384, 528389, 589868, 589958, 590080, 590085, 598018, 598023,
+ 598048, 601092, 655877, 671757, 673796, 720933, 787462, 819238, 823300, 852038, 194, 276, 321, 524, 545,
+ 1034, 1041, 2054, 2057, 2066, 2084, 2120, 2192, 2624, 3200, 4101, 4680, 6656, 8195, 11264, 37377, 38912,
+ 74754, 75776, 131169, 131337, 131589, 132099, 147469, 149508, 149777, 215041, 262306, 262418, 262666, 263174,
+ 266243, 294938, 295043, 295936, 299012, 299024, 299778, 393731, 409611, 411650, 426003, 524484, 524820,
+ 525324, 528390, 532480, 532485, 589876, 590086, 590336, 590341, 603140, 656389, 671765, 720965, 737281,
+ 788486, 819270, 852102, 860162, 860167, 112, 168, 196, 280, 322, 385, 532, 546, 1036, 1057, 2058, 2065, 4102,
+ 4105, 4114, 4132, 4168, 4240, 4384, 5248, 6400, 8197, 13312, 16387, 18689, 22528, 37378, 74756, 77824,
+ 131233, 131345, 131593, 132101, 133123, 147477, 147491, 149512, 151552, 262338, 262674, 263178, 264198,
+ 270339, 294954, 294982, 295171, 296960, 299016, 327680, 336898, 394243, 409619, 426019, 430081, 446979,
+ 524612, 524836, 525332, 526348, 532486, 540672, 540677, 561668, 589908, 589964, 590342, 590848, 590853,
+ 598024, 598029, 598038, 598092, 598164, 598308, 598596, 599168, 599173, 600324, 602628, 616452, 634884,
+ 657413, 671781, 721029, 729089, 737287, 739589, 745476, 790534, 819334, 852230, 884750, 1015815, 386, 1089,
+ 4106, 8228, 8264, 8336, 10496, 131601, 147481, 147523, 149825, 155648, 262466, 263186, 295046, 300546,
+ 393216, 426051, 590092, 599174, 615428, 729095, 823303 };
+
+static int cbest_21[1023] = {
+ 1, 2, 1048578, 4, 524289, 8, 16, 1310722, 32, 64, 655361, 128, 256, 512, 1376258, 1024, 2048, 4096, 688129,
+ 8192, 16384, 32768, 65536, 1392642, 131072, 262144, 524288, 696321, 1048576, 3, 1048579, 5, 10, 20, 40, 80,
+ 160, 320, 640, 1280, 2560, 5120, 10240, 20480, 40960, 81920, 163840, 327680, 655360, 6, 9, 524291, 1048582,
+ 1310720, 1572867, 17, 524293, 1048586, 1310723, 1396738, 12, 18, 33, 524297, 1048594, 34, 65, 524305,
+ 1048610, 1310726, 1835011, 24, 36, 66, 129, 524321, 655363, 1048642, 1310730, 1703939, 68, 130, 257, 524353,
+ 655365, 688128, 1048706, 1310738, 1376256, 48, 72, 132, 258, 513, 344064, 524417, 655369, 698369, 1048834,
+ 1310754, 1376259, 136, 260, 514, 1025, 172032, 524545, 655377, 1049090, 1310786, 1966083, 96, 144, 264, 516,
+ 1026, 2049, 86016, 524801, 655393, 1049602, 1310850, 1376262, 1900547, 272, 520, 1028, 2050, 4097, 43008,
+ 525313, 655425, 1050626, 1310978, 1376266, 192, 288, 528, 1032, 2052, 4098, 8193, 21504, 526337, 655489,
+ 688131, 1052674, 1311234, 1376274, 1736707, 544, 1040, 2056, 4100, 8194, 10752, 16385, 528385, 655617,
+ 688133, 1056770, 1311746, 1376290, 384, 576, 1056, 2064, 4104, 5376, 8196, 16386, 32769, 532481, 655873,
+ 688137, 1064962, 1312770, 1376322, 1392640, 1397762, 2031619, 1088, 2080, 2688, 4112, 8200, 16388, 32770,
+ 65537, 540673, 656385, 688145, 696320, 1081346, 1314818, 1376386, 1392643, 1998851, 768, 1152, 1344, 2112,
+ 4128, 8208, 16392, 32772, 65538, 131073, 557057, 657409, 688161, 1114114, 1318914, 1376514, 672, 2176, 4160,
+ 8224, 16400, 32776, 65540, 131074, 262145, 348160, 589825, 659457, 688193, 1179650, 1327106, 1376770,
+ 1392646, 1916931, 336, 1536, 2304, 4224, 8256, 16416, 32784, 65544, 131076, 262146, 663553, 688257, 1343490,
+ 1377282, 1392650, 168, 4352, 8320, 16448, 32800, 65552, 131080, 174080, 262148, 524290, 671745, 688385,
+ 786433, 1378306, 1392658, 1572866, 84, 3072, 4608, 8448, 16512, 32832, 65568, 131088, 262152, 524292, 688641,
+ 696323, 698881, 1048577, 1380354, 1392674, 1441794, 1744899, 2064387, 42, 8704, 16640, 32896, 65600, 87040,
+ 131104, 262160, 524296, 689153, 696325, 720897, 1048584, 1384450, 1392706, 2048003, 11, 21, 41, 81, 161, 321,
+ 641, 1281, 2561, 5121, 6144, 9216, 10241, 16896, 20481, 33024, 40961, 65664, 81921, 131136, 163841, 262176,
+ 327681, 524304, 690177, 696329, 1048580, 1392770, 1572865, 1835010, 7, 22, 82, 162, 322, 642, 1282, 2562,
+ 5122, 10242, 17408, 20482, 33280, 40962, 43520, 65792, 81922, 131200, 163842, 262208, 327682, 524320, 655362,
+ 692225, 696337, 917505, 1048583, 1048598, 1048618, 1048658, 1048738, 1048898, 1049218, 1049858, 1051138,
+ 1053698, 1058818, 1069058, 1089538, 1130498, 1212418, 1310721, 1392898, 1396736, 1409026, 1703938, 2007043,
+ 14, 44, 164, 324, 644, 1284, 2564, 5124, 10244, 12288, 18432, 20484, 33792, 40964, 66048, 81924, 131328,
+ 163844, 262272, 327684, 524299, 524309, 524329, 524352, 524369, 524449, 524609, 524929, 525569, 526849,
+ 529409, 534529, 544769, 565249, 606209, 655364, 696353, 851969, 1048587, 1048592, 1310728, 1393154, 1396739,
+ 13, 19, 28, 88, 328, 648, 1288, 2568, 5128, 10248, 20488, 21760, 34816, 40968, 66560, 81928, 131584, 163848,
+ 262400, 327688, 524295, 524416, 655368, 696385, 698368, 704513, 1048595, 1048608, 1310724, 1393666, 1507330,
+ 1572871, 1835009, 26, 35, 56, 176, 656, 1296, 2576, 5136, 10256, 20496, 24576, 36864, 40976, 67584, 81936,
+ 132096, 163856, 262656, 327696, 524544, 655376, 696449, 1048590, 1048611, 1048640, 1310727, 1310742, 1310762,
+ 1310802, 1310882, 1311042, 1311362, 1312002, 1313282, 1315842, 1320962, 1331202, 1351682, 1394690, 1396742,
+ 1474562, 1572875, 1703937, 1921027, 1966082, 25, 37, 52, 67, 112, 352, 1312, 2592, 5152, 10272, 10880, 20512,
+ 40992, 69632, 81952, 133120, 163872, 263168, 327712, 524301, 524307, 524800, 655392, 696577, 753665, 1048643,
+ 1048704, 1310731, 1310736, 1396746, 1398018, 1572883, 1900546, 2080771, 38, 69, 74, 104, 131, 224, 704, 2624,
+ 5184, 10304, 20544, 41024, 49152, 73728, 81984, 135168, 163904, 264192, 327744, 349184, 524323, 525312,
+ 655371, 655381, 655401, 655424, 655441, 655521, 655681, 656001, 656641, 657921, 660481, 665601, 675841,
+ 696833, 737281, 983041, 1048602, 1048614, 1048707, 1048832, 1310739, 1310752, 1376257, 1396754, 1400834,
+ 1572899, 1835015, 2072579, 49, 70, 73, 133, 138, 148, 208, 259, 448, 1408, 5248, 5440, 10368, 20608, 41088,
+ 82048, 139264, 163968, 266240, 327808, 344065, 524313, 524325, 524355, 526336, 655367, 655488, 688130,
+ 697345, 950273, 1048646, 1048835, 1049088, 1310734, 1310755, 1310784, 1376264, 1396770, 1572931, 1703943,
+ 1736706, 1835019, 1966081, 50, 134, 137, 261, 266, 276, 296, 416, 515, 896, 2816, 10496, 20736, 41216, 82176,
+ 98304, 147456, 164096, 172033, 270336, 327936, 344066, 524357, 524419, 528384, 655616, 688132, 698371,
+ 1048626, 1048650, 1048710, 1049091, 1049600, 1310787, 1310848, 1376260, 1396802, 1425410, 1572995, 1703947,
+ 1746947, 1835027, 1900545, 2052099, 76, 97, 145, 262, 265, 517, 522, 532, 552, 592, 832, 1027, 1792, 2720,
+ 5632, 20992, 41472, 82432, 86017, 164352, 172034, 174592, 278528, 328192, 344068, 524337, 524361, 524421,
+ 524547, 532480, 655373, 655379, 655872, 688136, 698373, 700417, 868353, 1048714, 1048838, 1049603, 1050624,
+ 1220610, 1310746, 1310758, 1310851, 1310976, 1376263, 1376278, 1376298, 1376338, 1376418, 1376578, 1376898,
+ 1377538, 1378818, 1381378, 1386498, 1396866, 1417218, 1540098, 1573123, 1703955, 1835043, 2031618, 98, 140,
+ 146, 273, 518, 521, 1029, 1034, 1044, 1064, 1104, 1184, 1664, 2051, 3584, 11264, 41984, 43009, 82944, 86018,
+ 164864, 172036, 196608, 294912, 328704, 344072, 524425, 524549, 524803, 540672, 655395, 656384, 688144,
+ 698377, 1048674, 1048722, 1048842, 1049094, 1050627, 1052672, 1134594, 1310790, 1310979, 1311232, 1376267,
+ 1376272, 1396994, 1523714, 1573379, 1703971, 1736705, 1835075, 1966087, 1998850, 100, 193, 268, 274, 289,
+ 529, 1030, 1033, 1360, 2053, 2058, 2068, 2088, 2128, 2208, 2368, 3328, 4099, 7168, 21505, 22528, 43010,
+ 83968, 86020, 165888, 172040, 329728, 344080, 524385, 524433, 524553, 524805, 525315, 557056, 610305, 655385,
+ 655397, 655427, 657408, 688160, 698385, 712705, 1048850, 1049098, 1049606, 1052675, 1056768, 1091586,
+ 1310770, 1310794, 1310854, 1311235, 1311744, 1376275, 1376288, 1397250, 1573891, 1704003, 1835139, 1900551,
+ 1966091, 2009091, 152, 194, 290, 524, 530, 545, 1041, 2054, 2057, 4101, 4106, 4116, 4136, 4176, 4256, 4416,
+ 4736, 6656, 8195, 10753, 14336, 21506, 43012, 45056, 86024, 87296, 167936, 172048, 331776, 344096, 393216,
+ 524561, 524809, 525317, 526339, 567297, 589824, 655429, 655491, 659456, 688139, 688149, 688169, 688192,
+ 688209, 688289, 688449, 688769, 689409, 690689, 693249, 698401, 699009, 708609, 770049, 1015809, 1048770,
+ 1048866, 1049106, 1049610, 1050630, 1056771, 1064960, 1070082, 1310858, 1310982, 1311747, 1312768, 1376270,
+ 1376291, 1376320, 1397760, 1482754, 1574915, 1704067, 1835267, 1900555, 1916930, 1966099, 2031617, 196, 280,
+ 292, 385, 546, 577, 680, 1036, 1042, 1057, 2065, 4102, 4105, 5377, 8197, 8202, 8212, 8232, 8272, 8352, 8512,
+ 8832, 9472, 10754, 13312, 16387, 21508, 28672, 43016, 86032, 90112, 172064, 335872, 344128, 524481, 524577,
+ 524817, 525321, 526341, 528387, 545793, 655409, 655433, 655493, 655619, 663552, 688135, 688256, 698433,
+ 761857, 999425, 1049122, 1049618, 1050634, 1052678, 1059330, 1064963, 1081344, 1310818, 1310866, 1310986,
+ 1311238, 1312771, 1314816, 1376323, 1376384, 1392641, 1397763, 1398786, 1576963, 1704195, 1736711, 1835523,
+ 1900563, 1966115, 1998849, 2088963, 200, 386, 548, 1058, 16394, 16424, 16544, 17664, 18944, 57344, 180224,
+ 535041, 671744, 698497, 1052682, 1056774, 1353730, 1392648, 1836035 };
+
+static int cbest_22[1023] = {
+ 1, 2, 2097153, 4, 8, 3145729, 16, 32, 64, 3670017, 128, 256, 512, 1024, 3932161, 2048, 4096, 8192, 16384,
+ 32768, 4063233, 65536, 131072, 262144, 524288, 1048576, 2097152, 4128769, 3, 6, 12, 24, 48, 96, 192, 384,
+ 768, 1536, 3072, 6144, 12288, 24576, 49152, 98304, 196608, 393216, 786432, 1572864, 3145728, 5, 2097155, 9,
+ 2097157, 10, 17, 2097161, 3145731, 3670016, 18, 33, 1835008, 2097169, 3145733, 20, 34, 65, 917504, 2097185,
+ 3145737, 4161537, 36, 66, 129, 458752, 2097217, 3145745, 3670019, 40, 68, 130, 257, 229376, 2097281, 3145761,
+ 3670021, 3932160, 72, 132, 258, 513, 114688, 2097409, 3145793, 3670025, 80, 136, 260, 514, 1025, 57344,
+ 1966080, 2097665, 3145857, 3670033, 144, 264, 516, 1026, 2049, 28672, 2098177, 3145985, 3670049, 3932163,
+ 160, 272, 520, 1028, 2050, 4097, 14336, 983040, 2099201, 3146241, 3670081, 3932165, 288, 528, 1032, 2052,
+ 4098, 7168, 8193, 2101249, 3146753, 3670145, 3932169, 4063232, 320, 544, 1040, 2056, 3584, 4100, 8194, 16385,
+ 491520, 2105345, 3147777, 3670273, 3932177, 4177921, 576, 1056, 1792, 2064, 4104, 8196, 16386, 32769,
+ 2113537, 3149825, 3670529, 3932193, 640, 896, 1088, 2080, 4112, 8200, 16388, 32770, 65537, 245760, 2031616,
+ 2129921, 3153921, 3671041, 3932225, 4063235, 448, 1152, 2112, 4128, 8208, 16392, 32772, 65538, 131073,
+ 2162689, 3162113, 3672065, 3932289, 4063237, 224, 1280, 2176, 4160, 8224, 16400, 32776, 65540, 122880,
+ 131074, 262145, 2228225, 3178497, 3674113, 3932417, 4063241, 112, 2304, 4224, 8256, 16416, 32784, 65544,
+ 131076, 262146, 524289, 1015808, 2359297, 3211265, 3678209, 3932673, 4063249, 4128768, 56, 2560, 4352, 8320,
+ 16448, 32800, 61440, 65552, 131080, 262148, 524290, 1048577, 2621441, 3276801, 3686401, 3933185, 4063265, 28,
+ 4608, 8448, 16512, 32832, 65568, 131088, 262152, 524292, 1048578, 3407873, 3702785, 3934209, 4063297, 14,
+ 5120, 8704, 16640, 30720, 32896, 65600, 131104, 262160, 507904, 524296, 1048580, 2097154, 3735553, 3936257,
+ 4063361, 4128771, 7, 13, 25, 49, 97, 193, 385, 769, 1537, 3073, 6145, 9216, 12289, 16896, 24577, 33024,
+ 49153, 65664, 98305, 131136, 196609, 262176, 393217, 524304, 786433, 1048584, 1572865, 2064384, 2097156,
+ 3801089, 3940353, 4063489, 4128773, 4186113, 26, 50, 98, 194, 386, 770, 1538, 3074, 6146, 10240, 12290,
+ 15360, 17408, 24578, 33280, 49154, 65792, 98306, 131200, 196610, 262208, 393218, 524320, 786434, 1048592,
+ 1572866, 2097159, 2097160, 2097165, 2097177, 2097201, 2097249, 2097345, 2097537, 2097921, 2098689, 2100225,
+ 2103297, 2109441, 2121729, 2146305, 2195457, 2293761, 2490369, 2883585, 3145730, 3948545, 4063745, 4128777,
+ 52, 100, 196, 388, 772, 1540, 3076, 6148, 12292, 18432, 24580, 33792, 49156, 66048, 98308, 131328, 196612,
+ 253952, 262272, 393220, 524352, 786436, 1048608, 1572868, 2097168, 3145732, 3964929, 4064257, 4128785, 11,
+ 104, 200, 392, 776, 1544, 3080, 6152, 7680, 12296, 20480, 24584, 34816, 49160, 66560, 98312, 131584, 196616,
+ 262400, 393224, 524416, 786440, 1048640, 1572872, 2097184, 3145735, 3145736, 3145741, 3145753, 3145777,
+ 3145825, 3145921, 3146113, 3146497, 3147265, 3148801, 3151873, 3158017, 3170305, 3194881, 3244033, 3342337,
+ 3538945, 3997697, 4065281, 4128801, 4161536, 19, 22, 208, 400, 784, 1552, 3088, 6160, 12304, 24592, 36864,
+ 49168, 67584, 98320, 132096, 196624, 262656, 393232, 524544, 786448, 1032192, 1048704, 1572880, 1835009,
+ 2097163, 2097216, 3145744, 3670018, 4067329, 4128833, 21, 35, 38, 44, 416, 800, 1568, 3104, 3840, 6176,
+ 12320, 24608, 40960, 49184, 69632, 98336, 126976, 133120, 196640, 263168, 393248, 524800, 786464, 917505,
+ 1048832, 1572896, 1835010, 2097171, 2097280, 3145760, 3670020, 4071425, 4128897, 37, 67, 70, 76, 88, 832,
+ 1600, 3136, 6208, 12352, 24640, 49216, 73728, 98368, 135168, 196672, 264192, 393280, 458753, 525312, 786496,
+ 917506, 1049088, 1572928, 1835012, 2097173, 2097187, 2097408, 3014657, 3145739, 3145792, 3670023, 3670024,
+ 3670029, 3670041, 3670065, 3670113, 3670209, 3670401, 3670785, 3671553, 3673089, 3676161, 3682305, 3694593,
+ 3719169, 3768321, 3866625, 4079617, 4129025, 4161539, 41, 69, 131, 134, 140, 152, 176, 1664, 1920, 3200,
+ 6272, 12416, 24704, 49280, 81920, 98432, 139264, 196736, 229377, 266240, 393344, 458754, 526336, 786560,
+ 917508, 1049600, 1572992, 1835016, 2097189, 2097219, 2097664, 2555905, 3145747, 3145856, 3670032, 4096001,
+ 4129281, 4161541, 42, 73, 133, 259, 262, 268, 280, 304, 352, 3328, 6400, 12544, 24832, 49408, 63488, 98560,
+ 114689, 147456, 196864, 229378, 270336, 393472, 458756, 516096, 528384, 786688, 917512, 1050624, 1573120,
+ 1835024, 2080768, 2097193, 2097221, 2097283, 2098176, 2326529, 3145749, 3145763, 3145984, 3670048, 3932162,
+ 4129793, 4161545, 74, 81, 137, 261, 515, 518, 524, 536, 560, 608, 704, 960, 6656, 12800, 25088, 49664, 57345,
+ 98816, 114690, 163840, 197120, 229380, 278528, 393728, 458760, 532480, 786944, 917520, 1052672, 1573376,
+ 1835040, 1966081, 2097225, 2097285, 2097411, 2099200, 2211841, 3145765, 3145795, 3146240, 3604481, 3670027,
+ 3670080, 3932164, 4130817, 4161553, 82, 138, 145, 265, 517, 1027, 1030, 1036, 1048, 1072, 1120, 1216, 1408,
+ 13312, 25600, 28673, 50176, 57346, 99328, 114692, 197632, 229384, 294912, 394240, 458768, 540672, 787456,
+ 917536, 1056768, 1573888, 1835072, 1966082, 2097233, 2097289, 2097413, 2097667, 2101248, 2154497, 3145769,
+ 3145797, 3145859, 3146752, 3375105, 3670035, 3670144, 3932167, 3932168, 3932173, 3932185, 3932209, 3932257,
+ 3932353, 3932545, 3932929, 3933697, 3935233, 3938305, 3944449, 3956737, 3981313, 4030465, 4132865, 4161569,
+ 4190209, 84, 146, 161, 266, 273, 480, 521, 1029, 2051, 2054, 2060, 2072, 2096, 2144, 2240, 2432, 2816, 14337,
+ 26624, 28674, 31744, 51200, 57348, 100352, 114696, 198656, 229392, 327680, 395264, 458784, 557056, 788480,
+ 917568, 983041, 1064960, 1574912, 1835136, 1966084, 2097297, 2097417, 2097669, 2098179, 2105344, 2125825,
+ 3145801, 3145861, 3145987, 3147776, 3260417, 3670037, 3670051, 3670272, 3932176, 4136961, 4161601, 4177920,
+ 148, 162, 274, 289, 522, 529, 1033, 2053, 4099, 4102, 4108, 4120, 4144, 4192, 4288, 4480, 4864, 5632, 7169,
+ 14338, 28676, 53248, 57352, 102400, 114704, 200704, 229408, 258048, 397312, 458816, 589824, 790528, 917632,
+ 983042, 1081344, 1576960, 1835264, 1966088, 2097313, 2097425, 2097673, 2098181, 2099203, 2111489, 2113536,
+ 3080193, 3145809, 3145865, 3145989, 3146243, 3149824, 3203073, 3670053, 3670083, 3670528, 3932192, 4145153,
+ 4161665, 164, 240, 276, 290, 321, 530, 545, 1034, 1041, 2057, 3585, 4101, 7170, 8195, 8198, 8204, 8216, 8240,
+ 8288, 8384, 8576, 8960, 9728, 11264, 14340, 28680, 57360, 106496, 114720, 204800, 229440, 401408, 458880,
+ 491521, 655360, 794624, 917760, 983044, 1040384, 1114112, 1581056, 1835520, 1966096, 2097441, 2097681,
+ 2098185, 2099205, 2101251, 2104321, 2129920, 3145873, 3145993, 3146245, 3146755, 3153920, 3174401, 3670057,
+ 3670085, 3670147, 3671040, 3899393, 3932171, 3932224, 4063234, 4161793, 168, 292, 322, 532, 546, 577, 1042,
+ 1057, 1793, 2058, 2065, 3586, 4105, 7172, 8197, 14344, 15872, 16387, 16390, 16396, 16408, 16432, 16480,
+ 16576, 16768, 17152, 17920, 19456, 22528, 28688, 57376, 114752, 212992, 229504, 409600, 459008, 491522,
+ 802816, 918016, 983048, 1179648, 1589248, 1836032, 1966112, 2097473, 2097697, 2098193, 2099209, 2100737,
+ 2101253, 2105347, 2162688, 2588673, 3145889, 3146001, 3146249, 3146757, 3147779, 3160065, 3162112, 3670089,
+ 3670149, 3670275, 3672064, 3784705, 3932179, 3932288, 4063236, 4162049, 4177923, 120, 296, 897, 1058, 1089,
+ 2081, 4106, 4113, 8201, 32774, 32780, 33536, 34304, 45056, 491524, 983056, 1605632, 1966144, 2031617,
+ 2098209, 2098945, 2099217, 2228224, 3146257, 3146761, 3178496, 3670153, 3670277, 3670531, 3932181, 3932416,
+ 4063329, 4063425, 4069377, 4075521 };
+
+static int cbest_23[1023] = {
+ 1, 2, 4194320, 4, 2097160, 8, 1048580, 16, 524290, 32, 262145, 64, 128, 4325392, 256, 512, 2162696, 1024,
+ 2048, 1081348, 4096, 8192, 540674, 16384, 32768, 270337, 65536, 131072, 262144, 4329488, 524288, 2164744,
+ 1048576, 3, 1082372, 2097152, 4194321, 5, 2097161, 4194322, 6, 9, 1048581, 2097162, 4194304, 4194324,
+ 6291480, 10, 17, 524291, 541186, 1048582, 2097164, 4194328, 5242900, 12, 18, 33, 66, 132, 264, 528, 1056,
+ 2112, 4224, 8448, 16896, 33792, 67584, 135168, 270336, 3145740, 4718610, 20, 34, 65, 262147, 524294, 540672,
+ 1048588, 2097176, 2621450, 4194352, 4456465, 24, 36, 129, 262149, 270593, 524298, 1048596, 1081344, 1572870,
+ 2097192, 2359305, 4194384, 4325393, 40, 68, 130, 257, 262153, 524306, 1048612, 1310725, 2097224, 2162688,
+ 4194448, 4325394, 48, 72, 258, 513, 262161, 524322, 786435, 1048644, 2097288, 2162697, 4194576, 4325376,
+ 4325396, 6422552, 80, 136, 260, 514, 1025, 262177, 524354, 1048708, 2097416, 2162698, 4194832, 4325400,
+ 5373972, 6357016, 96, 144, 516, 1026, 2049, 262209, 524418, 1048836, 1081349, 2097672, 2162700, 4195344,
+ 4329616, 4849682, 160, 272, 520, 1028, 2050, 4097, 262273, 524546, 1049092, 1081350, 2098184, 3211276,
+ 4196368, 4325424, 4587537, 5275668, 192, 288, 1032, 2052, 4098, 8193, 262401, 524802, 540675, 1049604,
+ 2099208, 2162712, 2686986, 3178508, 4198416, 4325456, 320, 544, 1040, 2056, 4100, 8194, 16385, 262657,
+ 525314, 1050628, 1081356, 2101256, 2162728, 2424841, 4202512, 4325520, 4734994, 384, 576, 2064, 4104, 8196,
+ 16386, 32769, 263169, 526338, 540678, 1052676, 1081364, 1605638, 2105352, 2162760, 2164808, 2637834, 4210704,
+ 4325648, 640, 1088, 2080, 4112, 8200, 16388, 32770, 65537, 264193, 270339, 528386, 540682, 1056772, 1081380,
+ 1343493, 1589254, 2113544, 2162824, 4227088, 4325904, 4464657, 6488088, 768, 1152, 4128, 8208, 16392, 32772,
+ 65538, 131073, 266241, 270341, 532482, 540690, 1064964, 1081412, 2129928, 2162952, 2367497, 4259856, 4326416,
+ 1280, 2176, 4160, 8224, 16400, 32776, 65540, 131074, 270345, 540706, 802819, 1081476, 1318917, 2163208,
+ 4327440, 4329489, 5406740, 1536, 2304, 8256, 16416, 32784, 65544, 131076, 262146, 270353, 278529, 540738,
+ 557058, 794627, 1081604, 1082404, 1114116, 2163720, 2228232, 4329490, 4456464, 2560, 4352, 8320, 16448,
+ 32800, 65552, 131080, 262148, 270369, 270592, 294913, 524289, 540802, 541184, 589826, 1081860, 1082368,
+ 1179652, 2164736, 2359304, 3244044, 4329472, 4329492, 4333584, 4866066, 6426648, 3072, 4608, 16512, 32832,
+ 65568, 131088, 135296, 262152, 270401, 327681, 540930, 655362, 1310724, 2164745, 2166792, 4329496, 4341776,
+ 4718608, 5378068, 5120, 8704, 16640, 32896, 65600, 67648, 131104, 262160, 270465, 393217, 524292, 786434,
+ 1048577, 1083396, 2164746, 2170888, 2621448, 2703370, 4358160, 4595729, 4853778, 6359064, 6144, 9216, 33024,
+ 33824, 65664, 131136, 262176, 524296, 541202, 541698, 1048578, 1085444, 1572868, 2164748, 2179080, 4329520,
+ 4390928, 4591633, 5242896, 10240, 16912, 17408, 33280, 65792, 131200, 262208, 270849, 524304, 524352, 542722,
+ 1082373, 1089540, 1622022, 2097153, 2195464, 2433033, 3145736, 3213324, 4329552, 8456, 12288, 18432, 66048,
+ 131328, 262272, 271361, 524320, 544770, 786433, 1048584, 1048704, 1082374, 1097732, 2097154, 2164760,
+ 2689034, 4194323, 4587536, 5276692, 6291472, 7, 4228, 20480, 34816, 66560, 131584, 262400, 272385, 548866,
+ 1048592, 1351685, 1572866, 2097156, 2097163, 2097408, 2164776, 2293768, 2426889, 3179532, 4194305, 4194325,
+ 4329744, 6291481, 11, 2114, 24576, 36864, 132096, 262656, 270601, 274433, 524416, 541187, 1048583, 1048608,
+ 1082380, 1146884, 1310721, 2097165, 2424840, 3145732, 4194306, 4194326, 4194329, 4194816, 4330000, 4849680,
+ 5242901, 6291482, 6492184, 13, 19, 67, 133, 265, 529, 1057, 2113, 4225, 8449, 16897, 33793, 40960, 67585,
+ 69632, 133120, 135169, 263168, 524544, 573442, 811011, 1048640, 1082388, 1212420, 1606662, 2097166, 2097168,
+ 2164872, 2621442, 3145741, 4194308, 4194330, 4330512, 4718611, 4735506, 5242902, 6291464, 6291484, 6490136,
+ 14, 21, 35, 134, 266, 530, 1058, 4226, 8450, 16898, 33794, 49152, 67586, 73728, 135170, 264192, 270338,
+ 286721, 524295, 524800, 540673, 541190, 606210, 1048589, 1343492, 1344517, 2097177, 2097184, 2165000,
+ 2359297, 2621451, 2638346, 2686984, 3145742, 4194312, 4194332, 4194353, 4194386, 4194452, 4194584, 4195376,
+ 4196432, 4198544, 4202768, 4211216, 4228112, 4261904, 4331536, 4464656, 5242884, 5373968, 5410836, 7340060,
+ 22, 25, 37, 70, 268, 532, 1060, 2116, 8452, 16900, 33796, 67588, 81920, 135172, 139264, 266240, 270340,
+ 303105, 524299, 525312, 541194, 671746, 1048590, 1048597, 1048832, 1081345, 1082436, 1572871, 1589766,
+ 2097178, 2097193, 2097216, 2097226, 2097292, 2097688, 2098216, 2099272, 2101384, 2105608, 2114056, 2130952,
+ 2165256, 2232328, 2367496, 4194354, 4194385, 4456467, 4718594, 4718614, 4734992, 5242908, 6815770, 26, 38,
+ 41, 69, 74, 131, 140, 536, 1064, 2120, 4232, 16904, 33800, 67592, 98304, 135176, 147456, 262151, 270344,
+ 270595, 335873, 524307, 526336, 540676, 802818, 1048598, 1048613, 1048646, 1048844, 1049088, 1049108,
+ 1049636, 1050692, 1052804, 1057028, 1065476, 1081346, 1082500, 1116164, 1183748, 1318916, 1605636, 2097180,
+ 2097194, 2097225, 2097280, 2162689, 2165768, 2359307, 2621454, 2637832, 3211272, 4194336, 4194356, 4194449,
+ 4325395, 4337680, 4456449, 4456469, 4464913, 4718618, 4870162, 5275664, 5407764, 5767190, 6291512, 6422544,
+ 6553625, 28, 42, 49, 73, 82, 148, 259, 280, 1072, 2128, 4240, 8464, 33808, 67600, 135184, 163840, 262155,
+ 270352, 270597, 278528, 401409, 524302, 524323, 524422, 524554, 524818, 525346, 526402, 528384, 528514,
+ 532738, 540680, 541218, 558082, 591874, 659458, 794626, 803331, 1048614, 1048645, 1049600, 1082628, 1310727,
+ 1589252, 2097196, 2097289, 2162690, 2359309, 2367753, 3145756, 3178504, 3246092, 3670030, 4194360, 4194368,
+ 4194388, 4194450, 4194577, 4325377, 4325397, 4329620, 4345872, 4456473, 5242932, 5505045, 6291544, 6357008,
+ 6422553, 44, 50, 81, 98, 137, 164, 261, 296, 515, 560, 2144, 4256, 8480, 16928, 67616, 135200, 196608,
+ 262157, 262163, 262211, 262277, 262409, 262673, 263201, 264257, 266369, 270368, 279041, 294912, 295937,
+ 329729, 397313, 524310, 524355, 532480, 540688, 540736, 541250, 1048604, 1048709, 1050624, 1081352, 1081472,
+ 1082884, 1319173, 1572878, 2097228, 2097290, 2097417, 2097664, 2162692, 2162699, 2162944, 2168840, 2621466,
+ 3145772, 3245068, 3407885, 4194392, 4194432, 4194578, 4194833, 4325378, 4325398, 4325401, 4325888, 4362256,
+ 4599825, 4718642, 4980755, 5242964, 5373973, 6291608, 6357017, 6422554, 52, 76, 97, 138, 145, 196, 262, 328,
+ 517, 592, 1027, 1120, 4288, 8512, 16960, 33856, 135232, 262165, 262179, 270400, 270609, 327680, 524314,
+ 524326, 524419, 540704, 541314, 786439, 794883, 802817, 1048620, 1048710, 1048837, 1052672, 1081360, 1310733,
+ 1572886, 1605634, 2097208, 2097418, 2097673, 2098176, 2162701, 2172936, 2359321, 2621482, 2705418, 2883595,
+ 3145804, 3211268, 4194456, 4194560, 4194580, 4194834, 4195345, 4325380, 4325402, 4329617, 4395024, 4456497,
+ 4718674, 4849683, 4866578, 5243028, 5373974, 6291736, 6357018, 6422536, 6422556, 56, 84, 146, 161, 194, 273,
+ 392, 518, 521, 656, 1029, 1184, 2051, 2240, 8576, 17024, 33920, 67712, 262169, 262181, 270464, 270625,
+ 393216, 524330, 524358, 524547, 541442, 557056, 786443, 794625, 1048628, 1048652, 1048838, 1049093, 1056768,
+ 1081351, 1081376, 1084420, 1310741, 1343489, 1572902, 1589250, 1835015, 2097240, 2097420, 2097674, 2098185,
+ 2099200, 2162702, 2162704, 2181128, 2359337, 2621514, 2686978, 3145868, 3178500, 3211277, 4194416, 4194836,
+ 4195346, 4196369, 4325384, 4325404, 4325425, 4325458, 4325524, 4325656, 4326448, 4327504, 4329618, 4333840,
+ 4342288, 4359184, 4392976, 4456529, 4595728, 4718738, 5243156, 5275669, 5373956, 6291992, 6357000, 6357020,
+ 7471132, 162, 193, 289, 1030, 1312, 2053, 4099, 34048, 262185, 262213, 524338, 524362, 524803, 786451,
+ 1048716, 1049605, 1064960, 1086468, 1310757, 2097256, 2097304, 2097676, 2098186, 2099209, 2101248, 2162713,
+ 2424833, 2621578, 2703882, 3145996, 3178509, 4194480, 4198417, 4325426, 4456593, 4587539, 4718866, 4849666,
+ 6292504, 6426776, 7405596 };
+
+static int cbest_24[1023] = {
+ 1, 2, 8388675, 4, 12583010, 8, 6291505, 16, 32, 64, 11534427, 128, 256, 8388674, 4194337, 14155886, 3, 512,
+ 8388673, 6, 12583011, 5, 1024, 10485843, 12, 6291504, 12583008, 9, 24, 2048, 3145752, 8388679, 10, 7077943,
+ 14680178, 48, 1572876, 13631594, 17, 4096, 8388683, 96, 786438, 12583014, 20, 6291507, 18, 33, 192, 8192,
+ 393219, 7340089, 8388691, 6815797, 11534426, 12583018, 384, 131073, 6291489, 6291509, 11927640, 34, 40, 65,
+ 16384, 262146, 8388707, 8585282, 12582978, 66, 768, 524292, 8388611, 8454211, 11534419, 11534425, 12583026,
+ 1048584, 6291513, 36, 129, 1536, 32768, 2097168, 4292641, 5767213, 80, 130, 4194336, 6291473, 11796569,
+ 12615778, 14155882, 132, 3072, 12058719, 12582946, 68, 257, 65536, 5963820, 8388803, 11534411, 11534431,
+ 14155887, 258, 6144, 196609, 393218, 6307889, 7077941, 160, 260, 8388672, 72, 264, 513, 12288, 131072,
+ 8388931, 10534995, 11272277, 14155878, 14155884, 786436, 8486979, 10485842, 12583138, 7, 516, 24576, 4194339,
+ 6291569, 11927641, 136, 320, 514, 520, 1025, 393217, 5242921, 7077939, 7077942, 8388677, 8389187, 8585283,
+ 12583009, 14418028, 13, 528, 49152, 1572872, 2981910, 8388678, 10485841, 11542619, 12583266, 14286959,
+ 15466612, 14, 4194341, 6291633, 12632162, 14680179, 25, 144, 1026, 1032, 2049, 98304, 262144, 786434,
+ 3145753, 8389699, 11534459, 13631595, 11, 26, 640, 1040, 3145744, 6291506, 8388682, 8388687, 11534363,
+ 11927642, 12583012, 12583522, 12681314, 13656170, 14024809, 28, 49, 1056, 196608, 1572877, 4194345, 6291761,
+ 7340088, 8388681, 11010135, 12583015, 14680176, 272, 1028, 2050, 4097, 1572868, 3145754, 3538971, 6291457,
+ 6316081, 6815793, 6815796, 8388699, 8390723, 10485827, 10485847, 13631586, 13631592, 22, 52, 97, 2064,
+ 786439, 4194305, 6291488, 6291508, 7209014, 8388690, 11534403, 12584034, 14155854, 21, 50, 56, 98, 1280,
+ 2080, 1490955, 1572878, 3670044, 4194353, 5963821, 6291497, 6292017, 6340657, 7733306, 8388643, 8388723,
+ 8781888, 9175109, 9961551, 12582914, 12582994, 12583019, 12583022, 14155902, 14159982, 15728762, 19, 193,
+ 288, 2052, 2112, 4098, 8193, 524288, 3145736, 3145756, 3407898, 4390944, 8392771, 10485851, 11534555,
+ 11796571, 12583016, 13893736, 14155874, 14352495, 15204470, 15532148, 44, 104, 393216, 2195472, 6291511,
+ 6291512, 7077927, 7340081, 8388689, 8388706, 8388739, 12582979, 12583034, 12585058, 14680162, 14680182, 112,
+ 196, 385, 4128, 1097736, 1835022, 4227105, 4292640, 5767209, 5767212, 6292529, 6828085, 7077937, 8388610,
+ 8388695, 8454210, 11534418, 11534423, 11534424, 12583027, 13631598, 35, 41, 194, 544, 2056, 2560, 4100, 4160,
+ 8194, 16385, 262147, 548868, 1703949, 2146320, 4259873, 6291472, 6291517, 7340091, 8388609, 8396867, 8519746,
+ 11534417, 11534683, 11927632, 12976225, 13369444, 14155822, 42, 67, 88, 100, 208, 769, 4224, 274434, 524293,
+ 1073160, 6291493, 6815799, 6946868, 7077951, 8388705, 8585280, 8650817, 10158158, 11546715, 12058715,
+ 12582976, 12587106, 14155880, 14286958, 14680186, 38, 134, 224, 386, 392, 770, 131075, 137217, 536580,
+ 786432, 917511, 1048585, 2883606, 3604507, 4194401, 5963816, 6291491, 6291537, 6293553, 7864381, 8389059,
+ 8454209, 10502227, 11272279, 11796568, 12583024, 12583074, 12615779, 14155883, 15401079, 37, 70, 268, 576,
+ 1537, 4104, 8196, 8256, 16386, 32769, 268290, 393223, 1048576, 2097169, 3866653, 6291515, 6307888, 7077911,
+ 7077940, 7079991, 7340073, 7340093, 7602235, 8388715, 8405059, 10485875, 10518611, 11534429, 11534939,
+ 11559003, 12058718, 12582947, 12582986, 12583030, 14680146, 81, 131, 176, 416, 536, 5120, 8320, 134145,
+ 524294, 786446, 2981908, 3153944, 3538970, 5767205, 6684722, 6815781, 8388802, 8389443, 8847424, 8912967,
+ 9134150, 9240645, 10485779, 11370581, 11534410, 11534430, 11927643, 12582944, 12591202, 12714083, 13631562,
+ 14024808, 82, 133, 262, 388, 448, 784, 1072, 1540, 3073, 8448, 131077, 1048586, 1441803, 1572892, 1576972,
+ 3473434, 4194465, 6295601, 7766074, 8388627, 8388801, 9437259, 11272273, 11534443, 11796561, 11927624,
+ 11927644, 12615776, 12845152, 13631610, 14156014, 14418030, 69, 84, 140, 200, 524, 1088, 1538, 2144, 4112,
+ 8200, 16388, 32770, 65537, 262150, 393227, 788486, 1490954, 1572864, 2097170, 3145784, 4292643, 4423712,
+ 5767215, 5898284, 6422576, 8388711, 8390211, 8421443, 8457283, 8585286, 11535451, 12058711, 12058717,
+ 12582982, 12583394, 13639786, 14696562, 15466614, 15564916, 76, 259, 352, 772, 832, 1048, 4288, 6145, 16512,
+ 394243, 786454, 3211288, 4194338, 5963812, 6291475, 6291481, 6291568, 6815805, 8388615, 8388930, 8454215,
+ 10534994, 11272276, 11534409, 11802713, 12599394, 13647978, 14155879, 14155885, 14876787, 74, 161, 261, 266,
+ 896, 1568, 2096, 3074, 3080, 8576, 10240, 16640, 131081, 655365, 1048588, 1310730, 1572908, 1605644, 1769485,
+ 2211856, 2621460, 3342361, 4194593, 5079079, 5242920, 6291697, 6299697, 7077938, 7143479, 7340057, 8388807,
+ 8388929, 8391747, 8455747, 8486978, 8781889, 11534415, 12583139, 12583778, 13107302, 14155876, 14156142,
+ 14418024, 14680114, 14745714, 73, 162, 265, 280, 1152, 4192, 8208, 12289, 16392, 16896, 17152, 32772, 65538,
+ 132097, 262154, 393235, 802822, 1736717, 2097152, 2097172, 2981906, 3145816, 3538969, 4243489, 4292645,
+ 5963822, 6029359, 6291553, 6815765, 7012404, 7209015, 8585290, 8585794, 8716353, 9109574, 10485971, 11536475,
+ 11928664, 12320861, 12582962, 12583136, 13008993, 13631530, 14024811, 14155894, 14162030, 14286955, 14352494,
+ 15466608, 164, 518, 704, 776, 1664, 6146, 8384, 34304, 133121, 196611, 401411, 524300, 786437, 786470,
+ 1105928, 1703948, 2949142, 3145728, 4194340, 6291477, 6291632, 6291889, 6307891, 6819893, 7078007, 7209012,
+ 7348281, 7733307, 8388619, 8388676, 8389186, 8394819, 8454219, 8454723, 10190926, 10534993, 11534451,
+ 11796553, 11796573, 12189790, 12584546, 12615782, 14418029, 138, 168, 400, 517, 532, 1792, 3076, 3136, 6160,
+ 16768, 24577, 33024, 68608, 131089, 135169, 264194, 745477, 851974, 1490953, 1572940, 4194849, 4292897,
+ 4390945, 5267497, 5636138, 6553651, 6823989, 7733304, 8388835, 8388935, 8455235, 8486977, 10321997, 10485840,
+ 11534395, 11542618, 12058703, 12582950, 12583106, 12583267, 12617314, 14156398, 14168174, 15466613, 16089208,
+ 137, 152, 321, 515, 521, 560, 1544, 2176, 8224, 12290, 16400, 20480, 32776, 33280, 33536, 65540, 131074,
+ 137216, 262162, 266242, 393251, 425987, 552964, 917510, 1572874, 2097176, 3145880, 3407896, 3604506, 4292649,
+ 4567075, 6291571, 6292273, 7372857, 8388939, 8400963, 8456259, 8519747, 8585298, 8589378, 10486099, 11538523,
+ 11927633, 12583142, 12583264, 12586082, 12616034, 12632163, 12779619, 12976224, 14155898, 148, 522, 529,
+ 1036, 1408, 3328, 6148, 33792, 49153, 67072, 196613, 270338, 393222, 458755, 524308, 528388, 786502, 1572873,
+ 2981911, 3866652, 3883037, 4194343, 4194344, 4554787, 5242913, 6291760, 6307873, 6307893, 7077925, 7077947,
+ 7078071, 8388867, 8389185, 8389698, 8454227, 8585281, 8781890, 9175111, 9273413, 10158159, 10510419,
+ 11010131, 11534458, 11534491, 12615786, 12616290, 12616546, 12648546, 12746851, 14073961, 14418020, 14680306,
+ 15, 324, 1064, 3584, 6272, 12320, 24578, 131105, 134144, 163841, 276482, 393221, 532484, 1048600, 1474571,
+ 1573004, 1835020, 3145748, 3506202, 4195361, 4292609, 4294689, 5767181, 6291456, 6293041, 6308017, 6316080,
+ 6324273, 6488112, 6815792, 7077923, 7077949, 8388686, 8389191, 8413251, 8601666, 9257029, 10928208, 11403348,
+ 11534347, 11534362, 11542611, 11542617, 11927672, 11960408, 12582954, 12583013, 12583523, 12589154, 12616802,
+ 12681315, 13631722, 13656171, 13893738, 14155846, 14156910, 14286951, 14286957, 15401078, 15532150, 145, 322,
+ 328, 1027, 1033, 1120, 1552, 2304, 12292, 16416, 32784, 65544, 66048, 98305, 131076, 262145, 262178, 393283,
+ 540676, 786435, 1056776, 1441802, 2818069, 3146008, 4194304, 4194321, 4194349, 4292657, 5242923, 5767229,
+ 5771309, 5964332, 6291601, 6291635, 6308145, 6946869, 7209010, 7340153, 7782458, 8388680, 8388685, 8388811,
+ 8389195, 8470595, 8585314, 8618050, 10059855, 10485835, 10485845, 10486355, 10534979, 10534999, 10584147,
+ 11010134, 11272261, 11534361, 11665498, 11798617, 11927576, 11927634, 11993176, 12583270, 12615746, 12632160,
+ 14155790, 14155850, 14155870, 14680177, 15827066, 146, 276, 336, 641, 800, 1034, 2816, 6656, 49154, 66560,
+ 196617, 197121, 524324, 720901, 786442, 786566, 3158040, 3670040, 5963817, 6291637, 6292016, 6294577,
+ 6307897, 6308657, 6340656, 6815861, 7078199, 8389203, 8390722, 8454243, 8486983, 8585218, 8847425, 9134151,
+ 10485846, 10535123, 12583274, 12615794, 13631587, 14159978 };
+
+static int cbest_25[1023] = {
+ 1, 2, 16777220, 4, 8388610, 8, 4194305, 16, 32, 18874372, 64, 128, 9437186, 256, 512, 4718593, 1024, 2048,
+ 4096, 19136516, 8192, 16384, 32768, 9568258, 65536, 131072, 262144, 4784129, 524288, 1048576, 2097152,
+ 4194304, 19169284, 8388608, 3, 9584642, 16777216, 16777221, 5, 8388611, 16777222, 6, 9, 18, 36, 72, 144, 288,
+ 576, 1152, 2304, 4608, 9216, 18432, 36864, 73728, 147456, 294912, 589824, 1179648, 2359296, 4718592,
+ 25165830, 10, 17, 4194307, 8388614, 9437184, 16777228, 20971525, 12, 33, 4194309, 4792321, 8388618, 12582915,
+ 16777236, 18874368, 18874373, 20, 34, 65, 4194313, 8388626, 16777252, 18874374, 24, 66, 129, 4194321,
+ 8388642, 9437187, 16777284, 27262982, 40, 68, 130, 257, 4194337, 8388674, 16777348, 18874380, 23068677,
+ 26214406, 48, 132, 258, 513, 4194369, 8388738, 9437190, 16777476, 18874388, 80, 136, 260, 514, 1025, 4194433,
+ 4718595, 8388866, 9437194, 13631491, 16777732, 18874404, 19173380, 21495813, 96, 264, 516, 1026, 2049,
+ 4194561, 4718597, 8389122, 9437202, 13107203, 16778244, 18874436, 160, 272, 520, 1028, 2050, 4097, 4194817,
+ 4718601, 4784128, 8389634, 9437218, 9568256, 16779268, 18874500, 19136512, 19136517, 28311558, 192, 528,
+ 1032, 2052, 4098, 8193, 2392064, 4195329, 4718609, 8390658, 9437250, 16781316, 18874628, 19136518, 320, 544,
+ 1040, 2056, 4100, 8194, 16385, 1196032, 4196353, 4718625, 8392706, 9437314, 16785412, 18874884, 23592965,
+ 27525126, 384, 1056, 2064, 4104, 8196, 16386, 32769, 598016, 4198401, 4718657, 8396802, 9437442, 9568259,
+ 9586690, 16793604, 18875396, 19136524, 23330821, 640, 1088, 2080, 4112, 8200, 16388, 32770, 65537, 299008,
+ 4202497, 4718721, 8404994, 9437698, 14155779, 16809988, 18876420, 19136532, 26345478, 768, 2112, 4128, 8208,
+ 16392, 32772, 65538, 131073, 149504, 4210689, 4718849, 8421378, 9438210, 9568262, 16842756, 18878468,
+ 19136548, 1280, 2176, 4160, 8224, 16400, 32776, 65540, 74752, 131074, 262145, 4227073, 4719105, 8454146,
+ 9439234, 9568266, 13762563, 16908292, 18882564, 19136580, 1536, 4224, 8256, 16416, 32784, 37376, 65544,
+ 131076, 262146, 524289, 4259841, 4719617, 4784131, 8519682, 9441282, 9568274, 17039364, 18890756, 19136644,
+ 21561349, 28573702, 2560, 4352, 8320, 16448, 18688, 32800, 65552, 131080, 262148, 524290, 1048577, 4325377,
+ 4720641, 4784133, 4793345, 8650754, 9445378, 9568290, 13172739, 17301508, 18907140, 19136772, 28442630, 3072,
+ 8448, 9344, 16512, 32832, 65568, 131088, 262152, 524292, 1048578, 2097153, 4456449, 4722689, 4784137,
+ 8912898, 9453570, 9568322, 17825796, 18939908, 19137028, 23855109, 4672, 5120, 8704, 16640, 32896, 65600,
+ 131104, 262160, 524296, 1048580, 2097154, 4726785, 4784145, 9469954, 9568386, 19005444, 19137540, 19169280,
+ 19169285, 2336, 6144, 16896, 33024, 65664, 131136, 262176, 524304, 1048584, 2097156, 4194306, 4734977,
+ 4784161, 5242881, 9502722, 9568514, 9584640, 10485762, 19138564, 19169286, 20971524, 23658501, 1168, 10240,
+ 17408, 33280, 65792, 131200, 262208, 524320, 1048592, 2097160, 4194308, 4751361, 4784193, 4792320, 6291457,
+ 8388609, 9568770, 12582914, 14286851, 19140612, 19398660, 27557894, 584, 12288, 33792, 66048, 131328, 262272,
+ 524352, 1048608, 2097168, 4194312, 4784257, 9569282, 9699330, 14221315, 19144708, 19169292, 19922948,
+ 23363589, 25165828, 292, 20480, 34816, 66560, 131584, 262400, 524416, 1048640, 2097184, 2396160, 4194320,
+ 4784385, 4849665, 8388612, 8388624, 9570306, 9584643, 9961474, 16777217, 19152900, 19169300, 19173892, 146,
+ 24576, 67584, 132096, 262656, 524544, 1048704, 2097216, 4194336, 4784641, 4980737, 8388616, 9572354,
+ 12582913, 16777218, 16777223, 16777248, 19169316, 23068676, 26361862, 28704774, 7, 19, 37, 73, 145, 289, 577,
+ 1153, 2305, 4609, 9217, 18433, 36865, 40960, 69632, 73729, 133120, 147457, 263168, 294913, 524800, 589825,
+ 1048832, 1179649, 1198080, 2097280, 2359297, 4194368, 4785153, 9576450, 9584646, 11534338, 19169348,
+ 19202052, 25165826, 25165831, 11, 38, 74, 290, 578, 1154, 2306, 4610, 9218, 18434, 36866, 49152, 73730,
+ 135168, 147458, 264192, 294914, 525312, 589826, 1049088, 1179650, 2097408, 2359298, 4194432, 4718594,
+ 4786177, 5767169, 8388615, 8388640, 9437185, 9584650, 13631490, 13778947, 16777224, 16777229, 16777238,
+ 16777292, 16777364, 16777508, 16777796, 16778372, 16779524, 16781828, 16786436, 16795652, 16814084, 16850948,
+ 16924676, 17072132, 17367044, 17956868, 19169412, 19267588, 20971521, 21495812, 27262980, 28606470, 13, 22,
+ 76, 148, 580, 1156, 2308, 4612, 9220, 18436, 36868, 73732, 81920, 139264, 147460, 266240, 294916, 526336,
+ 589828, 599040, 1049600, 1179652, 2097664, 2359300, 4194560, 4718596, 4788225, 6815745, 8388619, 8388646,
+ 8388672, 8388682, 8388754, 8388898, 8389186, 8389762, 8390914, 8393218, 8397826, 8407042, 8425474, 8462338,
+ 8536066, 8683522, 8978434, 9584658, 9601026, 10747906, 13107202, 16777230, 16777232, 16777237, 18874369,
+ 19169540, 20971527, 23920645, 26214404, 14, 21, 26, 35, 44, 152, 296, 1160, 2312, 4616, 9224, 18440, 36872,
+ 73736, 98304, 147464, 270336, 294920, 528384, 589832, 1050624, 1179656, 2098176, 2359304, 4194311, 4194323,
+ 4194341, 4194377, 4194449, 4194593, 4194816, 4194881, 4195457, 4196609, 4198913, 4203521, 4212737, 4231169,
+ 4268033, 4341761, 4489217, 4718600, 4792323, 5373953, 6553601, 8388627, 8388736, 9437188, 9437200, 9584674,
+ 9633794, 16777253, 18874370, 18874375, 18874400, 19169796, 19660804, 21569541, 23887877, 25165838, 28459014,
+ 29360135, 25, 52, 67, 88, 304, 592, 2320, 4624, 9232, 18448, 36880, 73744, 147472, 163840, 278528, 294928,
+ 299520, 532480, 589840, 1052672, 1179664, 2099200, 2359312, 4194315, 4195328, 4718608, 4792325, 4800513,
+ 8388622, 8388643, 8388864, 9437192, 9584706, 9586946, 12582919, 13180931, 13631489, 16777254, 16777280,
+ 16777285, 19170308, 20185092, 20971533, 25165846, 27262978, 27262983, 41, 50, 69, 104, 131, 176, 608, 1184,
+ 4640, 9248, 18464, 36896, 73760, 147488, 196608, 294944, 540672, 589856, 1056768, 1179680, 2101248, 2359328,
+ 4194317, 4196352, 4718624, 4792329, 4816897, 8388630, 8388675, 8389120, 9584770, 9830402, 12582923, 13107201,
+ 14352387, 16777244, 16777286, 16777344, 16777349, 18874376, 18874381, 18874390, 18874444, 18874516, 18874660,
+ 18874948, 18875524, 18876676, 18878980, 18883588, 18892804, 18911236, 18948100, 19021828, 19171332, 19464196,
+ 20054020, 20971541, 23068673, 23592964, 25165862, 26214402, 26214407, 28, 42, 49, 70, 82, 100, 133, 208, 259,
+ 352, 1216, 2368, 9280, 18496, 36928, 73792, 147520, 149760, 294976, 327680, 557056, 589888, 1064960, 1179712,
+ 2105344, 2359360, 4194325, 4194339, 4198400, 4718656, 4792337, 8388634, 8388739, 8389632, 9437191, 9437216,
+ 9584898, 10092546, 12582931, 16777260, 16777350, 16777472, 16777477, 18874382, 18874384, 18874389, 20971557,
+ 23068679, 23330820, 25165894, 28311556, 81, 134, 137, 164, 200, 261, 416, 515, 704, 2432, 4736, 18560, 36992,
+ 73856, 147584, 295040, 393216, 589952, 1081344, 1179776, 2113536, 2359424, 4194329, 4194371, 4202496,
+ 4718720, 4792353, 4915201, 8388650, 8388678, 8388867, 8390656, 9437195, 9437222, 9437248, 9437258, 9437330,
+ 9437474, 9437762, 9438338, 9439490, 9441794, 9446402, 9455618, 9474050, 9510914, 9585154, 9732098, 10027010,
+ 11796482, 12582947, 14155778, 14303235, 16777268, 16777478, 16777728, 16777733, 18874405, 19173376, 19173381,
+ 19177476, 20971589, 21495809, 23666693, 25165958, 27262990, 31457287, 97, 138, 262, 265, 274, 328, 400, 517,
+ 832, 1027, 1408, 4864, 9472, 37120, 73984, 74880, 147712, 295168, 590080, 655360, 1114112, 1179904, 2129920,
+ 2359552, 4194345, 4194373, 4194435, 4210688, 4718848, 4792385, 5046273, 8388658, 8388742, 8389123, 8392704,
+ 9437203, 9437312, 9585666, 11665410, 12582979, 16777300, 16777356, 16777734, 16778240, 16778245, 18874406,
+ 18874432, 18874437, 19173382, 19185668, 20971653, 21495815, 23068685, 25166086, 26214414, 27262998, 27525124,
+ 30408711, 56, 84, 98, 140, 161, 266, 518, 521, 530, 656, 800, 1029, 1664, 2051, 9728, 18944, 74240, 147968,
+ 295424, 590336, 786432, 1180160, 2162688, 2359808, 4194353, 4194437, 4194563, 4227072, 4718599, 4718611,
+ 4718629, 4718665, 4718737, 4718881, 4719104, 4719169, 4720897, 4723201, 4727809, 4737025, 4755457, 4792449,
+ 4866049, 5013505, 5898241, 7077889, 8388690, 8388870, 8389635, 8396800, 9437198, 9437219, 9437440, 9568257,
+ 12583043, 13631495, 13762562, 14229507, 16777316, 16777484, 16778246, 16779264, 16779269, 18874396, 18874438,
+ 18874496, 18874501, 19136513, 20971781, 23068693, 25166342, 26214422, 27263014, 27561990, 28311554, 28311559,
+ 28737542, 29884423 };
+
+static int cbest_26[1023] = {
+ 1, 2, 33554467, 4, 50331698, 8, 25165849, 16, 32, 64, 46137391, 128, 256, 56623156, 512, 33554466, 16777233,
+ 3, 1024, 28311578, 33554465, 6, 50331699, 5, 41943083, 12, 2048, 25165848, 50331696, 9, 24, 12582924,
+ 33554471, 10, 4096, 14155789, 58720314, 48, 6291462, 54526006, 17, 33554475, 96, 8192, 3145731, 50331702, 20,
+ 25165841, 25165851, 18, 33, 192, 29360157, 33554483, 50331682, 34, 16384, 27263003, 33554435, 35127330,
+ 46137387, 46137390, 50331706, 384, 25165853, 40632357, 40, 65, 66, 768, 32768, 17563665, 25165833, 46137389,
+ 56623158, 68, 1048577, 50331666, 36, 129, 1536, 2097154, 23068695, 33554531, 46137383, 48234541, 80, 130,
+ 65536, 4194308, 28311579, 34078755, 47185966, 56623152, 56623157, 132, 3072, 8388616, 28311576, 136, 257,
+ 14155788, 16777232, 33554595, 6144, 131072, 42336299, 45088808, 50331762, 50593842, 53870641, 160, 260,
+ 25165881, 72, 258, 264, 513, 12288, 1572865, 3145730, 33554723, 272, 262144, 7077894, 23592983, 25296921,
+ 33554464, 50331826, 57671733, 24576, 22544404, 25165913, 47710254, 61866041, 320, 514, 520, 1025, 6291460,
+ 33554979, 34340899, 41943082, 46137407, 7, 528, 49152, 524288, 16777235, 46137359, 50331954, 544, 3145729,
+ 3538947, 20971541, 25165977, 33554469, 35127331, 50331697, 54722614, 56623140, 13, 144, 516, 1026, 2049,
+ 98304, 11272202, 12582920, 28311570, 33554470, 33555491, 41943081, 45350952, 56623164, 14, 640, 1040,
+ 1048576, 14155785, 16777237, 46202927, 50332210, 50724914, 58720315, 25, 1056, 196608, 6291458, 12582925,
+ 25166105, 27263001, 41943075, 44040233, 54526002, 54526007, 11, 26, 1028, 1088, 2050, 4097, 16777217,
+ 23855127, 25165825, 25165840, 25165850, 28311582, 33554474, 33554479, 33556515, 46137455, 50331700, 51118130,
+ 60489787, 62390329, 28, 49, 393216, 5636101, 6291463, 16777241, 29360156, 33554473, 35323938, 46137379,
+ 50331703, 50332722, 56623124, 58720312, 50, 1280, 2080, 12582916, 12582926, 22675476, 25165845, 25166361,
+ 25362457, 27263002, 28311562, 33554451, 33554491, 36700192, 39845925, 40632359, 41943087, 47185967, 50331650,
+ 50331690, 54526004, 22, 52, 97, 288, 1032, 2052, 2112, 4098, 8193, 786432, 14155781, 14155791, 18350096,
+ 25165852, 27361307, 29360153, 33554482, 33558563, 46137385, 46137519, 50331683, 56623154, 56655924, 58720306,
+ 60817464, 64487487, 21, 56, 2176, 2097152, 9175048, 14680078, 23068693, 25559065, 33554434, 33554499,
+ 46137386, 50331707, 50331710, 50333746, 62914622, 19, 100, 193, 1572864, 4587524, 13631501, 17563664,
+ 25165832, 25165843, 25166873, 28311577, 33554433, 40632353, 40632356, 45088809, 50331704, 53870640, 55574583,
+ 35, 44, 98, 104, 2056, 2560, 4100, 4160, 8194, 16385, 2293762, 8781832, 17661969, 25165855, 33554481,
+ 33562659, 36372513, 45482024, 46137647, 48234543, 50331680, 53477425, 56623220, 58720318, 42, 70, 112, 385,
+ 4224, 1146881, 4390916, 7340039, 11337738, 16777265, 23068694, 30408732, 33554487, 35127328, 46137388,
+ 50331686, 50335794, 56623159, 57147444, 38, 41, 140, 194, 200, 386, 4352, 2195458, 25165865, 25167897,
+ 28327962, 29360149, 29360159, 33554659, 41943099, 50331667, 50331730, 57671732, 58720298, 67, 88, 208, 280,
+ 576, 769, 2064, 4104, 8196, 16386, 32769, 1097729, 3145735, 23068691, 27262995, 28311610, 33554443, 33554530,
+ 33570851, 37224480, 40370213, 41943051, 46137382, 46137903, 46235695, 47710255, 48234537, 48234540, 50331664,
+ 54525990, 56623284, 61866040, 69, 84, 134, 224, 560, 5120, 8320, 3145728, 3538946, 4194304, 6291470,
+ 11534347, 16777297, 17039377, 20316178, 31457311, 33554529, 33554851, 34078754, 45088810, 46137381, 47185962,
+ 50339890, 53870643, 54526014, 56623153, 61341752, 37, 76, 196, 268, 400, 772, 1120, 1537, 8448, 2097155,
+ 7077892, 12582940, 15204366, 17301521, 18612240, 22544405, 23592981, 25165837, 25169945, 26935320, 34127907,
+ 34603042, 46137399, 46333999, 47235118, 64749631, 81, 131, 176, 416, 536, 770, 2240, 4112, 8200, 8704, 16388,
+ 32770, 65537, 1769473, 3145739, 4194309, 5668869, 14155784, 14155805, 17563667, 22741012, 25165835, 25165880,
+ 27263007, 28311642, 28573722, 33554439, 33554594, 33555235, 33587235, 35127334, 35651617, 46138415, 50331890,
+ 56098871, 56623412, 63799358, 65798204, 82, 133, 138, 168, 388, 448, 1072, 3073, 4480, 1048579, 6291478,
+ 8388617, 9306120, 14163981, 16777361, 24117270, 25165873, 28835866, 29360141, 33554535, 33554593, 34078753,
+ 34103331, 42074155, 42336298, 45350953, 47185964, 50331674, 50331763, 50348082, 50593843, 56623160, 57671735,
+ 58720282, 60293179, 137, 152, 800, 1538, 1544, 2144, 8960, 10240, 16640, 6291456, 12582956, 23592982,
+ 25165945, 25174041, 25296920, 27262987, 28311568, 30933020, 33556003, 39321638, 41943147, 42205227, 42385451,
+ 46137403, 48234533, 50331760, 50332082, 51740723, 53739569, 54525974, 61866043, 262, 352, 832, 1152, 4128,
+ 4288, 6145, 8208, 16392, 16896, 17920, 32772, 65538, 131073, 3145747, 3538945, 4194310, 4653060, 5636100,
+ 7602183, 10158089, 12648460, 14155821, 17563669, 25165912, 28311571, 28311580, 28311706, 30670876, 33554547,
+ 33554722, 33620003, 35127338, 37748775, 46139439, 47185958, 49283116, 50331746, 51380275, 56623668, 56672308,
+ 57409588, 74, 161, 261, 276, 336, 392, 896, 3074, 8576, 17408, 35840, 1048581, 6291494, 6324230, 7077890,
+ 8388608, 8388618, 11272200, 16777489, 23592979, 25166041, 33554599, 33557539, 40632365, 42336297, 44826665,
+ 50331670, 50331827, 50332466, 50364466, 50593840, 50618418, 52428848, 56623136, 56623166, 57671729, 60489786,
+ 62390328, 73, 259, 265, 304, 1540, 1600, 3088, 12289, 17152, 71680, 2097158, 2326530, 2818050, 3162115,
+ 12058635, 12582988, 13467660, 14155780, 14155790, 14286861, 16777234, 17563649, 21168149, 22544400, 25165883,
+ 25182233, 26214424, 33554563, 33554603, 34439203, 35323939, 41943211, 46137375, 47710250, 50331766, 50331824,
+ 54591542, 56623142, 56721460, 58851386, 164, 266, 273, 524, 704, 776, 1664, 6146, 8224, 16400, 20480, 32776,
+ 33280, 34304, 65540, 131074, 143360, 262145, 3145763, 5242885, 7077895, 10485770, 11370506, 11796491,
+ 12582912, 13107212, 14155853, 14417933, 17563673, 19660819, 20971540, 25165976, 25166233, 28311583, 28311834,
+ 33554721, 33554978, 33560611, 33685539, 34078759, 34340898, 35127346, 36388897, 36700193, 37355552, 42336291,
+ 45088800, 45350954, 46137351, 46137406, 46141487, 50333234, 50606130, 54657078, 56624180, 58720378, 61603896,
+ 162, 552, 672, 1792, 3076, 24577, 33792, 68608, 286720, 1048585, 1163265, 1409025, 6291526, 6553606, 8388620,
+ 13631500, 14155777, 15466510, 16777745, 17170449, 23068679, 23617559, 23855125, 25165897, 25165969, 28311554,
+ 33554727, 35127298, 35135522, 36175905, 40632373, 40636453, 46137355, 46137358, 46137423, 50331955, 50397234,
+ 54526070, 56623108, 56623148, 59244602, 64487486, 321, 515, 521, 608, 3200, 6176, 12290, 34816, 137216,
+ 573440, 1572867, 2097162, 3276803, 6291461, 6815750, 12583052, 15335438, 16777236, 20971537, 22675477,
+ 23068703, 24641558, 25165915, 25166617, 25198617, 28311563, 29360189, 33554468, 33554539, 33554731, 33566755,
+ 40632325, 41943339, 44040235, 46137357, 46137451, 47710252, 50331830, 50334770, 54722610, 54722615, 56623126,
+ 56623141, 148, 274, 322, 522, 529, 784, 1048, 1408, 2304, 3328, 6148, 8256, 16416, 32784, 49153, 65544,
+ 131076, 262146, 274432, 524289, 1146880, 3145795, 3407875, 4194316, 7340038, 14155917, 17567761, 18350097,
+ 18677776, 25165917, 25166104, 25296913, 25296923, 25309209, 27263000, 27263035, 27295771, 28311560, 28311574,
+ 28312090, 28336154, 28704794, 29425693, 32899102, 33554739, 33554977, 33555490, 33816611, 34078763, 34340897,
+ 37093408, 41943080, 42336303, 45088812, 46137405, 46145583, 50331794, 50331834, 50331938, 50593846, 53870645,
+ 55574582, 56229943, 56623165, 56625204, 58720442, 62390331, 63701054, 65929276, 518, 545, 1104, 1344, 3080,
+ 3584, 24578, 40960, 66560, 548864, 1048593, 3670019, 6291590, 12582922, 16777216, 16778257, 22544406,
+ 23855126, 25165824, 25165885, 25165905, 25166097, 25167385, 27328539, 27361305, 31195164, 33579043, 35192866,
+ 39583782, 40632355, 44138537, 45482025, 46202923, 46202926, 47710246, 50331952, 50332211, 50337842, 50462770,
+ 50724915, 51904563, 53477424, 54526000, 54526134, 60424251, 61866033, 145, 328, 517, 532, 546, 1027, 1216,
+ 1552, 6400, 12292, 12352, 67584, 98305, 1097728, 1572869, 2097170, 3145734, 6733830, 11272203, 12582921,
+ 12583180, 14155787, 14680076, 16777225, 16777239, 16777240, 25165979, 25231385, 25303065, 28360730, 29360221,
+ 29622301, 33554787, 33554947, 33554987, 35127329, 35258402, 36700194, 38633511, 39452710, 39845927, 40697893,
+ 41943074, 41943079, 41943595, 42139691, 43384874, 44040232, 46137515, 46137583, 48234557, 50331958, 50593826,
+ 50774066, 51216434, 51642419, 53870625, 54526003, 54722612, 54747190, 59424826, 15, 641, 6152, 6656, 49154,
+ 69632, 131080, 4194324, 11534346, 14156045, 17596433, 25168921, 25296925, 25362456, 27263067, 33556514,
+ 35389474, 41943073, 50331701, 50331770, 50332194, 50343986, 50593850, 60817466 };
+
+static int cbest_27[1023] = {
+ 1, 2, 67108883, 4, 100663322, 8, 50331661, 16, 32, 92274709, 64, 128, 113246233, 256, 512, 67108882,
+ 33554441, 3, 1024, 67108881, 123731999, 6, 100663323, 5, 2048, 83886103, 12, 50331660, 100663320, 9, 24,
+ 4096, 25165830, 67108887, 109051928, 10, 117440542, 48, 12582915, 50331657, 17, 8192, 54525964, 67108891,
+ 100663314, 128974876, 18, 96, 67108867, 92274711, 100663326, 20, 50331663, 73400338, 92274708, 113246232, 33,
+ 192, 16384, 27262982, 58720271, 34, 50331653, 92274705, 36, 384, 36700169, 46137354, 100663306, 40, 65,
+ 32768, 13631491, 56623116, 67108915, 66, 768, 96469012, 113246235, 68, 23068677, 64487438, 72, 129, 1536,
+ 65536, 67108947, 80, 4194305, 28311558, 73924626, 100663354, 132, 3072, 8388610, 48234506, 50331677,
+ 85458967, 123731998, 130, 136, 257, 131072, 16777220, 67109011, 69206035, 78643217, 144, 6144, 33554440,
+ 92274717, 100663386, 160, 14155779, 36962313, 50331693, 92274693, 258, 264, 513, 12288, 262144, 24117253,
+ 67109139, 101711898, 113246225, 272, 6291457, 12582914, 32243719, 100663450, 109838360, 113246237, 288,
+ 24576, 50331725, 67108880, 260, 320, 514, 1025, 524288, 50855949, 61865999, 67109395, 528, 49152, 25165828,
+ 70254611, 74186770, 83886102, 92274741, 100663578, 106430491, 123731995, 7, 544, 33554443, 50331789,
+ 79167505, 113246217, 123731997, 516, 576, 1026, 2049, 98304, 1048576, 12582913, 41943051, 54525965, 54919180,
+ 67108885, 67109907, 73400339, 83886099, 85590039, 100663321, 109051930, 13, 640, 33554433, 50331656,
+ 67108886, 83886101, 92274773, 92536853, 100663834, 109051929, 14, 26, 1056, 196608, 13631490, 33554445,
+ 50331649, 50331917, 83230736, 102236186, 117440543, 25, 52, 520, 1028, 1088, 2050, 4097, 2097152, 6815745,
+ 25165826, 25165831, 37093385, 67110931, 88080406, 92274707, 98041876, 123731991, 128974878, 11, 104, 1152,
+ 393216, 23068676, 27262980, 50331659, 50331662, 58720269, 67108875, 67108890, 67108895, 79691792, 92274837,
+ 94371861, 100663298, 100663315, 100663318, 100663324, 100664346, 103809050, 113246265, 117440538, 128974877,
+ 22, 28, 49, 208, 1280, 11534338, 27459590, 39845896, 46137355, 50332173, 58720270, 67108866, 67108889,
+ 92274710, 96469013, 100663312, 100663327, 113246234, 117440540, 44, 416, 1032, 2052, 2112, 4098, 8193,
+ 786432, 4194304, 5767169, 19922948, 44040203, 46137352, 50331652, 51118093, 54525960, 56623117, 67108865,
+ 67108899, 67112979, 109903896, 113377305, 120324126, 125829148, 128974872, 19, 88, 97, 832, 2176, 9961474,
+ 13631489, 36700168, 70516755, 92274965, 100665370, 106692635, 113246297, 121634847, 21, 38, 50, 56, 176,
+ 1664, 2304, 1572864, 4980737, 18350084, 29360135, 33554457, 50332685, 51904525, 64487439, 73924627, 92274704,
+ 109051920, 123731983, 76, 193, 352, 1040, 2056, 2560, 3328, 4100, 8194, 16385, 9175042, 27262978, 27262983,
+ 41615368, 49020938, 54525966, 58720267, 64487436, 67108871, 67117075, 73400336, 83886111, 90177558,
+ 100663307, 109051932, 114294809, 117440534, 35, 98, 152, 194, 704, 4224, 6656, 3145728, 4587521, 13729795,
+ 14155778, 50331669, 62914574, 67108914, 67108979, 69992467, 81788944, 83886087, 89128982, 92275221, 96469014,
+ 100663304, 100663338, 100667418, 113246361, 115343385, 37, 42, 70, 112, 304, 385, 1408, 4352, 13312,
+ 12582919, 24117252, 28311556, 33554473, 48234507, 50331655, 50333709, 54525956, 67108913, 80740368, 85655575,
+ 92274713, 92667925, 95420437, 128974868, 41, 140, 608, 2064, 2816, 4104, 4608, 8196, 16386, 26624, 32769,
+ 6291456, 7077889, 8388608, 25165838, 32243718, 40894472, 48234504, 54951948, 56623112, 67109075, 67125267,
+ 69599251, 102367258, 127926300, 67, 100, 280, 388, 769, 1216, 5120, 5632, 53248, 12058626, 40370184,
+ 45088779, 50331673, 50331676, 67108946, 92275733, 93061141, 96469008, 100663310, 100671514, 104071194,
+ 109051912, 113246224, 113246489, 123732031, 131596317, 69, 74, 84, 224, 386, 560, 2432, 8448, 11264, 106496,
+ 12582923, 20447236, 23068673, 31457287, 33554505, 36700171, 44564491, 50335757, 58720263, 60162063, 64487434,
+ 67108919, 67108945, 67109267, 69402643, 73400342, 92274719, 100663355, 100663418, 113246236, 117440526,
+ 123797535, 73, 196, 1120, 1537, 2080, 4112, 4864, 8200, 8704, 16388, 22528, 32770, 65537, 212992, 6029313,
+ 14155777, 20185092, 24510469, 25165846, 56623118, 67108923, 67141651, 78643219, 83886135, 85458966, 92274695,
+ 99352596, 100663346, 100663352, 102105114, 113246239, 81, 134, 770, 776, 2240, 9216, 9728, 45056, 425984,
+ 10223618, 20807684, 28311554, 28311559, 34603017, 36700161, 36962312, 46137346, 50331689, 50331692, 50331709,
+ 67109010, 67109651, 69206034, 73973778, 78643216, 91226134, 92274689, 92276757, 100663514, 100679706,
+ 113246227, 113246745, 120455198, 123732063, 128974860, 133, 148, 168, 448, 3073, 4480, 10240, 19456, 90112,
+ 851968, 8388611, 10092546, 12582912, 12582931, 23068679, 32243717, 33554569, 35651593, 36700173, 46137358,
+ 50339853, 51183629, 56623108, 63963150, 67108931, 67108951, 70647827, 71303186, 73400346, 73924624, 74186771,
+ 82837520, 85458963, 92274701, 92274716, 100663387, 101908506, 106430490, 113442841, 114819097, 123731994,
+ 124256287, 131, 137, 200, 1538, 4128, 8208, 8960, 16392, 16896, 32772, 38912, 65538, 131073, 180224, 1703936,
+ 5111809, 13631495, 16121859, 16777216, 16777221, 18481156, 25165862, 27262990, 27475974, 39321608, 50331679,
+ 50331757, 52035597, 54525980, 67108955, 67110419, 67174419, 73400322, 75497489, 83886167, 85458965, 89653270,
+ 92274692, 100663358, 100663378, 100663384, 100663706, 109051960, 109936664, 113246216, 123731996, 124780575,
+ 82, 138, 145, 268, 772, 1552, 6145, 17408, 17920, 77824, 360448, 3407872, 4194307, 5046273, 42729483,
+ 50331685, 50331721, 50331724, 61865997, 64487430, 67109009, 67109138, 69206033, 81264656, 84410391, 89391126,
+ 92274725, 92278805, 100696090, 101711899, 101810202, 109838362, 113246209, 113246229, 113247257, 113639449,
+ 117440574, 123732127, 146, 161, 296, 336, 392, 896, 3074, 18432, 35840, 155648, 720896, 6815744, 9240578,
+ 12582947, 19660804, 33554697, 41418760, 50331853, 50348045, 50855948, 51052557, 61865998, 67108995, 67109015,
+ 67111955, 70123539, 84934679, 92274743, 100663451, 100664090, 109838361, 162, 259, 265, 1540, 4160, 8224,
+ 12289, 16400, 20480, 32776, 65540, 71680, 131074, 262145, 311296, 1441792, 13631499, 16777222, 24117249,
+ 25165824, 25165894, 25427974, 27262998, 45613067, 50331695, 54525996, 54919181, 58720287, 67109019, 67239955,
+ 83230737, 83886231, 92274737, 97189908, 100663370, 100663390, 100663442, 104857627, 109051992, 109314072,
+ 113246219, 128974908, 129007644, 266, 273, 536, 3104, 6146, 33792, 143360, 622592, 2883584, 4194309, 4620289,
+ 9830402, 12713987, 31981575, 32243715, 33554442, 36986889, 40632328, 41943049, 48234498, 49676298, 50331717,
+ 50331785, 50331788, 50332045, 50954253, 53215245, 67109137, 67109394, 67115027, 73924630, 79364113, 88080407,
+ 92282901, 96469020, 98041877, 100664858, 100728858, 101711896, 109576216, 113248281, 117440606, 123731987,
+ 123731990, 123731993, 123732255, 262, 274, 289, 400, 592, 672, 1792, 3076, 24577, 34816, 286720, 1245184,
+ 5767168, 8388614, 10403842, 12582979, 20709380, 20971525, 23068685, 33554953, 36700185, 36962315, 41943050,
+ 44826635, 46137370, 48234510, 50364429, 54657036, 67109043, 67109123, 69664787, 70254610, 73400370, 79167507,
+ 79691793, 92274740, 92274775, 92274805, 92700693, 96468996, 100663448, 100663579, 102432794, 104202266,
+ 113246264, 117964830, 132907037, 261, 276, 290, 321, 515, 1544, 8256, 12290, 16416, 32784, 36864, 65544,
+ 131076, 262146, 524289, 573440, 2490368, 4915201, 13631488, 13631507, 13737987, 24117255, 25165958, 27263014,
+ 33554432, 35127305, 44695563, 50331727, 50332429, 50855945, 50905101, 54526028, 54788108, 56623132, 58720303,
+ 61865995, 67109147, 67121171, 67371027, 69206039, 69632019, 73465874, 77594641, 78643221, 79167504, 83886097,
+ 83886359, 88473622, 92274769, 93126677, 100663434, 100663454, 100663570, 100666394, 109052056, 111149080,
+ 128974940, 129237020, 292, 322, 529, 784, 1072, 6148, 6208, 40960, 49153, 1146880, 4194313, 4980736, 6291459,
+ 20316164, 25165829, 27328518, 28311566, 29360134, 30932999, 33554444, 36962305, 50331648, 50331781, 50331916,
+ 54968332, 60227599, 67108884, 67109393, 67109906, 73924634, 75169810, 83886098, 85590038, 92274901, 92291093,
+ 92536855, 100794394, 109051931, 113246249, 113250329, 113377304, 117440670, 119537694, 123732511, 129499164,
+ 164, 324, 518, 530, 545, 1184, 1344, 3080, 3584, 24578, 67584, 2293760, 8388618, 10354690, 12583043,
+ 14155783, 14680067, 23068693, 27394054, 33554437, 33555465, 36700201, 36732937, 36962317, 37093384, 39845897,
+ 46137386, 50333197, 50397197, 55574540, 64487454, 67109171, 67109379, 67133459, 70254609, 73400402, 73924610,
+ 74186768, 83099664, 83230738, 83886100, 85458975, 92274772, 92536852, 92635157, 100663576, 100663835,
+ 100669466, 101711890, 102170650, 106430489, 106692634, 113246267, 113246296, 121634846, 123830303, 124518431,
+ 128974874, 517, 532, 16448, 69632, 524290, 1048577, 4587520, 27262976, 46137353, 50331658, 50331741,
+ 54525961, 56623148, 58720268, 58982415, 67109143, 67109203, 67633171, 78643225, 100663482, 102236187,
+ 106954779, 123731982 };
+
+static int cbest_28[1023] = {
+ 1, 2, 134217732, 4, 67108866, 8, 33554433, 16, 32, 150994948, 64, 128, 75497474, 256, 512, 37748737, 1024,
+ 2048, 4096, 153092100, 8192, 16384, 32768, 76546050, 65536, 131072, 262144, 38273025, 524288, 1048576,
+ 2097152, 4194304, 153354244, 8388608, 16777216, 33554432, 76677122, 67108864, 3, 134217728, 134217733, 5,
+ 38338561, 67108867, 134217734, 6, 9, 18, 36, 72, 144, 288, 576, 1152, 2304, 4608, 9216, 18432, 36864, 73728,
+ 147456, 294912, 589824, 1179648, 2359296, 4718592, 9437184, 18874368, 37748736, 201326598, 10, 17, 33554435,
+ 67108870, 75497472, 134217740, 167772165, 12, 33, 33554437, 67108874, 100663299, 134217748, 150994944,
+ 150994949, 20, 34, 65, 33554441, 67108882, 134217764, 150994950, 24, 66, 129, 33554449, 67108898, 75497475,
+ 134217796, 153387012, 218103814, 40, 68, 130, 257, 33554465, 67108930, 134217860, 150994956, 184549381,
+ 209715206, 48, 132, 258, 513, 33554497, 67108994, 75497478, 134217988, 150994964, 80, 136, 260, 514, 1025,
+ 33554561, 37748739, 67109122, 75497482, 109051907, 134218244, 150994980, 171966469, 96, 264, 516, 1026, 2049,
+ 33554689, 37748741, 67109378, 75497490, 104857603, 134218756, 150995012, 160, 272, 520, 1028, 2050, 4097,
+ 33554945, 37748745, 38273024, 67109890, 75497506, 76546048, 76693506, 134219780, 150995076, 153092096,
+ 153092101, 226492422, 192, 528, 1032, 2052, 4098, 8193, 19136512, 33555457, 37748753, 67110914, 75497538,
+ 134221828, 150995204, 153092102, 320, 544, 1040, 2056, 4100, 8194, 16385, 9568256, 33556481, 37748769,
+ 67112962, 75497602, 134225924, 150995460, 188743685, 220200966, 384, 1056, 2064, 4104, 8196, 16386, 32769,
+ 4784128, 33558529, 37748801, 67117058, 75497730, 76546051, 134234116, 150995972, 153092108, 186646533, 640,
+ 1088, 2080, 4112, 8200, 16388, 32770, 65537, 2392064, 33562625, 37748865, 67125250, 75497986, 113246211,
+ 134250500, 150996996, 153092116, 210763782, 768, 2112, 4128, 8208, 16392, 32772, 65538, 131073, 1196032,
+ 33570817, 37748993, 38346753, 67141634, 75498498, 76546054, 134283268, 150999044, 153092132, 1280, 2176,
+ 4160, 8224, 16400, 32776, 65540, 131074, 262145, 598016, 33587201, 37749249, 67174402, 75499522, 76546058,
+ 110100483, 134348804, 151003140, 153092164, 1536, 4224, 8256, 16416, 32784, 65544, 131076, 262146, 299008,
+ 524289, 33619969, 37749761, 38273027, 67239938, 75501570, 76546066, 134479876, 151011332, 153092228,
+ 172490757, 228589574, 2560, 4352, 8320, 16448, 32800, 65552, 131080, 149504, 262148, 524290, 1048577,
+ 33685505, 37750785, 38273029, 67371010, 75505666, 76546082, 105381891, 134742020, 151027716, 153092356,
+ 227540998, 3072, 8448, 16512, 32832, 65568, 74752, 131088, 262152, 524292, 1048578, 2097153, 33816577,
+ 37752833, 38273033, 67633154, 75513858, 76546114, 135266308, 151060484, 153092612, 190840837, 5120, 8704,
+ 16640, 32896, 37376, 65600, 131104, 262160, 524296, 1048580, 2097154, 4194305, 34078721, 37756929, 38273041,
+ 68157442, 75530242, 76546178, 136314884, 151126020, 153093124, 153354240, 153354245, 6144, 16896, 18688,
+ 33024, 65664, 131136, 262176, 524304, 1048584, 2097156, 4194306, 8388609, 34603009, 37765121, 38273057,
+ 69206018, 75563010, 76546306, 76677120, 138412036, 151257092, 153094148, 153354246, 153391108, 189267973,
+ 9344, 10240, 17408, 33280, 65792, 131200, 262208, 524320, 1048592, 2097160, 4194308, 8388610, 16777217,
+ 35651585, 37781505, 38273089, 38338560, 71303170, 75628546, 76546562, 114294787, 142606340, 151519236,
+ 153096196, 220463110, 4672, 12288, 33792, 66048, 131328, 262272, 524352, 1048608, 2097168, 4194312, 8388612,
+ 16777218, 37814273, 38273153, 75759618, 76547074, 113770499, 152043524, 153100292, 153354252, 186908677,
+ 2336, 20480, 34816, 66560, 131584, 262400, 524416, 1048640, 2097184, 4194320, 8388616, 16777220, 19169280,
+ 33554434, 37879809, 38273281, 41943041, 76021762, 76548098, 76677123, 83886082, 153108484, 153354260,
+ 167772164, 1168, 24576, 67584, 132096, 262656, 524544, 1048704, 2097216, 4194336, 8388624, 16777224,
+ 33554436, 38010881, 38273537, 50331649, 67108865, 76550146, 100663298, 153124868, 153354276, 155189252,
+ 210894854, 229638150, 584, 40960, 69632, 133120, 263168, 524800, 1048832, 2097280, 4194368, 8388640, 9584640,
+ 16777232, 33554440, 38274049, 76554242, 76677126, 77594626, 153157636, 153354308, 159383556, 201326596, 292,
+ 49152, 135168, 264192, 525312, 1049088, 2097408, 4194432, 8388672, 16777248, 33554448, 38275073, 38797313,
+ 67108868, 67108880, 76562434, 76677130, 76695554, 79691778, 110231555, 134217729, 153223172, 153354372,
+ 228851718, 146, 81920, 139264, 266240, 526336, 1049600, 2097664, 4194560, 4792320, 8388736, 16777280,
+ 33554464, 38277121, 39845889, 67108872, 76578818, 76677138, 100663297, 134217730, 134217735, 134217760,
+ 153354500, 184549380, 191365125, 7, 19, 37, 73, 145, 289, 577, 1153, 2305, 4609, 9217, 18433, 36865, 73729,
+ 98304, 147457, 270336, 294913, 528384, 589825, 1050624, 1179649, 2098176, 2359297, 4194816, 4718593, 8388864,
+ 9437185, 16777344, 18874369, 33554496, 38281217, 38338563, 76611586, 76677154, 92274690, 153354756,
+ 153616388, 172556293, 191102981, 201326594, 201326599, 227672070, 11, 38, 74, 290, 578, 1154, 2306, 4610,
+ 9218, 18434, 36866, 73730, 147458, 163840, 278528, 294914, 532480, 589826, 1052672, 1179650, 2099200,
+ 2359298, 2396160, 4195328, 4718594, 8389120, 9437186, 16777472, 18874370, 33554560, 37748738, 38289409,
+ 38338565, 46137345, 67108871, 67108896, 75497473, 76677186, 105447427, 109051906, 134217736, 134217741,
+ 134217750, 134217804, 134217876, 134218020, 134218308, 134218884, 134220036, 134222340, 134226948, 134236164,
+ 134254596, 134291460, 134365188, 134512644, 134807556, 135397380, 136577028, 138936324, 143654916, 153355268,
+ 154140676, 167772161, 171966468, 218103812, 13, 22, 76, 148, 580, 1156, 2308, 4612, 9220, 18436, 36868,
+ 73732, 147460, 196608, 294916, 540672, 589828, 1056768, 1179652, 2101248, 2359300, 4196352, 4718596, 8389632,
+ 9437188, 16777728, 18874372, 33554688, 37748740, 38305793, 38338569, 54525953, 67108875, 67108902, 67108928,
+ 67108938, 67109010, 67109154, 67109442, 67110018, 67111170, 67113474, 67118082, 67127298, 67145730, 67182594,
+ 67256322, 67403778, 67698690, 68288514, 69468162, 71827458, 76677250, 76808194, 85983234, 104857602,
+ 114819075, 134217742, 134217744, 134217749, 150994945, 153356292, 167772167, 209715204, 14, 21, 26, 35, 44,
+ 152, 296, 1160, 2312, 4616, 9224, 18440, 36872, 73736, 147464, 294920, 327680, 557056, 589832, 1064960,
+ 1179656, 1198080, 2105344, 2359304, 4198400, 4718600, 8390656, 9437192, 16778240, 18874376, 33554439,
+ 33554451, 33554469, 33554505, 33554577, 33554721, 33554944, 33555009, 33555585, 33556737, 33559041, 33563649,
+ 33572865, 33591297, 33628161, 33701889, 33849345, 34144257, 34734081, 35913729, 37748744, 38338577, 42991617,
+ 52428801, 67108883, 67108992, 75497476, 75497488, 76677378, 77070338, 134217765, 150994946, 150994951,
+ 150994976, 153358340, 157286404, 201326606, 234881031, 25, 52, 67, 88, 304, 592, 2320, 4624, 9232, 18448,
+ 36880, 73744, 147472, 294928, 393216, 589840, 1081344, 1179664, 2113536, 2359312, 4202496, 4718608, 8392704,
+ 9437200, 16779264, 18874384, 33554443, 33555456, 37748752, 38338593, 38347777, 38404097, 67108878, 67108899,
+ 67109120, 75497480, 76677634, 100663303, 109051905, 114425859, 134217766, 134217792, 134217797, 153362436,
+ 153387008, 153387013, 161480708, 167772173, 189333509, 201326614, 218103810, 218103815, 41, 50, 69, 104, 131,
+ 176, 608, 1184, 4640, 9248, 18464, 36896, 73760, 147488, 294944, 589856, 599040, 655360, 1114112, 1179680,
+ 2129920, 2359328, 4210688, 4718624, 8396800, 9437216, 16781312, 18874400, 33554445, 33556480, 37748768,
+ 38338625, 38535169, 67108886, 67108931, 67109376, 76678146, 78643202, 100663307, 104857601, 134217756,
+ 134217798, 134217856, 134217861, 150994952, 150994957, 150994966, 150995020, 150995092, 150995236, 150995524,
+ 150996100, 150997252, 150999556, 151004164, 151013380, 151031812, 151068676, 151142404, 151289860, 151584772,
+ 152174596, 153370628, 153387014, 155713540, 160432132, 167772181, 184549377, 188743684, 201326630, 209715202,
+ 209715207, 28, 42, 49, 70, 82, 100, 133, 208, 259, 352, 1216, 2368, 9280, 18496, 36928, 73792, 147520,
+ 294976, 589888, 786432, 1179712, 2162688, 2359360, 4227072, 4718656, 8404992, 9437248, 16785408, 18874432,
+ 33554453, 33554467, 33558528, 37748800, 38338689, 67108890, 67108995, 67109888, 75497479, 75497504, 76679170,
+ 76693504, 80740354, 100663315, 113836035, 134217772, 134217862, 134217984, 134217989, 150994958, 150994960,
+ 150994965, 167772197, 184549383, 186646532, 201326662, 220495878, 226492420, 229900294, 81, 134, 137, 164,
+ 200, 261, 416, 515, 704, 4736, 36992, 73856, 147584, 295040, 299520, 1179776, 2228224, 2359424, 8421376,
+ 9437312, 18874496, 33554457, 33554499, 33562624, 37748864, 67108906, 67108934, 67109123, 67110912, 75497483,
+ 75497510, 75497536, 75497546, 75497762, 75498050, 75499778, 75502082, 75506690, 75515906, 75534338, 75571202,
+ 75644930, 75792386, 76087298, 76681218, 77856770, 80216066, 94371842, 100663331, 113246210, 134217780,
+ 134217990, 134218240, 150994981, 153419780, 167772229, 171966465, 201326726, 218103822, 229769222, 251658247 };
+
+static int cbest_29[1023] = {
+ 1, 2, 268435458, 4, 134217729, 8, 16, 335544322, 32, 64, 167772161, 128, 256, 512, 352321538, 1024, 2048,
+ 4096, 176160769, 8192, 16384, 32768, 65536, 356515842, 131072, 262144, 524288, 1048576, 178257921, 2097152,
+ 4194304, 8388608, 16777216, 33554432, 357564418, 67108864, 134217728, 268435456, 3, 178782209, 268435459, 5,
+ 10, 20, 40, 80, 160, 320, 640, 1280, 2560, 5120, 10240, 20480, 40960, 81920, 163840, 327680, 655360, 1310720,
+ 2621440, 5242880, 10485760, 20971520, 41943040, 83886080, 167772160, 6, 9, 134217731, 268435462, 335544320,
+ 402653187, 17, 134217733, 268435466, 335544323, 12, 18, 33, 134217737, 268435474, 34, 65, 134217745,
+ 268435490, 335544326, 469762051, 24, 36, 66, 129, 134217761, 167772163, 268435522, 335544330, 357826562,
+ 436207619, 68, 130, 257, 134217793, 167772165, 176160768, 268435586, 335544338, 352321536, 48, 72, 132, 258,
+ 513, 88080384, 134217857, 167772169, 268435714, 335544354, 352321539, 136, 260, 514, 1025, 44040192,
+ 134217985, 167772177, 268435970, 335544386, 503316483, 96, 144, 264, 516, 1026, 2049, 22020096, 134218241,
+ 167772193, 268436482, 335544450, 352321542, 486539267, 272, 520, 1028, 2050, 4097, 11010048, 134218753,
+ 167772225, 268437506, 335544578, 352321546, 192, 288, 528, 1032, 2052, 4098, 8193, 5505024, 134219777,
+ 167772289, 176160771, 178913281, 268439554, 335544834, 352321554, 444596227, 544, 1040, 2056, 4100, 8194,
+ 16385, 2752512, 134221825, 167772417, 176160773, 268443650, 335545346, 352321570, 384, 576, 1056, 2064, 4104,
+ 8196, 16386, 32769, 1376256, 134225921, 167772673, 176160777, 268451842, 335546370, 352321602, 356515840,
+ 520093699, 1088, 2080, 4112, 8200, 16388, 32770, 65537, 688128, 134234113, 167773185, 176160785, 178257920,
+ 268468226, 335548418, 352321666, 356515843, 511705091, 768, 1152, 2112, 4128, 8208, 16392, 32772, 65538,
+ 131073, 344064, 134250497, 167774209, 176160801, 268500994, 335552514, 352321794, 2176, 4160, 8224, 16400,
+ 32776, 65540, 131074, 172032, 262145, 89128960, 134283265, 167776257, 176160833, 268566530, 335560706,
+ 352322050, 356515846, 490733571, 1536, 2304, 4224, 8256, 16416, 32784, 65544, 86016, 131076, 262146, 524289,
+ 134348801, 167780353, 176160897, 268697602, 335577090, 352322562, 356515850, 4352, 8320, 16448, 32800, 43008,
+ 65552, 131080, 262148, 524290, 1048577, 44564480, 134479873, 167788545, 176161025, 268959746, 335609858,
+ 352323586, 356515858, 357892098, 3072, 4608, 8448, 16512, 21504, 32832, 65568, 131088, 262152, 524292,
+ 1048578, 2097153, 134742017, 167804929, 176161281, 178257923, 269484034, 335675394, 352325634, 356515874,
+ 446693379, 528482307, 8704, 10752, 16640, 32896, 65600, 131104, 262160, 524296, 1048580, 2097154, 4194305,
+ 22282240, 135266305, 167837697, 176161793, 178257925, 270532610, 335806466, 352329730, 356515906, 524288003,
+ 5376, 6144, 9216, 16896, 33024, 65664, 131136, 262176, 524304, 1048584, 2097156, 4194306, 8388609, 136314881,
+ 167903233, 176162817, 178257929, 272629762, 336068610, 352337922, 356515970, 2688, 17408, 33280, 65792,
+ 131200, 262208, 524320, 1048592, 2097160, 4194308, 8388610, 11141120, 16777217, 138412033, 168034305,
+ 176164865, 178257937, 276824066, 336592898, 352354306, 356516098, 357564416, 513802243, 1344, 12288, 18432,
+ 33792, 66048, 131328, 262272, 524352, 1048608, 2097168, 4194312, 8388612, 16777218, 33554433, 142606337,
+ 168296449, 176168961, 178257953, 285212674, 337641474, 352387074, 356516354, 357564419, 672, 34816, 66560,
+ 131584, 262400, 524416, 1048640, 2097184, 4194320, 5570560, 8388616, 16777220, 33554434, 67108865, 150994945,
+ 168820737, 176177153, 178257985, 178782208, 301989890, 339738626, 352452610, 356516866, 336, 24576, 36864,
+ 67584, 132096, 262656, 524544, 1048704, 2097216, 4194336, 8388624, 16777224, 33554436, 67108866, 169869313,
+ 176193537, 178258049, 178946049, 343932930, 352583682, 356517890, 357564422, 491782147, 168, 69632, 133120,
+ 263168, 524800, 1048832, 2097280, 2785280, 4194368, 8388640, 16777232, 33554440, 67108868, 134217730,
+ 171966465, 176226305, 178258177, 201326593, 352845826, 356519938, 357564426, 402653186, 532676611, 84, 49152,
+ 73728, 135168, 264192, 525312, 1049088, 2097408, 4194432, 8388672, 16777248, 33554448, 67108872, 89391104,
+ 134217732, 176291841, 178258433, 268435457, 353370114, 356524034, 357564434, 369098754, 530579459, 42,
+ 139264, 266240, 526336, 1049600, 1392640, 2097664, 4194560, 8388736, 16777280, 33554464, 67108880, 134217736,
+ 176422913, 178258945, 184549377, 268435464, 354418690, 356532226, 357564450, 11, 21, 41, 81, 161, 321, 641,
+ 1281, 2561, 5121, 10241, 20481, 40961, 81921, 98304, 147456, 163841, 270336, 327681, 528384, 655361, 1050624,
+ 1310721, 2098176, 2621441, 4194816, 5242881, 8388864, 10485761, 16777344, 20971521, 33554496, 41943041,
+ 67108896, 83886081, 134217744, 176685057, 178259969, 178782211, 268435460, 356548610, 357564482, 402653185,
+ 447217667, 469762050, 525336579, 7, 22, 82, 162, 322, 642, 1282, 2562, 5122, 10242, 20482, 40962, 81922,
+ 163842, 278528, 327682, 532480, 655362, 696320, 1052672, 1310722, 2099200, 2621442, 4195328, 5242882,
+ 8389120, 10485762, 16777472, 20971522, 33554560, 41943042, 44695552, 67108928, 83886082, 134217760,
+ 167772162, 177209345, 178262017, 178782213, 234881025, 268435463, 268435478, 268435498, 268435538, 268435618,
+ 268435778, 268436098, 268436738, 268438018, 268440578, 268445698, 268455938, 268476418, 268517378, 268599298,
+ 268763138, 269090818, 269746178, 271056898, 273678338, 278921218, 289406978, 310378498, 335544321, 356581378,
+ 357564546, 360710146, 436207618, 14, 44, 164, 324, 644, 1284, 2564, 5124, 10244, 20484, 40964, 81924, 163844,
+ 196608, 294912, 327684, 540672, 655364, 1056768, 1310724, 2101248, 2621444, 4196352, 5242884, 8389632,
+ 10485764, 16777728, 20971524, 33554688, 41943044, 67108992, 83886084, 134217739, 134217749, 134217769,
+ 134217792, 134217809, 134217889, 134218049, 134218369, 134219009, 134220289, 134222849, 134227969, 134238209,
+ 134258689, 134299649, 134381569, 134545409, 134873089, 135528449, 136839169, 139460609, 144703489, 155189249,
+ 167772164, 178266113, 178782217, 218103809, 268435467, 268435472, 335544328, 356646914, 357564674, 13, 19,
+ 28, 88, 328, 648, 1288, 2568, 5128, 10248, 20488, 40968, 81928, 163848, 327688, 348160, 557056, 655368,
+ 1064960, 1310728, 2105344, 2621448, 4198400, 5242888, 8390656, 10485768, 16778240, 20971528, 33554944,
+ 41943048, 67109120, 83886088, 134217735, 134217856, 167772168, 178274305, 178782225, 180355073, 268435475,
+ 268435488, 335544324, 356777986, 357564930, 385875970, 402653191, 469762049, 514326531, 26, 35, 56, 176, 656,
+ 1296, 2576, 5136, 10256, 20496, 40976, 81936, 163856, 327696, 393216, 589824, 655376, 1081344, 1310736,
+ 2113536, 2621456, 4202496, 5242896, 8392704, 10485776, 16779264, 20971536, 22347776, 33555456, 41943056,
+ 67109376, 83886096, 134217984, 167772176, 178290689, 178782241, 268435470, 268435491, 268435520, 335544327,
+ 335544342, 335544362, 335544402, 335544482, 335544642, 335544962, 335545602, 335546882, 335549442, 335554562,
+ 335564802, 335585282, 335626242, 335708162, 335872002, 336199682, 336855042, 338165762, 340787202, 346030082,
+ 357040130, 357565442, 357826560, 357908482, 377487362, 402653195, 436207617, 503316482, 25, 37, 52, 67, 112,
+ 352, 1312, 2592, 5152, 10272, 20512, 40992, 81952, 163872, 174080, 327712, 655392, 1114112, 1310752, 2129920,
+ 2621472, 4210688, 5242912, 8396800, 10485792, 16781312, 20971552, 33556480, 41943072, 67109888, 83886112,
+ 134217741, 134217747, 134218240, 167772192, 178323457, 178782273, 192937985, 268435523, 268435584, 335544331,
+ 335544336, 357566466, 357826563, 402653203, 486539266, 534773763, 38, 69, 74, 104, 131, 224, 704, 2624, 5184,
+ 10304, 20544, 41024, 81984, 163904, 327744, 655424, 786432, 1179648, 1310784, 2162688, 2621504, 4227072,
+ 5242944, 8404992, 10485824, 16785408, 20971584, 33558528, 41943104, 67110912, 83886144, 134217763, 134218752,
+ 167772171, 167772181, 167772201, 167772224, 167772241, 167772321, 167772481, 167772801, 167773441, 167774721,
+ 167777281, 167782401, 167792641, 167813121, 167854081, 167936001, 168099841, 168427521, 169082881, 170393601,
+ 173015041, 178388993, 178782337, 188743681, 251658241, 268435482, 268435494, 268435587, 268435712, 335544339,
+ 335544352, 352321537, 357568514, 358612994, 402653219, 469762055, 533725187, 49, 70, 73, 133, 138, 148, 208,
+ 259, 448, 1408, 5248, 10368, 20608, 41088, 82048, 87040, 163968, 327808, 655488, 1310848, 2228224, 2621568,
+ 4259840, 5243008, 8421376, 10485888, 11173888, 16793600, 20971648, 33562624, 41943168, 67112960, 83886208,
+ 88080385, 134217753, 134217765, 134217795, 134219776, 167772167, 167772288, 176160770, 178520065, 178782465,
+ 178913280, 243269633, 268435526, 268435715, 268435968, 335544334, 335544355, 335544384, 352321544, 357572610,
+ 357826566, 402653251, 436207623, 444596226, 469762059, 492044291, 503316481, 50, 134, 137, 261, 266, 296,
+ 416, 515, 896, 2816, 20736, 41216, 82176, 164096, 327936, 655616, 1310976, 1572864, 2359296, 2621696,
+ 8454144, 10486016, 16809984, 20971776, 33570816, 44040193, 67117056, 83886336, 134217797, 167772416,
+ 178782721, 268435506, 268435530, 268435590, 268435971, 268436480, 335544387, 335544448, 352321540, 357826570,
+ 364904450, 402653315, 436207627, 469762067, 486539265 };
+
+static int cbest_30[1023] = {
+ 1, 2, 541065219, 4, 811597826, 8, 405798913, 16, 32, 64, 743964675, 128, 913047554, 256, 541065218,
+ 270532609, 3, 512, 456523777, 541065217, 6, 811597827, 5, 676331523, 12, 129, 405798912, 258, 811597824, 9,
+ 24, 516, 1024, 202899456, 541065223, 541065283, 10, 1032, 946864130, 48, 2064, 101449728, 879230978, 17,
+ 4128, 541065227, 811597858, 96, 8256, 50724864, 811597830, 20, 16512, 405798915, 18, 33, 192, 2048, 33024,
+ 25362432, 405798929, 473432065, 541065235, 769327107, 66048, 439615489, 743964674, 811597834, 132096,
+ 12681216, 405798917, 34, 40, 65, 264192, 541065251, 528384, 6340608, 743964673, 811597842, 384, 1056768,
+ 405798921, 36, 66, 4096, 2113536, 3170304, 371982337, 743964683, 80, 4227072, 1585152, 8454144, 777781251,
+ 68, 130, 768, 16908288, 541065347, 743964679, 760872963, 913047555, 792576, 33816576, 811597890, 917340162,
+ 160, 67633152, 405798945, 72, 132, 257, 8192, 396288, 135266304, 913047552, 1536, 270532608, 811597954,
+ 913047558, 260, 198144, 405798977, 727056387, 136, 456523776, 541065475, 743964691, 193, 386, 99072, 514,
+ 3072, 405799041, 541065216, 929955842, 144, 320, 513, 16384, 49536, 743964707, 921501698, 997588994, 264,
+ 520, 772, 541065315, 676331522, 811598082, 7, 24768, 270532611, 456523779, 458670081, 913047562, 385, 6144,
+ 228261888, 338165761, 541065221, 541065411, 541065731, 743964739, 811597825, 13, 1028, 1544, 12384,
+ 270532641, 541065222, 541065282, 676331521, 14, 259, 270532613, 270532673, 405799169, 541065346, 811597874,
+ 904593410, 913047570, 946864131, 25, 272, 517, 770, 1025, 1040, 6192, 202899457, 541065473, 743964803,
+ 879230979, 11, 26, 131, 640, 1026, 1033, 3088, 12288, 405798914, 456523781, 541065226, 541065231, 541065281,
+ 676331539, 811597828, 811597859, 811597922, 811598338, 28, 49, 528, 2065, 3096, 101449729, 270532617,
+ 405798928, 464977921, 473432064, 541065225, 676331555, 710148099, 811597831, 913047586, 946864128, 161, 288,
+ 518, 1540, 2056, 4129, 32768, 114130944, 202899458, 202899464, 405798937, 439615488, 460750849, 498794497,
+ 541065243, 541065735, 541066243, 676331527, 811597955, 879230976, 22, 52, 97, 133, 1034, 1548, 6176, 8257,
+ 50724865, 101449732, 405798916, 456523785, 541065234, 541065299, 541066251, 769327106, 811597856, 811598080,
+ 21, 50, 56, 262, 322, 2066, 2080, 16513, 24576, 50724866, 101449730, 236716032, 270532625, 405798961,
+ 405799040, 405799425, 541065267, 541067283, 642514947, 811597835, 811597838, 879230986, 913047618, 946864146,
+ 1014497282, 19, 774, 2049, 3080, 4130, 33025, 25362433, 202899460, 202899520, 219807744, 541065287,
+ 541069347, 676331531, 743964931, 811597832, 811597866, 879230994, 980680706, 44, 98, 104, 137, 644, 1036,
+ 8258, 12352, 66049, 101449760, 405798919, 405798920, 456523793, 541065233, 541065250, 541073475, 591790083,
+ 769327105, 811597850, 811598342, 811598850, 946864134, 946864194, 112, 266, 387, 2052, 2068, 4112, 16514,
+ 132097, 12681217, 50724880, 118358016, 371982336, 405798933, 452296705, 541065239, 541081731, 743964672,
+ 811597843, 811598858, 879230982, 896139266, 913047682, 35, 41, 194, 524, 544, 1056, 1280, 2050, 4132, 6160,
+ 33026, 264193, 25362434, 25362440, 57065472, 109903872, 405798925, 405798931, 405800961, 439615493,
+ 473432067, 473432073, 541065291, 541067267, 541073411, 541098243, 545357827, 566427651, 770400259, 811597862,
+ 811599890, 811601922, 42, 88, 100, 145, 208, 4160, 8260, 16385, 24704, 66050, 528385, 6340609, 12681220,
+ 50724868, 405799171, 439615491, 439615497, 456523809, 541065249, 541131267, 743964682, 743964687, 811601954,
+ 946864138, 38, 224, 274, 2072, 16516, 132098, 1056769, 6340610, 12681218, 59179008, 101449736, 185991168,
+ 405799429, 405799937, 473432097, 507248641, 539017219, 541065410, 541197315, 743964681, 805355522, 811597840,
+ 811606082, 862322690, 37, 67, 196, 532, 576, 1288, 4097, 4136, 12320, 33028, 49152, 65536, 264194, 2113537,
+ 3170305, 25362436, 54951936, 202899472, 405798923, 405799945, 473432069, 490340353, 541065259, 541329411,
+ 743964677, 743964699, 743965187, 743965699, 765165571, 769327111, 777781250, 811597846, 811614338, 81, 176,
+ 388, 1048, 8224, 8264, 49408, 66052, 528386, 4227073, 50724872, 384663553, 402677761, 405798944, 405800977,
+ 456523841, 541593603, 743964678, 760872962, 782073859, 811598018, 811599874, 811630850, 815890434, 836960258,
+ 70, 290, 16520, 32770, 132100, 1056770, 1585153, 4227074, 8454145, 29589504, 92995584, 101449744, 270532705,
+ 270532737, 371982341, 405803041, 448069633, 536903683, 541065603, 542121987, 743964802, 760872961, 811597891,
+ 811597986, 811663874, 913047810, 917340163, 69, 84, 200, 548, 769, 4098, 4104, 4144, 8320, 24640, 33032,
+ 264196, 2113538, 3170306, 8454146, 16908289, 27475968, 28532736, 202899488, 371982339, 405799009, 405807169,
+ 541065255, 541065539, 541069315, 543178755, 544235523, 676331587, 743964929, 769327115, 771440643, 777781249,
+ 809517058, 810573826, 811729922, 76, 82, 416, 1064, 2112, 8272, 66056, 528388, 792577, 6340612, 8454148,
+ 16908290, 33816577, 404758529, 405798976, 405798993, 405815425, 456523905, 541065345, 777781255, 811862018,
+ 913047553, 913047556, 913048066, 946864162, 134, 448, 641, 2096, 2560, 2576, 16528, 132104, 1056772, 1585154,
+ 12681224, 14794752, 16908292, 46497792, 67633153, 405799105, 405831937, 407945217, 431161345, 542650371,
+ 549519363, 760872967, 811597888, 812126210, 879231010, 913047559, 917340160, 73, 392, 580, 776, 1088, 4100,
+ 8193, 16448, 33040, 65540, 98304, 264200, 396289, 2113540, 3170308, 13737984, 16908296, 25362448, 33816580,
+ 135266305, 371982401, 388890625, 405864961, 473432081, 541065987, 557973507, 676331651, 727056386, 742404099,
+ 743965191, 758824963, 769327123, 777781267, 807436290, 811598210, 812654594, 74, 140, 352, 1096, 1537, 8288,
+ 66064, 98816, 528392, 792578, 4227076, 6340616, 33816578, 33816584, 50724896, 380436481, 403718145,
+ 405286913, 405931009, 418480129, 439615505, 541065474, 541857795, 574881795, 727056385, 743964690, 743965707,
+ 162, 168, 261, 1538, 2128, 16544, 132112, 198145, 1056776, 1585156, 7397376, 12681232, 23248896, 33816592,
+ 67633154, 67633160, 101449792, 270532865, 405798947, 405798953, 406063105, 456524033, 541065379, 608698371,
+ 676331571, 743444483, 743964723, 743966739, 794689539, 811597906, 811597952, 879231042, 913047566, 152, 832,
+ 1152, 4192, 8194, 24577, 33056, 49280, 131072, 264208, 396290, 2113544, 3170312, 6868992, 14266368, 25362464,
+ 67633168, 135266306, 202899584, 371982345, 406327297, 541065351, 541461507, 743964715, 743968803, 756711427,
+ 769327139, 786235395, 813182978, 820051970, 915259394, 917876738, 929955843, 138, 896, 1160, 1282, 5152,
+ 8208, 66080, 99073, 131080, 528400, 792580, 4227080, 6340624, 50724928, 67633184, 135266320, 270532610,
+ 405799297, 439615521, 456523778, 458670080, 541066755, 541077507, 743964689, 743964706, 743964771, 743972931,
+ 811597894, 811598594, 828506114, 913047683, 917340166, 921501699, 923713538, 929955840, 946864258, 997588995,
+ 164, 268, 400, 515, 645, 784, 1290, 1552, 2192, 2580, 3073, 4224, 5160, 10320, 16576, 16640, 20640, 41280,
+ 49154, 82560, 132128, 165120, 198146, 330240, 660480, 1056784, 1320960, 1585160, 2641920, 3698688, 5283840,
+ 8454152, 10567680, 11624448, 12681248, 21135360, 42270720, 67633156, 84541440, 101449856, 135266336,
+ 169082880, 338165760, 405798949, 405798979, 405803009, 541065314, 541065479, 541065601, 541263363, 725008387,
+ 742924291, 743964695, 743981187, 777781259, 811598083, 812390402, 845414402, 879231106, 913047808, 921501696,
+ 991346690, 997588992, 321, 3074, 4256, 8196, 33088, 49537, 196608, 264224, 396292, 2113552, 3170320, 3434496,
+ 25362496, 135266308, 135266368, 270532640, 270532657, 363528193, 371982353, 410025985, 473432129, 541065355,
+ 541065537, 541065729, 676331779, 743964867, 743966723, 743997699, 745037827, 748257283, 760872971, 769327171,
+ 811603970, 912267266, 913047563, 195, 265, 280, 521, 704, 773, 3076, 5120, 8384, 66112, 99074, 197632,
+ 528416, 792584, 4227088, 6340640, 50724992, 270532612, 270532672, 405799168, 414253057, 439615553, 541065220,
+ 541065730, 541164291, 727056419, 743964705, 743964738, 744030723, 811597898, 811597958, 811606018, 811994114,
+ 912787458, 913047560, 917340170, 148, 209, 324, 522, 1664, 24769, 32896, 132160, 198148, 262160, 1056800,
+ 1585168, 1849344, 5812224, 8454160, 12681280, 270532736, 270532801, 270533121, 405798981, 405799043,
+ 405799681, 405801985, 422707201, 541065313, 541068291, 541089795, 676331520, 722894851, 727056391, 741916675,
+ 743708675, 744096771, 811599362, 913047578, 913048070, 913048578, 918396930, 929955850, 146, 225, 418, 1546,
+ 2176, 2564, 6145, 8200, 10304, 16386, 33152, 49538, 98560, 264256, 396296, 1717248, 2113568, 3170336,
+ 7133184, 16908304, 25362560, 135266312, 202899712, 228261889, 371982369, 405799057, 406591489, 456523780,
+ 473432193, 541065363, 541081603, 541114755, 735510531, 744228867, 760872979, 769327235, 811597875, 811598019,
+ 811598086, 811598208, 811795970, 879231002, 904593411, 913047571, 913047574, 913048586, 197, 336, 1029, 1545,
+ 1568, 1792, 12385, 49153, 66176, 98308, 99076, 792592, 4227104, 270532615, 270532616, 456524289, 464977920,
+ 541065409, 541065472, 541065602, 541065991, 676331547, 743964737, 811597962, 811597987, 811598336, 811610114,
+ 912527362, 913049618, 917340178, 946864386 };
+
+static int cbest_31[1023] = {
+ 1, 2, 1073741828, 4, 536870914, 8, 268435457, 16, 32, 1207959556, 64, 128, 603979778, 256, 512, 301989889,
+ 1024, 2048, 4096, 1224736772, 8192, 16384, 32768, 612368386, 65536, 131072, 262144, 306184193, 524288,
+ 1048576, 2097152, 4194304, 1226833924, 8388608, 16777216, 33554432, 67108864, 613416962, 134217728,
+ 268435456, 306708481, 536870912, 3, 1073741824, 1073741829, 5, 536870915, 1073741830, 6, 9, 18, 36, 72, 144,
+ 288, 576, 1152, 2304, 4608, 9216, 18432, 36864, 73728, 147456, 294912, 589824, 1179648, 2359296, 4718592,
+ 9437184, 18874368, 37748736, 75497472, 150994944, 301989888, 1610612742, 10, 17, 268435459, 536870918,
+ 603979776, 1073741836, 1227096068, 1342177285, 12, 33, 268435461, 536870922, 805306371, 1073741844,
+ 1207959552, 1207959557, 20, 34, 65, 268435465, 536870930, 1073741860, 1207959558, 24, 66, 129, 268435473,
+ 536870946, 603979779, 1073741892, 1744830470, 40, 68, 130, 257, 268435489, 536870978, 1073741956, 1207959564,
+ 1476395013, 1677721606, 48, 132, 258, 513, 268435521, 536871042, 603979782, 613548034, 1073742084,
+ 1207959572, 80, 136, 260, 514, 1025, 268435585, 301989891, 536871170, 603979786, 872415235, 1073742340,
+ 1207959588, 1375731717, 96, 264, 516, 1026, 2049, 268435713, 301989893, 536871426, 603979794, 838860803,
+ 1073742852, 1207959620, 160, 272, 520, 1028, 2050, 4097, 268435969, 301989897, 306184192, 536871938,
+ 603979810, 612368384, 1073743876, 1207959684, 1224736768, 1224736773, 1811939334, 192, 528, 1032, 2052, 4098,
+ 8193, 153092096, 268436481, 301989905, 536872962, 603979842, 1073745924, 1207959812, 1224736774, 320, 544,
+ 1040, 2056, 4100, 8194, 16385, 76546048, 268437505, 301989921, 306774017, 536875010, 603979906, 1073750020,
+ 1207960068, 1509949445, 1761607686, 384, 1056, 2064, 4104, 8196, 16386, 32769, 38273024, 268439553,
+ 301989953, 536879106, 603980034, 612368387, 1073758212, 1207960580, 1224736780, 1493172229, 640, 1088, 2080,
+ 4112, 8200, 16388, 32770, 65537, 19136512, 268443649, 301990017, 536887298, 603980290, 905969667, 1073774596,
+ 1207961604, 1224736788, 1686110214, 768, 2112, 4128, 8208, 16392, 32772, 65538, 131073, 9568256, 268451841,
+ 301990145, 536903682, 603980802, 612368390, 1073807364, 1207963652, 1224736804, 1280, 2176, 4160, 8224,
+ 16400, 32776, 65540, 131074, 262145, 4784128, 268468225, 301990401, 536936450, 603981826, 612368394,
+ 880803843, 1073872900, 1207967748, 1224736836, 1536, 4224, 8256, 16416, 32784, 65544, 131076, 262146, 524289,
+ 2392064, 268500993, 301990913, 306184195, 537001986, 603983874, 612368402, 1074003972, 1207975940,
+ 1224736900, 1379926021, 1828716550, 2560, 4352, 8320, 16448, 32800, 65552, 131080, 262148, 524290, 1048577,
+ 1196032, 268566529, 301991937, 306184197, 537133058, 603987970, 612368418, 843055107, 1074266116, 1207992324,
+ 1224737028, 1227128836, 1820327942, 3072, 8448, 16512, 32832, 65568, 131088, 262152, 524292, 598016, 1048578,
+ 2097153, 268697601, 301993985, 306184201, 537395202, 603996162, 612368450, 1074790404, 1208025092,
+ 1224737284, 1526726661, 5120, 8704, 16640, 32896, 65600, 131104, 262160, 299008, 524296, 1048580, 2097154,
+ 4194305, 268959745, 301998081, 306184209, 537919490, 604012546, 612368514, 1075838980, 1208090628,
+ 1224737796, 1226833920, 1226833925, 6144, 16896, 33024, 65664, 131136, 149504, 262176, 524304, 1048584,
+ 2097156, 4194306, 8388609, 269484033, 302006273, 306184225, 538968066, 604045314, 612368642, 613416960,
+ 1077936132, 1208221700, 1224738820, 1226833926, 1514143749, 10240, 17408, 33280, 65792, 74752, 131200,
+ 262208, 524320, 1048592, 2097160, 4194308, 8388610, 16777217, 270532609, 302022657, 306184257, 306708480,
+ 541065218, 604110850, 612368898, 914358275, 1082130436, 1208483844, 1224740868, 1763704838, 12288, 33792,
+ 37376, 66048, 131328, 262272, 524352, 1048608, 2097168, 4194312, 8388612, 16777218, 33554433, 272629761,
+ 302055425, 306184321, 545259522, 604241922, 612369410, 910163971, 1090519044, 1209008132, 1224744964,
+ 1226833932, 1495269381, 18688, 20480, 34816, 66560, 131584, 262400, 524416, 1048640, 2097184, 4194320,
+ 8388616, 16777220, 33554434, 67108865, 153354240, 276824065, 302120961, 306184449, 553648130, 604504066,
+ 612370434, 613416963, 613564418, 1107296260, 1210056708, 1224753156, 1226833940, 9344, 24576, 67584, 132096,
+ 262656, 524544, 1048704, 2097216, 4194336, 8388624, 16777224, 33554436, 67108866, 134217729, 285212673,
+ 302252033, 306184705, 570425346, 605028354, 612372482, 1140850692, 1212153860, 1224769540, 1226833956,
+ 1687158790, 1837105158, 4672, 40960, 69632, 133120, 263168, 524800, 1048832, 2097280, 4194368, 8388640,
+ 16777232, 33554440, 67108868, 76677120, 134217730, 302514177, 306185217, 606076930, 612376578, 613416966,
+ 1216348164, 1224802308, 1226833988, 2336, 49152, 135168, 264192, 525312, 1049088, 2097408, 4194432, 8388672,
+ 16777248, 33554448, 67108872, 134217732, 268435458, 303038465, 306186241, 335544321, 608174082, 612384770,
+ 613416970, 671088642, 881852419, 1224867844, 1226834052, 1342177284, 1830813702, 1168, 81920, 139264, 266240,
+ 526336, 1049600, 2097664, 4194560, 8388736, 16777280, 33554464, 38338560, 67108880, 134217736, 268435460,
+ 304087041, 306188289, 402653185, 536870913, 612401154, 613416978, 805306370, 1224998916, 1226834180,
+ 1241513988, 1530920965, 584, 98304, 270336, 528384, 1050624, 2098176, 4194816, 8388864, 16777344, 33554496,
+ 67108896, 134217744, 268435464, 306192385, 306708483, 612433922, 613416994, 620756994, 1225261060,
+ 1226834436, 1275068420, 1380450309, 1528823813, 1610612740, 1821376518, 292, 163840, 278528, 532480, 1052672,
+ 2099200, 4195328, 8389120, 16777472, 19169280, 33554560, 67108928, 134217760, 268435472, 306200577,
+ 306708485, 306782209, 310378497, 536870916, 536870928, 612499458, 613417026, 637534210, 843579395,
+ 1073741825, 1225785348, 1226834948, 146, 196608, 540672, 1056768, 2101248, 4196352, 8389632, 16777728,
+ 33554688, 67108992, 134217792, 268435488, 306216961, 306708489, 318767105, 536870920, 612630530, 613417090,
+ 805306369, 918552579, 1073741826, 1073741831, 1073741856, 1226835972, 1476395012, 7, 19, 37, 73, 145, 289,
+ 577, 1153, 2305, 4609, 9217, 18433, 36865, 73729, 147457, 294913, 327680, 557056, 589825, 1064960, 1179649,
+ 2105344, 2359297, 4198400, 4718593, 8390656, 9437185, 9584640, 16778240, 18874369, 33554944, 37748737,
+ 67109120, 75497473, 134217856, 150994945, 268435520, 306249729, 306708497, 612892674, 613417218, 738197506,
+ 1226838020, 1228931076, 1610612738, 1610612743, 11, 38, 74, 290, 578, 1154, 2306, 4610, 9218, 18434, 36866,
+ 73730, 147458, 294914, 393216, 589826, 1081344, 1179650, 2113536, 2359298, 4202496, 4718594, 8392704,
+ 9437186, 16779264, 18874370, 33555456, 37748738, 67109376, 75497474, 134217984, 150994946, 268435584,
+ 301989890, 306315265, 306708513, 369098753, 536870919, 536870944, 603979777, 613417474, 872415234, 915406851,
+ 1073741832, 1073741837, 1073741846, 1073741900, 1073741972, 1073742116, 1073742404, 1073742980, 1073744132,
+ 1073746436, 1073751044, 1073760260, 1073778692, 1073815556, 1073889284, 1074036740, 1074331652, 1074921476,
+ 1076101124, 1078460420, 1083179012, 1092616196, 1111490564, 1149239300, 1226842116, 1227096064, 1227096069,
+ 1233125380, 1342177281, 1375731716, 1514668037, 1744830468, 13, 22, 76, 148, 580, 1156, 2308, 4612, 9220,
+ 18436, 36868, 73732, 147460, 294916, 589828, 655360, 1114112, 1179652, 2129920, 2359300, 4210688, 4718596,
+ 4792320, 8396800, 9437188, 16781312, 18874372, 33556480, 37748740, 67109888, 75497476, 134218240, 150994948,
+ 268435712, 301989892, 306446337, 306708545, 436207617, 536870923, 536870950, 536870976, 536870986, 536871058,
+ 536871202, 536871490, 536872066, 536873218, 536875522, 536880130, 536889346, 536907778, 536944642, 537018370,
+ 537165826, 537460738, 538050562, 539230210, 541589506, 546308098, 555745282, 574619650, 613417986, 614465538,
+ 687865858, 838860802, 1073741838, 1073741840, 1073741845, 1207959553, 1226850308, 1227096070, 1342177287,
+ 1677721604, 14, 21, 26, 35, 44, 152, 296, 1160, 2312, 4616, 9224, 18440, 36872, 73736, 147464, 294920,
+ 589832, 786432, 1179656, 2162688, 2359304, 4227072, 4718600, 8404992, 9437192, 16785408, 18874376, 33558528,
+ 37748744, 67110912, 75497480, 134218752, 150994952, 268435463, 268435475, 268435493, 268435529, 268435601,
+ 268435745, 268435968, 268436033, 268436609, 268437761, 268440065, 268444673, 268453889, 268472321, 268509185,
+ 268582913, 268730369, 269025281, 269615105, 270794753, 273154049, 277872641, 287309825, 301989896, 306708609,
+ 343932929, 419430401, 536870931, 536871040, 603979780, 603979792, 613419010, 613548032, 616562690, 910688259,
+ 1073741861, 1207959554, 1207959559, 1207959584, 1226866692, 1258291204, 1610612750, 1763966982, 1839202310,
+ 1879048199, 25, 52, 67, 88, 304, 592, 2320, 4624, 9232, 18448, 36880, 73744, 147472, 294928, 589840, 1179664,
+ 1310720, 2228224, 2359312, 2396160, 4259840, 4718608, 8421376, 9437200, 16793600, 18874384, 33562624,
+ 37748752, 67112960, 75497488, 134219776, 150994960, 268435467, 268436480, 301989904, 306708737, 307232769,
+ 536870926, 536870947, 536871168, 603979784, 613421058, 805306375, 872415233, 1073741862, 1073741888,
+ 1073741893, 1226899460, 1227096076, 1291845636, 1342177293, 1495531525, 1610612758, 1744830466, 1744830471,
+ 1838153734, 41, 50, 69, 104, 131, 176, 608, 18464, 73760, 147488, 589856, 1179680, 1572864, 2359328, 4325376,
+ 4718624, 8454144, 9437216, 16809984, 18874400, 33570816, 67117056, 75497504, 134221824, 301989920, 306708993,
+ 306774016, 308281345, 536870934, 536870979, 629145602, 805306379, 1073741894, 1073741952, 1207959560,
+ 1207959700, 1207959844, 1207960708, 1207961860, 1207964164, 1207968772, 1207977988, 1207996420, 1208033284,
+ 1208107012, 1208549380, 1209139204, 1210318852, 1212678148, 1217396740, 1226964996, 1283457028, 1342177301,
+ 1476395009, 1509949444, 1610612774, 1677721607 };
+
+static unsigned int cbest_32[1023] = {
+ 1, 2149580803, 2, 4, 3224371202, 8, 1612185601, 16, 32, 2955673603, 64, 128, 256, 3627417602, 512, 1024,
+ 2149580802, 1074790401, 1813708801, 3, 2149580801, 2048, 3224371203, 6, 2686976003, 5, 12, 1612185600,
+ 3224371200, 2149580807, 9, 24, 806092800, 3761766402, 10, 1025, 4096, 3493068802, 48, 2050, 403046400,
+ 2149580811, 2149581315, 17, 4100, 3056435203, 3224371206, 96, 8200, 201523200, 20, 16400, 1612185603,
+ 2149580819, 3224371458, 18, 33, 192, 32800, 100761600, 1880883201, 2955673602, 3224371210, 8192, 65600,
+ 1746534401, 384, 131200, 50380800, 1612185605, 2149580835, 34, 40, 65, 262400, 1612185729, 2955673601,
+ 3224371218, 768, 524800, 25190400, 1049600, 1612185609, 2149580867, 36, 66, 129, 1536, 2099200, 12595200,
+ 1477836801, 3224371234, 80, 16384, 4198400, 3090022403, 6297600, 8396800, 1612185617, 2149580931, 2955673607,
+ 2955673667, 3022848003, 3627417603, 68, 130, 257, 16793600, 3224371266, 3677798402, 3148800, 33587200, 160,
+ 3072, 67174400, 1612185633, 2149581059, 2955673611, 3627417600, 72, 132, 258, 513, 1574400, 134348800,
+ 3224371330, 32768, 268697600, 2888499203, 787200, 537395200, 1612185665, 2955673619, 136, 260, 320, 514,
+ 6144, 1074790400, 1813708800, 3627417634, 393600, 3627417606, 3694592002, 2149581827, 2955673635, 3661004802,
+ 3963289602, 144, 264, 516, 1026, 1537, 3074, 196800, 2149580800, 3224371714, 640, 12288, 65536, 3627417610,
+ 98400, 1612185857, 2149581571, 2686976002, 272, 520, 1028, 2049, 6148, 906854400, 3224372226, 7, 49200,
+ 1074790403, 1813708803, 1838899201, 2149580805, 2149582339, 3224371201, 3593830402, 3627417618, 1280, 2052,
+ 3073, 24576, 1343488001, 1612186113, 1813708817, 2149580806, 2149582851, 2686976001, 2955673731, 13, 288,
+ 528, 1032, 12296, 24600, 3224371586, 3761766403, 14, 131072, 1074790405, 1813708805, 3493068803, 25, 4098,
+ 6146, 12300, 806092801, 1612186625, 1847296001, 2149580810, 2149580815, 2149581314, 2955673859, 3224371204,
+ 3224371970, 11, 26, 544, 1040, 4097, 24592, 49152, 453427200, 1074790657, 1612185602, 1830502401, 1981644801,
+ 2149580809, 2149581826, 2821324803, 3056435202, 3224371207, 3224373250, 3761766400, 28, 49, 2051, 2056, 4104,
+ 6150, 403046401, 1074790409, 1074790913, 1813708809, 1880883200, 2149580827, 2149582849, 2686976007,
+ 3493068800, 3627417666, 4101, 12292, 806092802, 1612185793, 1746534400, 2149580818, 2149581313, 2149584899,
+ 2686976131, 2955674115, 3224371459, 22, 52, 97, 576, 1027, 1056, 2560, 3075, 8201, 49184, 201523201,
+ 1612185604, 2149580851, 2552627203, 2686976259, 3056435201, 3224371211, 3224371214, 4030464002, 21, 50, 56,
+ 8196, 16401, 98304, 262144, 403046402, 940441600, 1074790417, 1612185728, 1612185985, 2149584903, 2686976011,
+ 3224371208, 3224372227, 3627417730, 3896115202, 19, 193, 4102, 24584, 32801, 100761601, 806092804, 806092864,
+ 873267200, 1612187649, 1796915201, 2149580817, 2149580834, 2149580899, 2149582338, 2149589003, 2351104003,
+ 2955674627, 3056435211, 3224371226, 3224371456, 3224373248, 3761766406, 44, 98, 104, 1029, 1088, 2064, 8193,
+ 8202, 8208, 65601, 98368, 201523202, 226713600, 403046432, 1612185607, 1612185608, 2149580823, 2149597203,
+ 2955673600, 3224371219, 3224375298, 3493068806, 3493068866, 3560243202, 3761766530, 112, 385, 2054, 8194,
+ 16402, 131201, 50380801, 201523216, 403046404, 470220800, 1074790433, 1477836800, 1612186624, 1813708833,
+ 2149580995, 2149581319, 2149613603, 2250342403, 2686976019, 3224371250, 3493068930, 3627417858, 35, 41, 194,
+ 1281, 4112, 32802, 49168, 196608, 262401, 100761602, 100761608, 436633600, 806092808, 806093312, 1612185613,
+ 1880883203, 2149580833, 2149580866, 2149581443, 2149588995, 2149646403, 2955673699, 3056435207, 3224375302,
+ 3761766410, 3761766914, 42, 88, 100, 208, 769, 1033, 1152, 2080, 8204, 16392, 65602, 196736, 524801,
+ 25190401, 50380804, 201523204, 403046656, 1074791169, 1612185616, 1746534403, 2149581187, 2149712003,
+ 2199961603, 3069030403, 3224371216, 3224371235, 3224371298, 3224379402, 3425894402, 3493068810, 38, 224, 386,
+ 2058, 2562, 5120, 16404, 131202, 524288, 1049601, 25190402, 50380802, 201523328, 235110400, 403046408,
+ 738918400, 1074790465, 1528217601, 1612185625, 1813708865, 2015232001, 2149580843, 2149581323, 2149843203,
+ 2686976035, 2955673605, 2955673795, 3073228803, 3090022402, 3224371222, 3224371462, 3224371522, 3224387602,
+ 3627418114, 37, 67, 196, 4108, 16416, 32804, 98336, 262402, 2099201, 12595201, 100761604, 100761664,
+ 218316800, 806092816, 1612185611, 1612185731, 1612189697, 1746534433, 1880883205, 1880883265, 1948057601,
+ 2147485699, 2149580865, 2149580930, 2150105603, 2174771203, 2955673606, 2955673666, 2955675651, 3022848002,
+ 3064832003, 3140403203, 3224371394, 3224404002, 3325132802, 3761766418, 81, 176, 416, 770, 1041, 2112, 5124,
+ 16385, 65604, 393216, 393472, 524802, 2099202, 4198401, 50380832, 113356800, 201523208, 1612185632,
+ 1612185649, 1612187651, 1746534405, 1746534465, 2150630403, 3223323650, 3224371232, 3224371267, 3224379394,
+ 3224436802, 3493068818, 3677798403, 70, 388, 448, 2066, 16408, 131204, 1049602, 4198402, 6297601, 8396801,
+ 25190416, 117555200, 369459200, 403046416, 1074790529, 1612185761, 1612189701, 1780121601, 1813708929,
+ 1880883457, 2149580839, 2149581331, 2151684099, 2162176003, 2686976067, 3090022401, 3224371466, 3224502402,
+ 3274752002, 3627418626, 69, 84, 131, 200, 1538, 4116, 16388, 32784, 32808, 196672, 262404, 4198404, 8396802,
+ 12595202, 12595208, 16793601, 109158400, 806092832, 1477836803, 1611661825, 1612185697, 1612185733,
+ 1612193801, 1880883209, 2149580883, 2149580929, 2149581058, 2149597187, 2153779203, 2686976387, 2955673610,
+ 2955673615, 2955673665, 3022848001, 3056435219, 3224371242, 3224633602, 3627417601, 3761766434, 76, 82, 352,
+ 772, 832, 1057, 2176, 4128, 8216, 8224, 16386, 65608, 524804, 786944, 3148801, 6297604, 8396804, 25190404,
+ 33587201, 1612185621, 1612185664, 1612202001, 1746534409, 2148535299, 2157977603, 2955673609, 2955674626,
+ 3221228546, 3222276098, 3224371264, 3224371331, 3224896002, 3249561602, 3493068834, 3677798400, 134, 161,
+ 392, 896, 2082, 32832, 131208, 786432, 1048576, 1049604, 3148802, 6297602, 8396808, 16793604, 50380808,
+ 58777600, 67174401, 184729600, 1477836833, 1612185619, 1612218401, 1712947201, 1813709057, 2149580871,
+ 2149581347, 2166374403, 2888499202, 2955673627, 2955675649, 3048038403, 3224371474, 3627417650, 3677798406,
+ 73, 133, 259, 1540, 4132, 10248, 32816, 262408, 393344, 1574401, 2099204, 12595204, 16793602, 16793608,
+ 54579200, 100761616, 134348801, 1477836805, 1545011201, 1611138049, 1612185737, 1612193793, 1612251201,
+ 1880883217, 2149581057, 2183168003, 2955411715, 2955673618, 2955673683, 2955677699, 3022848019, 3056435235,
+ 3090022435, 3224371238, 3224372738, 3226474498, 3236966402, 3761766466, 74, 140, 704, 776, 1089, 1664, 2304,
+ 3076, 4160, 8232, 10240, 32769, 65616, 524808, 1573888, 16793616, 25190408, 33587202, 33587208, 56678400,
+ 201523232, 268697601, 1511424001, 1610614273, 1612316801, 1662566401, 1746534417, 2149580963, 2149583875,
+ 2216755203, 2955673651, 3022848035, 3090022407, 3157196803, 3224371282, 3224371328, 3224372482, 3224387586,
+ 3228569602, 3627417604, 3627417635, 3627417698, 162, 168, 262, 400, 1792, 2114, 16432, 65568, 131216, 787201,
+ 1049608, 29388800, 33587216, 50380816, 67174402, 92364800, 403046464, 537395201, 1612185635, 1612185641,
+ 1612448001, 1813709313, 2149580875, 2149580935, 2149581379, 2149583363, 2151155203, 2283929603, 2888499201,
+ 2955673671, 2955677703, 3022848007, 3090022531, 3123609603, 3223848450, 3224371490, 3232768002, 3627417607,
+ 3627419650, 3694592003, 137, 152, 261, 321, 515, 1544, 4164, 6145, 262416, 786688, 1572864, 1574402, 2099208,
+ 27289600, 33587232, 67174416, 100761632, 134348802, 806092928, 1477836809, 1477837313, 1612185745,
+ 1612186369, 1637376001, 1880883233, 2149613571, 2418278403, 2955149827, 2955673617, 2955673634, 2955681803,
+ 3056435267, 3224371270, 3241164802, 3661004803, 3963289603, 138, 268, 784, 1153, 1408, 4224, 8264, 32770,
+ 32776, 65632, 65664, 393601, 524816, 3147776, 3148804, 4198408, 33587204, 67174432, 201523264, 268697602,
+ 1612185856, 1612186241, 2150368003, 2913689603, 2955673623, 2955690003, 3039641603, 3090022411, 3224371715,
+ 3257958402, 3493068994, 3627417632, 164, 518, 2178, 5121, 16448, 16464, 20496, 131232, 787202, 1049616,
+ 6297608, 14694400, 46182400, 67174404, 67174464, 134348832, 403046528, 537395202, 1074791425, 1612185637,
+ 1612185667, 1612187137, 1613237249, 1614284801, 1624780801, 1813709825, 2149580939, 2149581063, 2149581123,
+ 2149586947, 2686976515, 2888499219, 2954887939, 2955673675, 2955706403, 3022848011, 3224371362, 3224374274,
+ 3291545602, 3627286658, 3627417611, 3627417614, 3694592000, 145, 265, 280, 322, 517, 1552, 3080, 4228, 6152,
+ 32864, 196801, 262432, 1573376, 1574404, 2097152, 2099216, 12595216, 13644800, 134348804, 134348864,
+ 806093056, 1074790402, 1444249601, 1477836817, 1612185681, 1612201985, 1616384001, 1813708802, 1838899200,
+ 2149581570, 2149581825, 2149583873, 2149974403, 2955673633, 2955681795, 2955739203, 3056435331, 3224371274,
+ 3224371334, 3358720002, 3627417608, 3627417642, 3627418627, 3661004800, 3677798410, 3679901698, 3686195202,
+ 3761766658, 3963289600, 266, 336, 524, 641, 800, 3328, 4352, 5125, 8256, 8328, 10250, 12289, 20500, 32772,
+ 41000, 65537, 82000, 131136, 164000, 328000, 393602, 524832, 656000, 1312000, 2624000, 3148808, 4198416,
+ 5248000, 10496000, 20992000, 25190432, 28339200, 41984000, 83968000, 134348928, 167936000, 268697604,
+ 268697664, 335872000, 671744000, 1343488000, 1611924225, 1612186112, 1618483201, 1620582401, 1813708816,
+ 2149583361, 2955673987, 2955804803, 3006054403, 3090022419, 3224371712, 3224403970, 3493069058, 3627417626,
+ 3627419648, 3681996802, 3719782402, 148, 304, 2306, 3584, 12290, 16528, 98401, 131264, 787204, 1049632,
+ 3145728, 6297616, 8396816, 23091200, 50380864, 67174408, 268697728, 1074790785, 1612185669, 1813708825 };
diff --git a/src/osd/ErasureCodePluginJerasure/galois.c b/src/osd/ErasureCodePluginJerasure/galois.c
new file mode 100755
index 00000000000..be8be59affa
--- /dev/null
+++ b/src/osd/ErasureCodePluginJerasure/galois.c
@@ -0,0 +1,821 @@
+/* Galois.c
+ * James S. Plank
+
+Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure Coding Techniques
+
+Revision 1.2A
+May 24, 2011
+
+James S. Plank
+Department of Electrical Engineering and Computer Science
+University of Tennessee
+Knoxville, TN 37996
+plank@cs.utk.edu
+
+Copyright (c) 2011, James S. Plank
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ - Neither the name of the University of Tennessee nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "galois.h"
+
+/* Strategy tags stored in mult_type[] and dispatched on by
+ * galois_single_multiply() / galois_single_divide(). */
+#define NONE (10)     /* no strategy: multiply falls through to its error exit */
+#define TABLE (11)    /* direct lookup in galois_mult_tables / galois_div_tables */
+#define SHIFT (12)    /* galois_shift_multiply() */
+#define LOGS (13)     /* galois_log_tables / galois_ilog_tables lookup */
+#define SPLITW8 (14)  /* galois_split_w8_multiply() */
+
+/* prim_poly[w] is the polynomial (written in octal) that
+ * galois_create_log_tables() and galois_shift_multiply() XOR into a value
+ * whose left shift carried out of the low w bits; the result is then masked
+ * with nwm1[w]. Presumably these are the primitive polynomials of GF(2^w) --
+ * confirm against the Jerasure documentation. Entry 0 is a placeholder so the
+ * array can be indexed directly by w. */
+static int prim_poly[33] =
+{ 0,
+/* 1 */ 1,
+/* 2 */ 07,
+/* 3 */ 013,
+/* 4 */ 023,
+/* 5 */ 045,
+/* 6 */ 0103,
+/* 7 */ 0211,
+/* 8 */ 0435,
+/* 9 */ 01021,
+/* 10 */ 02011,
+/* 11 */ 04005,
+/* 12 */ 010123,
+/* 13 */ 020033,
+/* 14 */ 042103,
+/* 15 */ 0100003,
+/* 16 */ 0210013,
+/* 17 */ 0400011,
+/* 18 */ 01000201,
+/* 19 */ 02000047,
+/* 20 */ 04000011,
+/* 21 */ 010000005,
+/* 22 */ 020000003,
+/* 23 */ 040000041,
+/* 24 */ 0100000207,
+/* 25 */ 0200000011,
+/* 26 */ 0400000107,
+/* 27 */ 01000000047,
+/* 28 */ 02000000011,
+/* 29 */ 04000000005,
+/* 30 */ 010040000007,
+/* 31 */ 020000000011,
+/* 32 */ 00020000007 }; /* Really 40020000007, but we're omitting the high order bit */
+
+/* mult_type[w] names the strategy galois_single_multiply() and
+ * galois_single_divide() use for word size w: TABLE for w = 1..9,
+ * LOGS for w = 10..22, SHIFT for w = 23..31 and SPLITW8 for w = 32.
+ * Entry 0 is NONE. */
+static int mult_type[33] =
+{ NONE,
+/* 1 */ TABLE,
+/* 2 */ TABLE,
+/* 3 */ TABLE,
+/* 4 */ TABLE,
+/* 5 */ TABLE,
+/* 6 */ TABLE,
+/* 7 */ TABLE,
+/* 8 */ TABLE,
+/* 9 */ TABLE,
+/* 10 */ LOGS,
+/* 11 */ LOGS,
+/* 12 */ LOGS,
+/* 13 */ LOGS,
+/* 14 */ LOGS,
+/* 15 */ LOGS,
+/* 16 */ LOGS,
+/* 17 */ LOGS,
+/* 18 */ LOGS,
+/* 19 */ LOGS,
+/* 20 */ LOGS,
+/* 21 */ LOGS,
+/* 22 */ LOGS,
+/* 23 */ SHIFT,
+/* 24 */ SHIFT,
+/* 25 */ SHIFT,
+/* 26 */ SHIFT,
+/* 27 */ SHIFT,
+/* 28 */ SHIFT,
+/* 29 */ SHIFT,
+/* 30 */ SHIFT,
+/* 31 */ SHIFT,
+/* 32 */ SPLITW8 };
+
+/* nw[w] = 1 << w for w = 1..31; entries 0 and 32 hold 0 and -1.
+ * The table builders use it as the element count when sizing and filling
+ * tables, and galois_create_log_tables() uses it as the carry-out bit test
+ * after a left shift.
+ * NOTE(review): (1 << 31) does not fit in a signed 32-bit int; the builders
+ * visible here only read entries up to w = 30 -- confirm no other user
+ * depends on entry 31. */
+static int nw[33] = { 0, (1 << 1), (1 << 2), (1 << 3), (1 << 4),
+(1 << 5), (1 << 6), (1 << 7), (1 << 8), (1 << 9), (1 << 10),
+(1 << 11), (1 << 12), (1 << 13), (1 << 14), (1 << 15), (1 << 16),
+(1 << 17), (1 << 18), (1 << 19), (1 << 20), (1 << 21), (1 << 22),
+(1 << 23), (1 << 24), (1 << 25), (1 << 26), (1 << 27), (1 << 28),
+(1 << 29), (1 << 30), (1 << 31), -1 };
+
+/* nwm1[w] = (1 << w) - 1, i.e. a mask of the low w bits (entry 0 is 0,
+ * entry 32 is written as 0xffffffff). Besides masking, it is used by
+ * galois_create_log_tables() as the loop bound for the log cycle, as the
+ * "not yet assigned" marker in the log table, and as the offset applied to
+ * the inverse-log table pointer. */
+static int nwm1[33] = { 0, (1 << 1)-1, (1 << 2)-1, (1 << 3)-1, (1 << 4)-1,
+(1 << 5)-1, (1 << 6)-1, (1 << 7)-1, (1 << 8)-1, (1 << 9)-1, (1 << 10)-1,
+(1 << 11)-1, (1 << 12)-1, (1 << 13)-1, (1 << 14)-1, (1 << 15)-1, (1 << 16)-1,
+(1 << 17)-1, (1 << 18)-1, (1 << 19)-1, (1 << 20)-1, (1 << 21)-1, (1 << 22)-1,
+(1 << 23)-1, (1 << 24)-1, (1 << 25)-1, (1 << 26)-1, (1 << 27)-1, (1 << 28)-1,
+(1 << 29)-1, (1 << 30)-1, 0x7fffffff, 0xffffffff };
+
+/* Per-w table caches, indexed by w. An entry stays NULL until the matching
+ * builder has run for that w. */
+
+/* Log table for each w; allocated by galois_create_log_tables(). */
+static int *galois_log_tables[33] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL };
+
+/* Inverse-log table for each w; allocated by galois_create_log_tables(),
+ * which stores the pointer advanced by nwm1[w] so that negative indices
+ * (log differences) are valid. */
+static int *galois_ilog_tables[33] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL };
+
+/* Full product table for each w, indexed by (x << w) | y; allocated by
+ * galois_create_mult_tables(). */
+static int *galois_mult_tables[33] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL };
+
+/* Full quotient table for each w, indexed by (x << w) | y, holding -1 where
+ * y == 0; allocated by galois_create_mult_tables(). */
+static int *galois_div_tables[33] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL };
+
+/* Special case for w = 32 */
+
+/* galois_single_multiply() treats galois_split_w8[0] == NULL as "not built
+ * yet" and then calls galois_create_split_w8_tables(). */
+static int *galois_split_w8[7] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL };
+
+/* Build the log and inverse-log tables for word size w.
+ *
+ * Does nothing if the log table for w already exists.
+ *
+ * The log table has nw[w] entries; log[0] keeps the marker value nwm1[w].
+ * The inverse-log table is allocated with 3*nw[w] entries and filled with
+ * three back-to-back copies of the nwm1[w]-long cycle; the stored pointer is
+ * then advanced by nwm1[w], so indices from -nwm1[w] up to 2*nwm1[w]-1 are
+ * valid. That lets callers add or subtract two logs without reducing the
+ * result. Because the stored pointer is offset, it must not be handed to
+ * free() as-is.
+ *
+ * Returns 0 on success (or if already built), -1 if w is outside 1..30 or an
+ * allocation fails. Prints a diagnostic and calls exit(1) if a value repeats
+ * before the cycle is complete.
+ */
+int galois_create_log_tables(int w)
+{
+  int j, b;
+
+  /* w indexes nw[], nwm1[], prim_poly[] and the table caches, so reject
+     anything outside 1..30 before touching them. w == 0 would otherwise
+     produce zero-length tables that later lookups read out of bounds. */
+  if (w < 1 || w > 30) return -1;
+  if (galois_log_tables[w] != NULL) return 0;
+  galois_log_tables[w] = (int *) malloc(sizeof(int)*nw[w]);
+  if (galois_log_tables[w] == NULL) return -1;
+
+  galois_ilog_tables[w] = (int *) malloc(sizeof(int)*nw[w]*3);
+  if (galois_ilog_tables[w] == NULL) {
+    free(galois_log_tables[w]);
+    galois_log_tables[w] = NULL;
+    return -1;
+  }
+
+  /* nwm1[w] in the log table means "no log assigned yet". */
+  for (j = 0; j < nw[w]; j++) {
+    galois_log_tables[w][j] = nwm1[w];
+    galois_ilog_tables[w][j] = 0;
+  }
+
+  /* Walk b = 1, 2, 4, ... reducing with prim_poly[w] on carry-out, and
+     record log[b] = j / ilog[j] = b at each step. */
+  b = 1;
+  for (j = 0; j < nwm1[w]; j++) {
+    if (galois_log_tables[w][b] != nwm1[w]) {
+      fprintf(stderr, "Galois_create_log_tables Error: j=%d, b=%d, B->J[b]=%d, J->B[j]=%d (0%o)\n",
+              j, b, galois_log_tables[w][b], galois_ilog_tables[w][j], (b << 1) ^ prim_poly[w]);
+      exit(1);
+    }
+    galois_log_tables[w][b] = j;
+    galois_ilog_tables[w][j] = b;
+    b = b << 1;
+    if (b & nw[w]) b = (b ^ prim_poly[w]) & nwm1[w];
+  }
+
+  /* Replicate the cycle twice more, then shift the base pointer so that the
+     middle copy starts at index 0. */
+  for (j = 0; j < nwm1[w]; j++) {
+    galois_ilog_tables[w][j+nwm1[w]] = galois_ilog_tables[w][j];
+    galois_ilog_tables[w][j+nwm1[w]*2] = galois_ilog_tables[w][j];
+  }
+  galois_ilog_tables[w] += nwm1[w];
+  return 0;
+}
+
+/* Multiply x by y for word size w using the log / inverse-log tables, which
+ * must already exist for w. Returns 0 when either operand is 0.
+ * The sum of the two logs is used as-is: the inverse-log table is
+ * replicated, so no reduction by nwm1[w] is required. */
+int galois_logtable_multiply(int x, int y, int w)
+{
+  const int *logs;
+
+  if (x == 0 || y == 0) return 0;
+
+  logs = galois_log_tables[w];
+  return galois_ilog_tables[w][logs[x] + logs[y]];
+}
+
+/* Divide x by y for word size w using the log / inverse-log tables, which
+ * must already exist for w. Returns -1 when y is 0 and 0 when x is 0.
+ * The log difference may be negative; the inverse-log table pointer is
+ * offset so that such indices are valid. */
+int galois_logtable_divide(int x, int y, int w)
+{
+  const int *logs;
+
+  if (y == 0) return -1;
+  if (x == 0) return 0;
+
+  logs = galois_log_tables[w];
+  return galois_ilog_tables[w][logs[x] - logs[y]];
+}
+
+/* Build the full multiplication and division tables for word size w.
+ *
+ * Does nothing if the multiplication table for w already exists. Each table
+ * holds nw[w] * nw[w] ints laid out row by row, so the entry for (x, y)
+ * is at index x * nw[w] + y, i.e. (x << w) | y. Division entries with
+ * y == 0 hold -1. The log tables are created first if they are missing.
+ *
+ * Returns 0 on success (or if already built), -1 if w is outside 1..13, an
+ * allocation fails, or the log tables cannot be created.
+ */
+int galois_create_mult_tables(int w)
+{
+  int j, x, y, logx;
+
+  /* w indexes the static arrays; w == 0 would also allocate zero bytes and
+     then write entry 0 below, so reject everything outside 1..13. */
+  if (w < 1 || w >= 14) return -1;
+
+  if (galois_mult_tables[w] != NULL) return 0;
+  galois_mult_tables[w] = (int *) malloc(sizeof(int) * nw[w] * nw[w]);
+  if (galois_mult_tables[w] == NULL) return -1;
+
+  galois_div_tables[w] = (int *) malloc(sizeof(int) * nw[w] * nw[w]);
+  if (galois_div_tables[w] == NULL) {
+    free(galois_mult_tables[w]);
+    galois_mult_tables[w] = NULL;
+    return -1;
+  }
+  if (galois_log_tables[w] == NULL) {
+    if (galois_create_log_tables(w) < 0) {
+      free(galois_mult_tables[w]);
+      free(galois_div_tables[w]);
+      galois_mult_tables[w] = NULL;
+      galois_div_tables[w] = NULL;
+      return -1;
+    }
+  }
+
+  /* Set mult/div tables for x = 0 */
+  j = 0;
+  galois_mult_tables[w][j] = 0;   /* y = 0 */
+  galois_div_tables[w][j] = -1;
+  j++;
+  for (y = 1; y < nw[w]; y++) {   /* y > 0 */
+    galois_mult_tables[w][j] = 0;
+    galois_div_tables[w][j] = 0;
+    j++;
+  }
+
+  /* Remaining rows: j keeps running, so row x starts at x * nw[w]. */
+  for (x = 1; x < nw[w]; x++) {   /* x > 0 */
+    galois_mult_tables[w][j] = 0; /* y = 0 */
+    galois_div_tables[w][j] = -1;
+    j++;
+    logx = galois_log_tables[w][x];
+    for (y = 1; y < nw[w]; y++) { /* y > 0 */
+      galois_mult_tables[w][j] = galois_ilog_tables[w][logx+galois_log_tables[w][y]];
+      galois_div_tables[w][j] = galois_ilog_tables[w][logx-galois_log_tables[w][y]];
+      j++;
+    }
+  }
+  return 0;
+}
+
+/* Return the inverse-log table entry at index value for word size w,
+ * building the log tables on first use. Prints an error and calls exit(1)
+ * if the tables cannot be created. */
+int galois_ilog(int value, int w)
+{
+  if (galois_ilog_tables[w] == NULL && galois_create_log_tables(w) < 0) {
+    fprintf(stderr, "Error: galois_ilog - w is too big. Sorry\n");
+    exit(1);
+  }
+  return galois_ilog_tables[w][value];
+}
+
+/* Return the log table entry for value at word size w, building the log
+ * tables on first use. Prints an error and calls exit(1) if the tables
+ * cannot be created. */
+int galois_log(int value, int w)
+{
+  if (galois_log_tables[w] == NULL && galois_create_log_tables(w) < 0) {
+    fprintf(stderr, "Error: galois_log - w is too big. Sorry\n");
+    exit(1);
+  }
+  return galois_log_tables[w][value];
+}
+
+
+/* Multiply x by y for word size w by shift-and-XOR, without any tables.
+ *
+ * Pass 1 stores y, y*2, y*4, ... in shifted[]; each doubling is a left shift
+ * that is reduced with prim_poly[w] and masked with nwm1[w] whenever bit
+ * w-1 was set beforehand. Pass 2 XORs together the low w bits of shifted[i]
+ * for every bit i that is set in x.
+ *
+ * All bit work is done on unsigned values: left-shifting a signed int whose
+ * result does not fit is undefined behaviour, which the signed version hit
+ * for w = 31 and w = 32. The bit patterns produced are unchanged.
+ *
+ * Returns the product; 0 if w <= 0. For w = 32 the result is converted from
+ * unsigned to int, so products with the top bit set come back negative on
+ * the usual two's-complement targets.
+ */
+int galois_shift_multiply(int x, int y, int w)
+{
+  unsigned int shifted[33];
+  unsigned int acc, cur, low_bits;
+  int i;
+
+  /* Nothing to iterate over; also keeps the w-1 shift below well defined. */
+  if (w <= 0) return 0;
+
+  low_bits = (unsigned int) nwm1[w];
+
+  cur = (unsigned int) y;
+  for (i = 0; i < w; i++) {
+    shifted[i] = cur;
+    if (cur & (1u << (w-1))) {
+      cur = ((cur << 1) ^ (unsigned int) prim_poly[w]) & low_bits;
+    } else {
+      cur = cur << 1;
+    }
+  }
+
+  acc = 0;
+  for (i = 0; i < w; i++) {
+    /* Only the low w bits of each term contribute to the product. */
+    if ((unsigned int) x & (1u << i)) acc ^= shifted[i] & low_bits;
+  }
+  return (int) acc;
+}
+
+/* Multiply x by y for word size w, using the strategy mult_type[w] selects
+ * and building any missing tables on first use.
+ *
+ * Returns 0 when either operand is 0. Prints an error and calls exit(1) if
+ * the required tables cannot be built or mult_type[w] names no strategy.
+ */
+int galois_single_multiply(int x, int y, int w)
+{
+  if (x == 0 || y == 0) return 0;
+
+  switch (mult_type[w]) {
+  case TABLE:
+    if (galois_mult_tables[w] == NULL && galois_create_mult_tables(w) < 0) {
+      fprintf(stderr, "ERROR -- cannot make multiplication tables for w=%d\n", w);
+      exit(1);
+    }
+    return galois_mult_tables[w][(x<<w)|y];
+
+  case LOGS:
+    if (galois_log_tables[w] == NULL && galois_create_log_tables(w) < 0) {
+      fprintf(stderr, "ERROR -- cannot make log tables for w=%d\n", w);
+      exit(1);
+    }
+    /* The inverse-log table is replicated, so the raw sum is a valid index. */
+    return galois_ilog_tables[w][galois_log_tables[w][x] + galois_log_tables[w][y]];
+
+  case SPLITW8:
+    if (galois_split_w8[0] == NULL && galois_create_split_w8_tables() < 0) {
+      fprintf(stderr, "ERROR -- cannot make log split_w8_tables for w=%d\n", w);
+      exit(1);
+    }
+    return galois_split_w8_multiply(x, y);
+
+  case SHIFT:
+    return galois_shift_multiply(x, y, w);
+
+  default:
+    break;
+  }
+
+  fprintf(stderr, "Galois_single_multiply - no implementation for w=%d\n", w);
+  exit(1);
+}
+
+/* Direct multiplication-table lookup: entry (x<<w)|y of
+   galois_mult_tables[w].  The table is not created here, so it must already
+   exist. */
+int galois_multtable_multiply(int x, int y, int w)
+{
+  int *table = galois_mult_tables[w];
+  int slot = (x<<w)|y;
+
+  return table[slot];
+}
+
+/* Divides a by b for word size w.
+   Returns -1 when b == 0 and 0 when a == 0 (the TABLE path gets the same
+   answers from the division table itself).
+   TABLE - division table lookup at (a<<w)|b, built on first use
+   LOGS  - inverse log of log(a) - log(b), tables built on first use
+   other - a multiplied by the inverse of b
+   Exits the process if a required table cannot be created. */
+int galois_single_divide(int a, int b, int w)
+{
+  int log_diff;
+  int inv_b;
+
+  if (mult_type[w] == TABLE) {
+    if (galois_div_tables[w] == NULL) {
+      if (galois_create_mult_tables(w) < 0) {
+        fprintf(stderr, "ERROR -- cannot make multiplication tables for w=%d\n", w);
+        exit(1);
+      }
+    }
+    return galois_div_tables[w][(a<<w)|b];
+  }
+
+  /* Both remaining methods share the zero-operand conventions. */
+  if (b == 0) return -1;
+  if (a == 0) return 0;
+
+  if (mult_type[w] == LOGS) {
+    if (galois_log_tables[w] == NULL) {
+      if (galois_create_log_tables(w) < 0) {
+        fprintf(stderr, "ERROR -- cannot make log tables for w=%d\n", w);
+        exit(1);
+      }
+    }
+    log_diff = galois_log_tables[w][a] - galois_log_tables[w][b];
+    return galois_ilog_tables[w][log_diff];
+  }
+
+  /* Every other multiplication type: multiply by the inverse of b. */
+  inv_b = galois_inverse(b, w);
+  return galois_single_multiply(a, inv_b, w);
+}
+
+/* Table-free division: a times the shift-computed inverse of b.
+   Returns -1 when b == 0 and 0 when a == 0. */
+int galois_shift_divide(int a, int b, int w)
+{
+  if (b == 0) return -1;
+  if (a == 0) return 0;
+
+  return galois_shift_multiply(a, galois_shift_inverse(b, w), w);
+}
+
+/* Direct division-table lookup: entry (x<<w)|y of galois_div_tables[w].
+   The table is not created here, so it must already exist. */
+int galois_multtable_divide(int x, int y, int w)
+{
+  int *table = galois_div_tables[w];
+  int slot = (x<<w)|y;
+
+  return table[slot];
+}
+
+/* Multiplies each of the nbytes bytes of region by multby using the w=8
+   multiplication table (built on first use; the process exits if that
+   fails).
+
+   - r2 == NULL        : products overwrite region.
+   - r2 != NULL, !add  : products are stored in r2.
+   - r2 != NULL, add   : products are XORed into r2.
+
+   The XOR path works a long word at a time, so r2 must be long-word
+   aligned; a trailing partial word is handled byte by byte so that nothing
+   beyond nbytes is read or written. */
+void galois_w08_region_multiply(char *region,      /* Region to multiply */
+                                int multby,        /* Number to multiply by */
+                                int nbytes,        /* Number of bytes in region */
+                                char *r2,          /* If r2 != NULL, products go here */
+                                int add)
+{
+  unsigned char *ur1, *ur2, *lp;
+  unsigned long l, *lp2;
+  int *row;
+  int i, j, sol, full;
+
+  ur1 = (unsigned char *) region;
+  ur2 = (r2 == NULL) ? ur1 : (unsigned char *) r2;
+
+  if (galois_mult_tables[8] == NULL) {
+    if (galois_create_mult_tables(8) < 0) {
+      fprintf(stderr, "galois_08_region_multiply -- couldn't make multiplication tables\n");
+      exit(1);
+    }
+  }
+
+  /* Row of the table holding multby * v for every byte value v. */
+  row = galois_mult_tables[8] + multby * nw[8];
+
+  if (r2 == NULL || !add) {
+    for (i = 0; i < nbytes; i++) ur2[i] = row[ur1[i]];
+  } else {
+    sol = sizeof(long);
+    lp = (unsigned char *) &l;
+    full = nbytes - (nbytes % sol);   /* bytes covered by whole long words */
+
+    /* Build one long of products in l, then XOR it into r2 in one go. */
+    for (i = 0; i < full; i += sol) {
+      lp2 = (unsigned long *) (ur2+i);
+      for (j = 0; j < sol; j++) lp[j] = row[ur1[i+j]];
+      *lp2 = (*lp2) ^ l;
+    }
+
+    /* Leftover bytes that do not fill a long word. */
+    for (; i < nbytes; i++) ur2[i] ^= row[ur1[i]];
+  }
+  return;
+}
+
+/* Multiplies each 16-bit word of region by multby using the w=16 log and
+   inverse-log tables (built on first use; the process exits if that fails).
+
+   - r2 == NULL        : products overwrite region.
+   - r2 != NULL, !add  : products are stored in r2.
+   - r2 != NULL, add   : products are XORed into r2.
+
+   multby == 0 makes every product zero: the destination is cleared in the
+   two overwrite cases and left alone in the XOR case.  The XOR path works a
+   long word at a time, so r2 must be long-word aligned; trailing 16-bit
+   words that do not fill a long are handled one at a time so that nothing
+   beyond nbytes is touched. */
+void galois_w16_region_multiply(char *region,      /* Region to multiply */
+                                int multby,        /* Number to multiply by */
+                                int nbytes,        /* Number of bytes in region */
+                                char *r2,          /* If r2 != NULL, products go here */
+                                int add)
+{
+  unsigned short *ur1, *ur2, *lp;
+  unsigned long l, *lp2;
+  int *logt, *ilogt;
+  int i, j, log1, sol, full, overwrite;
+
+  ur1 = (unsigned short *) region;
+  ur2 = (r2 == NULL) ? ur1 : (unsigned short *) r2;
+  nbytes /= 2;                       /* from here on: number of 16-bit words */
+  overwrite = (r2 == NULL || !add);
+
+  if (multby == 0) {
+    if (overwrite) {
+      for (i = 0; i < nbytes; i++) ur2[i] = 0;
+    }
+    return;
+  }
+
+  if (galois_log_tables[16] == NULL) {
+    if (galois_create_log_tables(16) < 0) {
+      fprintf(stderr, "galois_16_region_multiply -- couldn't make log tables\n");
+      exit(1);
+    }
+  }
+  logt = galois_log_tables[16];
+  ilogt = galois_ilog_tables[16];
+  log1 = logt[multby];
+
+  if (overwrite) {
+    for (i = 0; i < nbytes; i++) {
+      /* Zero has no logarithm, so it is special-cased. */
+      ur2[i] = (ur1[i] == 0) ? 0 : ilogt[logt[ur1[i]] + log1];
+    }
+  } else {
+    sol = sizeof(long)/2;            /* 16-bit words per long */
+    lp = (unsigned short *) &l;
+    full = nbytes - (nbytes % sol);  /* words covered by whole longs */
+
+    /* Build one long of products in l, then XOR it into r2 in one go. */
+    for (i = 0; i < full; i += sol) {
+      lp2 = (unsigned long *) (ur2+i);
+      for (j = 0; j < sol; j++) {
+        lp[j] = (ur1[i+j] == 0) ? 0 : ilogt[logt[ur1[i+j]] + log1];
+      }
+      *lp2 = (*lp2) ^ l;
+    }
+
+    /* Leftover words that do not fill a long. */
+    for (; i < nbytes; i++) {
+      if (ur1[i] != 0) ur2[i] ^= ilogt[logt[ur1[i]] + log1];
+    }
+  }
+  return;
+}
+
+/* Inverts a rows x rows bit matrix over GF(2).  Row r of the matrix is the
+   int mat[r], with bit c standing for column c.  The inverse is written to
+   inv in the same layout.  mat is modified in the process.  Prints an error
+   and exits when the matrix is not invertible. */
+
+void galois_invert_binary_matrix(int *mat, int *inv, int rows)
+{
+  int pivot, r, bit;
+  int held;
+
+  /* inv starts out as the identity */
+  for (r = 0; r < rows; r++) inv[r] = (1 << r);
+
+  /* Forward pass: reduce mat to upper triangular form */
+
+  for (pivot = 0; pivot < rows; pivot++) {
+    bit = (1 << pivot);
+
+    /* A zero on the diagonal needs a row swap with some lower row that has
+       this column set; if there is none the matrix is singular */
+
+    if (!(mat[pivot] & bit)) {
+      r = pivot+1;
+      while (r < rows && !(mat[r] & bit)) r++;
+      if (r == rows) {
+        fprintf(stderr, "galois_invert_matrix: Matrix not invertible!!\n");
+        exit(1);
+      }
+      held = mat[pivot]; mat[pivot] = mat[r]; mat[r] = held;
+      held = inv[pivot]; inv[pivot] = inv[r]; inv[r] = held;
+    }
+
+    /* Clear this column in every row below the pivot */
+    for (r = pivot+1; r < rows; r++) {
+      if (mat[r] & bit) {
+        mat[r] ^= mat[pivot];
+        inv[r] ^= inv[pivot];
+      }
+    }
+  }
+
+  /* Backward pass: fold each pivot row into the rows above it.  Only inv
+     needs updating; mat is not kept consistent from here on */
+
+  for (pivot = rows-1; pivot >= 0; pivot--) {
+    bit = (1 << pivot);
+    for (r = 0; r < pivot; r++) {
+      if (mat[r] & bit) inv[r] ^= inv[pivot];
+    }
+  }
+}
+
+/* Multiplicative inverse of y for word size w; returns -1 for y == 0.
+   SHIFT and SPLITW8 word sizes use the matrix-based shift inverse, every
+   other multiplication type computes 1 / y. */
+int galois_inverse(int y, int w)
+{
+  int use_shift;
+
+  if (y == 0) return -1;
+
+  use_shift = (mult_type[w] == SHIFT || mult_type[w] == SPLITW8);
+  return use_shift ? galois_shift_inverse(y, w) : galois_single_divide(1, y, w);
+}
+
+/* Inverse of y for word size w without lookup tables.  Builds a w x w bit
+   matrix whose row i is y after i shift-and-reduce steps, inverts it with
+   galois_invert_binary_matrix() and returns row 0 of the inverse. */
+int galois_shift_inverse(int y, int w)
+{
+  int mat2[32], inv2[32];
+  int row, top_bit_set;
+
+  for (row = 0; row < w; row++) {
+    mat2[row] = y;
+
+    /* Reduce after the shift only when the top bit was set beforehand. */
+    top_bit_set = ((y & nw[w-1]) != 0);
+    y = y << 1;
+    if (top_bit_set) y = (y ^ prim_poly[w]) & nwm1[w];
+  }
+
+  galois_invert_binary_matrix(mat2, inv2, w);
+
+  return inv2[0];
+}
+
+/* Returns the multiplication table for w, creating the mult/div tables on
+   first use.  Returns NULL when creation reports an error. */
+int *galois_get_mult_table(int w)
+{
+  if (galois_mult_tables[w] != NULL) return galois_mult_tables[w];
+
+  if (galois_create_mult_tables(w) != 0) return NULL;
+  return galois_mult_tables[w];
+}
+
+/* Returns the division table for w.  The presence of the multiplication
+   table is what decides whether the mult/div tables get created first.
+   Returns NULL when creation reports an error. */
+int *galois_get_div_table(int w)
+{
+  int need_tables = (galois_mult_tables[w] == NULL);
+
+  if (need_tables && galois_create_mult_tables(w) != 0) return NULL;
+  return galois_div_tables[w];
+}
+
+/* Returns the log table for w, creating the log tables on first use.
+   Returns NULL when creation reports an error. */
+int *galois_get_log_table(int w)
+{
+  if (galois_log_tables[w] != NULL) return galois_log_tables[w];
+
+  if (galois_create_log_tables(w) != 0) return NULL;
+  return galois_log_tables[w];
+}
+
+/* Returns the inverse-log table for w, creating the log tables on first
+   use.  Returns NULL when creation reports an error. */
+int *galois_get_ilog_table(int w)
+{
+  int need_tables = (galois_ilog_tables[w] == NULL);
+
+  if (need_tables && galois_create_log_tables(w) != 0) return NULL;
+  return galois_ilog_tables[w];
+}
+
+/* Multiplies each 32-bit word of region by multby using the split 8-bit
+   tables (built on first use; the process exits if that fails).
+
+   - r2 == NULL        : products overwrite region.
+   - r2 != NULL, !add  : products are stored in r2.
+   - r2 != NULL, add   : products are XORed into r2.
+
+   Each product is the XOR of 16 lookups: one for every pairing of a byte
+   of multby (index i) with a byte of the source word (index j), taken from
+   table galois_split_w8[i+j]. */
+void galois_w32_region_multiply(char *region,      /* Region to multiply */
+                                int multby,        /* Number to multiply by */
+                                int nbytes,        /* Number of bytes in region */
+                                char *r2,          /* If r2 != NULL, products go here */
+                                int add)
+{
+  unsigned int *ur1, *ur2;
+  int i, j, a, b, accumulator, k, overwrite;
+  int acache[4];
+
+  ur1 = (unsigned int *) region;
+  ur2 = (r2 == NULL) ? ur1 : (unsigned int *) r2;
+  nbytes /= sizeof(int);             /* from here on: number of 32-bit words */
+  overwrite = (r2 == NULL || !add);
+
+  if (galois_split_w8[0] == NULL) {
+    if (galois_create_split_w8_tables() < 0) {
+      fprintf(stderr, "galois_32_region_multiply -- couldn't make split multiplication tables\n");
+      exit(1);
+    }
+  }
+
+  /* The four bytes of multby, pre-shifted into the high half of the 16-bit
+     table index; they are the same for every word of the region. */
+  for (i = 0; i < 4; i++) {
+    acache[i] = (((multby >> (i * 8)) & 255) << 8);
+  }
+
+  for (k = 0; k < nbytes; k++) {
+    accumulator = 0;
+    for (i = 0; i < 4; i++) {
+      a = acache[i];
+      for (j = 0; j < 4; j++) {
+        b = ((ur1[k] >> (j * 8)) & 255);
+        accumulator ^= galois_split_w8[i+j][a|b];
+      }
+    }
+    ur2[k] = overwrite ? accumulator : (ur2[k] ^ accumulator);
+  }
+  return;
+}
+
+/* XORs two regions: r3 = r1 ^ r2 over nbytes bytes.  r3 may be r1 or r2.
+   Whole long words are processed first, so the regions must be long-word
+   aligned; any bytes left over after the last whole word are XORed one at
+   a time so that nothing beyond nbytes is read or written.  Does nothing
+   when nbytes <= 0. */
+void galois_region_xor(           char *r1,         /* Region 1 */
+                                  char *r2,         /* Region 2 */
+                                  char *r3,         /* Sum region (r3 = r1 ^ r2) -- can be r1 or r2 */
+                                  int nbytes)       /* Number of bytes in region */
+{
+  long *l1;
+  long *l2;
+  long *l3;
+  int nwords;
+  int i;
+
+  if (nbytes <= 0) return;
+
+  l1 = (long *) r1;
+  l2 = (long *) r2;
+  l3 = (long *) r3;
+  nwords = nbytes / (int) sizeof(long);
+
+  for (i = 0; i < nwords; i++) {
+    l3[i] = (l1[i] ^ l2[i]);
+  }
+
+  /* Tail: the bytes that do not fill a whole long. */
+  for (i = nwords * (int) sizeof(long); i < nbytes; i++) {
+    r3[i] = (r1[i] ^ r2[i]);
+  }
+}
+
+/* Builds the seven 65536-entry split tables in galois_split_w8[0..6] that
+   are used to multiply 32-bit words one byte pair at a time.  Entry
+   (p1<<8)|p2 of table i+j holds the w=32 shift product of p1 placed at byte
+   i and p2 placed at byte j.
+   Returns 0 on success (or when the tables already exist) and -1 when the
+   w=8 multiplication tables or one of the split tables cannot be created.
+   On an allocation failure every table pointer is reset to NULL, so a later
+   call retries instead of treating freed memory as valid tables. */
+int galois_create_split_w8_tables()
+{
+  int p1, p2, i, j, p1elt, p2elt, index, ishift, jshift, *table;
+
+  if (galois_split_w8[0] != NULL) return 0;
+
+  if (galois_create_mult_tables(8) < 0) return -1;
+
+  for (i = 0; i < 7; i++) {
+    galois_split_w8[i] = (int *) malloc(sizeof(int) * (1 << 16));
+    if (galois_split_w8[i] == NULL) {
+      /* Undo the partial allocation and leave no dangling pointers. */
+      for (i--; i >= 0; i--) {
+        free(galois_split_w8[i]);
+        galois_split_w8[i] = NULL;
+      }
+      return -1;
+    }
+  }
+
+  /* i takes the values 0 and 3 only: with i == 0, j = 0..3 fills tables
+     0..3; with i == 3, j = 1..3 fills tables 4..6. */
+  for (i = 0; i < 4; i += 3) {
+    ishift = i * 8;
+    for (j = ((i == 0) ? 0 : 1) ; j < 4; j++) {
+      jshift = j * 8;
+      table = galois_split_w8[i+j];
+      index = 0;
+      for (p1 = 0; p1 < 256; p1++) {
+        p1elt = (p1 << ishift);
+        for (p2 = 0; p2 < 256; p2++) {
+          p2elt = (p2 << jshift);
+          table[index] = galois_shift_multiply(p1elt, p2elt, 32);
+          index++;
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+/* Multiplies two 32-bit values with the split tables: every byte of x
+   (position xi) is paired with every byte of y (position yi), the pair is
+   looked up in galois_split_w8[xi+yi], and the 16 results are XORed
+   together.  The tables are not created here, so they must already exist. */
+int galois_split_w8_multiply(int x, int y)
+{
+  int xi, yi, xbyte, ybyte;
+  int product = 0;
+
+  for (xi = 0; xi < 4; xi++) {
+    xbyte = (((x >> (xi * 8)) & 255) << 8);
+    for (yi = 0; yi < 4; yi++) {
+      ybyte = ((y >> (yi * 8)) & 255);
+      product ^= galois_split_w8[xi+yi][xbyte|ybyte];
+    }
+  }
+  return product;
+}
diff --git a/src/osd/ErasureCodePluginJerasure/galois.h b/src/osd/ErasureCodePluginJerasure/galois.h
new file mode 100755
index 00000000000..b08fd9488cf
--- /dev/null
+++ b/src/osd/ErasureCodePluginJerasure/galois.h
@@ -0,0 +1,111 @@
+/* Galois.h
+ * James S. Plank
+
+Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure Coding Techniques
+
+Revision 1.2A
+May 24, 2011
+
+James S. Plank
+Department of Electrical Engineering and Computer Science
+University of Tennessee
+Knoxville, TN 37996
+plank@cs.utk.edu
+
+Copyright (c) 2011, James S. Plank
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ - Neither the name of the University of Tennessee nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+ */
+
+#ifndef _GALOIS_H
+#define _GALOIS_H
+
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Single-element arithmetic for word size w.  galois_single_divide returns
+   -1 when b is 0. */
+extern int galois_single_multiply(int a, int b, int w);
+extern int galois_single_divide(int a, int b, int w);
+extern int galois_log(int value, int w);
+extern int galois_ilog(int value, int w);
+
+extern int galois_create_log_tables(int w); /* Returns 0 on success, -1 on failure */
+extern int galois_logtable_multiply(int x, int y, int w);
+extern int galois_logtable_divide(int x, int y, int w);
+
+extern int galois_create_mult_tables(int w); /* Returns 0 on success, -1 on failure */
+extern int galois_multtable_multiply(int x, int y, int w);
+extern int galois_multtable_divide(int x, int y, int w);
+
+extern int galois_shift_multiply(int x, int y, int w);
+extern int galois_shift_divide(int x, int y, int w);
+
+extern int galois_create_split_w8_tables();
+extern int galois_split_w8_multiply(int x, int y);
+
+/* Both return -1 when asked for the inverse of 0. */
+extern int galois_inverse(int x, int w);
+extern int galois_shift_inverse(int y, int w);
+
+/* Table accessors: each creates the tables on first use and returns NULL
+   if they cannot be created. */
+extern int *galois_get_mult_table(int w);
+extern int *galois_get_div_table(int w);
+extern int *galois_get_log_table(int w);
+extern int *galois_get_ilog_table(int w);
+
+void galois_region_xor( char *r1, /* Region 1 */
+ char *r2, /* Region 2 */
+ char *r3, /* Sum region (r3 = r1 ^ r2) -- can be r1 or r2 */
+ int nbytes); /* Number of bytes in region */
+
+/* These multiply regions in w=8, w=16 and w=32. They are much faster
+ than calling galois_single_multiply. The regions must be long-word aligned. */
+
+void galois_w08_region_multiply(char *region, /* Region to multiply */
+ int multby, /* Number to multiply by */
+ int nbytes, /* Number of bytes in region */
+ char *r2, /* If r2 != NULL, products go here.
+ Otherwise region is overwritten */
+ int add); /* If (r2 != NULL && add) the product is XOR'd with r2 */
+
+void galois_w16_region_multiply(char *region, /* Region to multiply */
+ int multby, /* Number to multiply by */
+ int nbytes, /* Number of bytes in region */
+ char *r2, /* If r2 != NULL, products go here.
+ Otherwise region is overwritten */
+ int add); /* If (r2 != NULL && add) the product is XOR'd with r2 */
+
+void galois_w32_region_multiply(char *region, /* Region to multiply */
+ int multby, /* Number to multiply by */
+ int nbytes, /* Number of bytes in region */
+ char *r2, /* If r2 != NULL, products go here.
+ Otherwise region is overwritten */
+ int add); /* If (r2 != NULL && add) the product is XOR'd with r2 */
+
+#endif
diff --git a/src/osd/ErasureCodePluginJerasure/jerasure.c b/src/osd/ErasureCodePluginJerasure/jerasure.c
new file mode 100755
index 00000000000..9efae02e5fb
--- /dev/null
+++ b/src/osd/ErasureCodePluginJerasure/jerasure.c
@@ -0,0 +1,1376 @@
+/* jerasure.c
+ * James S. Plank
+
+Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure Coding Techniques
+
+Revision 1.2A
+May 24, 2011
+
+James S. Plank
+Department of Electrical Engineering and Computer Science
+University of Tennessee
+Knoxville, TN 37996
+plank@cs.utk.edu
+
+Copyright (c) 2011, James S. Plank
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ - Neither the name of the University of Tennessee nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "galois.h"
+#include "jerasure.h"
+
+/* Typed malloc: room for num objects of the given type.  The result is NULL
+   when malloc fails, so callers must check it. */
+#define talloc(type, num) (type *) malloc(sizeof(type)*(num))
+
+/* Running byte totals, increased by the routines in this file after each
+   region XOR, region multiply and memcpy respectively. */
+static double jerasure_total_xor_bytes = 0;
+static double jerasure_total_gf_bytes = 0;
+static double jerasure_total_memcpy_bytes = 0;
+
+/* Prints a rows x cols matrix of w-bit values to stdout, one row per line,
+   entries separated by a space and right-aligned.  The column width is the
+   number of decimal digits of 2^w - 1 (10 when w is 32). */
+void jerasure_print_matrix(int *m, int rows, int cols, int w)
+{
+  char digits[30];
+  unsigned int field_size;
+  int r, c, width;
+
+  width = 10;
+  if (w != 32) {
+    field_size = (1 << w);
+    sprintf(digits, "%u", field_size-1);
+    width = strlen(digits);
+  }
+
+  for (r = 0; r < rows; r++) {
+    for (c = 0; c < cols; c++) {
+      if (c != 0) printf(" ");
+      printf("%*u", width, m[r*cols+c]);
+    }
+    printf("\n");
+  }
+}
+
+/* Prints a rows x cols bit matrix to stdout, one matrix row per line.
+   A space is inserted before every w-th column and a blank line before
+   every w-th row, so the w x w sub-blocks stand out. */
+void jerasure_print_bitmatrix(int *m, int rows, int cols, int w)
+{
+  int r, c;
+  int *row;
+
+  for (r = 0; r < rows; r++) {
+    if (r != 0 && r%w == 0) printf("\n");
+    row = m + r*cols;
+    for (c = 0; c < cols; c++) {
+      if (c != 0 && c%w == 0) printf(" ");
+      printf("%d", row[c]);
+    }
+    printf("\n");
+  }
+}
+
+/* Builds the k x k decoding matrix for a given erasure pattern.
+   dm_ids receives the ids of the first k devices whose erased[] entry is 0.
+   Row i of a temporary matrix is the identity row for dm_ids[i] when that
+   is a data device (id < k), or row dm_ids[i]-k of the coding matrix
+   otherwise; its inverse is stored in decoding_matrix.
+   Returns -1 when the temporary matrix cannot be allocated, otherwise the
+   result of jerasure_invert_matrix(). */
+int jerasure_make_decoding_matrix(int k, int m, int w, int *matrix, int *erased, int *decoding_matrix, int *dm_ids)
+{
+  int dev, found, row, col, status;
+  int *tmpmat, *dst;
+
+  /* Collect the first k surviving device ids */
+  found = 0;
+  for (dev = 0; found < k; dev++) {
+    if (erased[dev] == 0) {
+      dm_ids[found] = dev;
+      found++;
+    }
+  }
+
+  tmpmat = talloc(int, k*k);
+  if (tmpmat == NULL) { return -1; }
+
+  for (row = 0; row < k; row++) {
+    dst = tmpmat + row*k;
+    if (dm_ids[row] < k) {
+      for (col = 0; col < k; col++) dst[col] = 0;
+      dst[dm_ids[row]] = 1;
+    } else {
+      for (col = 0; col < k; col++) {
+        dst[col] = matrix[(dm_ids[row]-k)*k+col];
+      }
+    }
+  }
+
+  status = jerasure_invert_matrix(tmpmat, decoding_matrix, k, w);
+  free(tmpmat);
+  return status;
+}
+
+/* Internal Routine: bit-matrix counterpart of jerasure_make_decoding_matrix.
+   dm_ids receives the ids of the first k devices whose erased[] entry is 0.
+   A temporary (k*w) x (k*w) bit matrix is assembled from one block of w
+   rows per selected device and inverted into decoding_matrix.
+   Returns -1 when the temporary matrix cannot be allocated, otherwise the
+   result of jerasure_invert_bitmatrix(). */
+int jerasure_make_decoding_bitmatrix(int k, int m, int w, int *matrix, int *erased, int *decoding_matrix, int *dm_ids)
+{
+ int i, j, *tmpmat;
+ int index, mindex;
+
+ /* Collect the first k surviving device ids */
+ j = 0;
+ for (i = 0; j < k; i++) {
+ if (erased[i] == 0) {
+ dm_ids[j] = i;
+ j++;
+ }
+ }
+
+ tmpmat = talloc(int, k*k*w*w);
+ if (tmpmat == NULL) { return -1; }
+ for (i = 0; i < k; i++) {
+ if (dm_ids[i] < k) {
+ /* Data device: zero the block of w rows, then set a w x w identity at
+ column offset dm_ids[i]*w (stepping k*w+1 moves one row down and one
+ column right) */
+ index = i*k*w*w;
+ for (j = 0; j < k*w*w; j++) tmpmat[index+j] = 0;
+ index = i*k*w*w+dm_ids[i]*w;
+ for (j = 0; j < w; j++) {
+ tmpmat[index] = 1;
+ index += (k*w+1);
+ }
+ } else {
+ /* Coding device: copy its block of w rows from the coding bit matrix */
+ index = i*k*w*w;
+ mindex = (dm_ids[i]-k)*k*w*w;
+ for (j = 0; j < k*w*w; j++) {
+ tmpmat[index+j] = matrix[mindex+j];
+ }
+ }
+ }
+
+ i = jerasure_invert_bitmatrix(tmpmat, decoding_matrix, k*w);
+ free(tmpmat);
+ return i;
+}
+
+/* Rebuilds the erased devices of a k-data / m-coding stripe in place.
+
+   matrix      - the m x k coding matrix
+   row_k_ones  - nonzero when the first coding row is all ones, so that one
+                 lost data device can be rebuilt from that row alone
+   erasures    - ids of the erased devices, terminated by -1
+   data_ptrs / coding_ptrs - the k data and m coding regions, size bytes each
+
+   Returns 0 on success.  Returns -1 when w is not 8, 16 or 32, when the
+   erasure list is rejected by jerasure_erasures_to_erased(), when the
+   decoding matrix cannot be built, or when memory runs out. */
+int jerasure_matrix_decode(int k, int m, int w, int *matrix, int row_k_ones, int *erasures,
+                          char **data_ptrs, char **coding_ptrs, int size)
+{
+  int i, edd, lastdrive;
+  int *tmpids;
+  int *erased, *decoding_matrix, *dm_ids;
+
+  if (w != 8 && w != 16 && w != 32) return -1;
+
+  erased = jerasure_erasures_to_erased(k, m, erasures);
+  if (erased == NULL) return -1;
+
+  /* Find the number of data drives failed */
+
+  lastdrive = k;
+
+  edd = 0;
+  for (i = 0; i < k; i++) {
+    if (erased[i]) {
+      edd++;
+      lastdrive = i;
+    }
+  }
+
+  /* You only need to create the decoding matrix in the following cases:
+
+      1. edd > 0 and row_k_ones is false.
+      2. edd > 0 and row_k_ones is true and coding device 0 has been erased.
+      3. edd > 1
+
+      We're going to use lastdrive to denote when to stop decoding data.
+      At this point in the code, it is equal to the last erased data device.
+      However, if we can't use the parity row to decode it (i.e. row_k_ones=0
+         or erased[k] = 1), we're going to set it to k so that the decoding
+         pass will decode all data.
+   */
+
+  if (!row_k_ones || erased[k]) lastdrive = k;
+
+  dm_ids = NULL;
+  decoding_matrix = NULL;
+
+  if (edd > 1 || (edd > 0 && (!row_k_ones || erased[k]))) {
+    dm_ids = talloc(int, k);
+    if (dm_ids == NULL) {
+      free(erased);
+      return -1;
+    }
+
+    decoding_matrix = talloc(int, k*k);
+    if (decoding_matrix == NULL) {
+      free(erased);
+      free(dm_ids);
+      return -1;
+    }
+
+    if (jerasure_make_decoding_matrix(k, m, w, matrix, erased, decoding_matrix, dm_ids) < 0) {
+      free(erased);
+      free(dm_ids);
+      free(decoding_matrix);
+      return -1;
+    }
+  }
+
+  /* Decode the data drives.
+     If row_k_ones is true and coding device 0 is intact, then only decode edd-1 drives.
+     This is done by stopping at lastdrive.
+     We test whether edd > 0 so that we can exit the loop early if we're done.
+   */
+
+  for (i = 0; edd > 0 && i < lastdrive; i++) {
+    if (erased[i]) {
+      jerasure_matrix_dotprod(k, w, decoding_matrix+(i*k), dm_ids, i, data_ptrs, coding_ptrs, size);
+      edd--;
+    }
+  }
+
+  /* Then if necessary, decode drive lastdrive */
+
+  if (edd > 0) {
+    tmpids = talloc(int, k);
+    if (tmpids == NULL) {
+      /* free(NULL) is a no-op, so the optional buffers need no guard */
+      free(erased);
+      free(dm_ids);
+      free(decoding_matrix);
+      return -1;
+    }
+    /* Sources: every data device except lastdrive, plus coding device 0 */
+    for (i = 0; i < k; i++) {
+      tmpids[i] = (i < lastdrive) ? i : i+1;
+    }
+    jerasure_matrix_dotprod(k, w, matrix, tmpids, lastdrive, data_ptrs, coding_ptrs, size);
+    free(tmpids);
+  }
+
+  /* Finally, re-encode any erased coding devices */
+
+  for (i = 0; i < m; i++) {
+    if (erased[k+i]) {
+      jerasure_matrix_dotprod(k, w, matrix+(i*k), NULL, i+k, data_ptrs, coding_ptrs, size);
+    }
+  }
+
+  free(erased);
+  free(dm_ids);
+  free(decoding_matrix);
+
+  return 0;
+}
+
+
+/* Expands an m x k matrix of w-bit elements into an (m*w) x (k*w) bit
+   matrix, one int (0 or 1) per bit.  Each element becomes a w x w block
+   whose column x holds the bits of element * 2^x, with bit l of that value
+   in row l of the block.
+   Returns a newly allocated matrix that the caller must free, or NULL when
+   matrix is NULL or the allocation fails. */
+int *jerasure_matrix_to_bitmatrix(int k, int m, int w, int *matrix)
+{
+  int *bitmatrix;
+  int rowelts, rowindex, colindex, elt, i, j, l, x;
+
+  if (matrix == NULL) { return NULL; }
+
+  bitmatrix = talloc(int, k*m*w*w);
+  if (bitmatrix == NULL) { return NULL; }
+
+  rowelts = k * w;
+  rowindex = 0;
+
+  for (i = 0; i < m; i++) {
+    colindex = rowindex;
+    for (j = 0; j < k; j++) {
+      elt = matrix[i*k+j];
+      for (x = 0; x < w; x++) {
+        for (l = 0; l < w; l++) {
+          bitmatrix[colindex+x+l*rowelts] = ((elt & (1 << l)) ? 1 : 0);
+        }
+        elt = galois_single_multiply(elt, 2, w);
+      }
+      colindex += w;
+    }
+    rowindex += rowelts * w;
+  }
+  return bitmatrix;
+}
+
+/* Computes all m coding devices from the k data devices: coding device i
+   is the dot product of row i of matrix with the data regions.  Prints an
+   error and exits unless w is 8, 16 or 32. */
+void jerasure_matrix_encode(int k, int m, int w, int *matrix,
+                          char **data_ptrs, char **coding_ptrs, int size)
+{
+  int row;
+  int supported = (w == 8 || w == 16 || w == 32);
+
+  if (!supported) {
+    fprintf(stderr, "ERROR: jerasure_matrix_encode() and w is not 8, 16 or 32\n");
+    exit(1);
+  }
+
+  for (row = 0; row < m; row++) {
+    jerasure_matrix_dotprod(k, w, matrix + row*k, NULL, k+row, data_ptrs, coding_ptrs, size);
+  }
+}
+
+/* Computes device dest_id as XORs of packets taken from k source devices,
+   as selected by the w rows of a bit matrix (w * k*w entries starting at
+   bitmatrix_row).
+
+   src_ids  - ids of the k source devices; NULL means data devices 0..k-1.
+              Ids below k refer to data_ptrs, the others to coding_ptrs.
+   dest_id  - id of the device to compute (same numbering).
+   size     - bytes per device; must be a multiple of w*packetsize,
+              otherwise an error is printed and the process exits.
+
+   Each device is processed in chunks of w packets of packetsize bytes. */
+void jerasure_bitmatrix_dotprod(int k, int w, int *bitmatrix_row,
+ int *src_ids, int dest_id,
+ char **data_ptrs, char **coding_ptrs, int size, int packetsize)
+{
+ int j, sindex, pstarted, index, x, y;
+ char *dptr, *pptr, *bdptr, *bpptr;
+
+ if (size%(w*packetsize) != 0) {
+ fprintf(stderr, "jerasure_bitmatrix_dotprod - size%c(w*packetsize)) must = 0\n", '%');
+ exit(1);
+ }
+
+ bpptr = (dest_id < k) ? data_ptrs[dest_id] : coding_ptrs[dest_id-k];
+
+ for (sindex = 0; sindex < size; sindex += (packetsize*w)) {
+ /* The same w x (k*w) bits are replayed for every chunk */
+ index = 0;
+ for (j = 0; j < w; j++) {
+ /* pstarted is 0 until the first selected packet has been copied into
+ the output packet; later selections are XORed in */
+ pstarted = 0;
+ pptr = bpptr + sindex + j*packetsize;
+ for (x = 0; x < k; x++) {
+ if (src_ids == NULL) {
+ bdptr = data_ptrs[x];
+ } else if (src_ids[x] < k) {
+ bdptr = data_ptrs[src_ids[x]];
+ } else {
+ bdptr = coding_ptrs[src_ids[x]-k];
+ }
+ for (y = 0; y < w; y++) {
+ if (bitmatrix_row[index]) {
+ dptr = bdptr + sindex + y*packetsize;
+ if (!pstarted) {
+ memcpy(pptr, dptr, packetsize);
+ jerasure_total_memcpy_bytes += packetsize;
+ pstarted = 1;
+ } else {
+ galois_region_xor(pptr, dptr, pptr, packetsize);
+ jerasure_total_xor_bytes += packetsize;
+ }
+ }
+ index++;
+ }
+ }
+ }
+ }
+}
+
+/* Writes the XOR of the k data regions into parity_ptr: the first region
+   is copied, the remaining k-1 are XORed in.  The memcpy and XOR byte
+   counters are updated along the way. */
+void jerasure_do_parity(int k, char **data_ptrs, char *parity_ptr, int size)
+{
+  int dev = 1;
+
+  memcpy(parity_ptr, data_ptrs[0], size);
+  jerasure_total_memcpy_bytes += size;
+
+  while (dev < k) {
+    galois_region_xor(data_ptrs[dev], parity_ptr, parity_ptr, size);
+    jerasure_total_xor_bytes += size;
+    dev++;
+  }
+}
+
+/* Inverts the rows x rows matrix mat (w-bit elements, row-major) by
+   Gaussian elimination and stores the inverse in inv.  mat is modified in
+   the process.  Returns 0 on success and -1 when a column has no usable
+   pivot, i.e. the matrix is not invertible. */
+int jerasure_invert_matrix(int *mat, int *inv, int rows, int w)
+{
+ int cols, i, j, k, x, rs2;
+ int row_start, tmp, inverse;
+
+ cols = rows;
+
+ /* inv starts out as the identity matrix */
+ k = 0;
+ for (i = 0; i < rows; i++) {
+ for (j = 0; j < cols; j++) {
+ inv[k] = (i == j) ? 1 : 0;
+ k++;
+ }
+ }
+
+ /* First -- convert into upper triangular */
+ for (i = 0; i < cols; i++) {
+ row_start = cols*i;
+
+ /* Swap rows if we have a zero i,i element. If we can't swap, then the
+ matrix was not invertible */
+
+ if (mat[row_start+i] == 0) {
+ for (j = i+1; j < rows && mat[cols*j+i] == 0; j++) ;
+ if (j == rows) return -1;
+ rs2 = j*cols;
+ for (k = 0; k < cols; k++) {
+ tmp = mat[row_start+k];
+ mat[row_start+k] = mat[rs2+k];
+ mat[rs2+k] = tmp;
+ tmp = inv[row_start+k];
+ inv[row_start+k] = inv[rs2+k];
+ inv[rs2+k] = tmp;
+ }
+ }
+
+ /* Multiply the row by 1/element i,i */
+ tmp = mat[row_start+i];
+ if (tmp != 1) {
+ inverse = galois_single_divide(1, tmp, w);
+ for (j = 0; j < cols; j++) {
+ mat[row_start+j] = galois_single_multiply(mat[row_start+j], inverse, w);
+ inv[row_start+j] = galois_single_multiply(inv[row_start+j], inverse, w);
+ }
+ }
+
+ /* Now for each j>i, add A_ji*Ai to Aj */
+ k = row_start+i;
+ for (j = i+1; j != cols; j++) {
+ k += cols;
+ if (mat[k] != 0) {
+ if (mat[k] == 1) {
+ /* Factor 1: a plain XOR of the rows, no multiplication needed */
+ rs2 = cols*j;
+ for (x = 0; x < cols; x++) {
+ mat[rs2+x] ^= mat[row_start+x];
+ inv[rs2+x] ^= inv[row_start+x];
+ }
+ } else {
+ tmp = mat[k];
+ rs2 = cols*j;
+ for (x = 0; x < cols; x++) {
+ mat[rs2+x] ^= galois_single_multiply(tmp, mat[row_start+x], w);
+ inv[rs2+x] ^= galois_single_multiply(tmp, inv[row_start+x], w);
+ }
+ }
+ }
+ }
+ }
+
+ /* Now the matrix is upper triangular. Start at the top and multiply down */
+
+ /* Only inv gets the full row update here; in mat just the eliminated
+ element is set to 0 */
+ for (i = rows-1; i >= 0; i--) {
+ row_start = i*cols;
+ for (j = 0; j < i; j++) {
+ rs2 = j*cols;
+ if (mat[rs2+i] != 0) {
+ tmp = mat[rs2+i];
+ mat[rs2+i] = 0;
+ for (k = 0; k < cols; k++) {
+ inv[rs2+k] ^= galois_single_multiply(tmp, inv[row_start+k], w);
+ }
+ }
+ }
+ }
+ return 0;
+}
+
+/* Tests whether the rows x rows matrix mat (w-bit elements, row-major) is
+   invertible by reducing it to upper triangular form.  mat is modified in
+   the process.  Returns 1 when it is invertible, 0 when it is not. */
+int jerasure_invertible_matrix(int *mat, int rows, int w)
+{
+ int cols, i, j, k, x, rs2;
+ int row_start, tmp, inverse;
+
+ cols = rows;
+
+ /* First -- convert into upper triangular */
+ for (i = 0; i < cols; i++) {
+ row_start = cols*i;
+
+ /* Swap rows if we have a zero i,i element. If we can't swap, then the
+ matrix was not invertible */
+
+ if (mat[row_start+i] == 0) {
+ for (j = i+1; j < rows && mat[cols*j+i] == 0; j++) ;
+ if (j == rows) return 0;
+ rs2 = j*cols;
+ for (k = 0; k < cols; k++) {
+ tmp = mat[row_start+k];
+ mat[row_start+k] = mat[rs2+k];
+ mat[rs2+k] = tmp;
+ }
+ }
+
+ /* Multiply the row by 1/element i,i */
+ tmp = mat[row_start+i];
+ if (tmp != 1) {
+ inverse = galois_single_divide(1, tmp, w);
+ for (j = 0; j < cols; j++) {
+ mat[row_start+j] = galois_single_multiply(mat[row_start+j], inverse, w);
+ }
+ }
+
+ /* Now for each j>i, add A_ji*Ai to Aj */
+ k = row_start+i;
+ for (j = i+1; j != cols; j++) {
+ k += cols;
+ if (mat[k] != 0) {
+ if (mat[k] == 1) {
+ /* Factor 1: a plain XOR of the rows, no multiplication needed */
+ rs2 = cols*j;
+ for (x = 0; x < cols; x++) {
+ mat[rs2+x] ^= mat[row_start+x];
+ }
+ } else {
+ tmp = mat[k];
+ rs2 = cols*j;
+ for (x = 0; x < cols; x++) {
+ mat[rs2+x] ^= galois_single_multiply(tmp, mat[row_start+x], w);
+ }
+ }
+ }
+ }
+ }
+ /* Every column had a pivot */
+ return 1;
+}
+
+/* Converts a list-style version of the erasures (device ids terminated by
+   -1) into a newly allocated array of k+m elements where the element = 1 if
+   the index has been erased, and zero otherwise.  Duplicate ids are
+   counted once.  The caller frees the result.
+
+   Returns NULL when the allocation fails, when an id lies outside
+   0 .. k+m-1, or when fewer than k devices would remain. */
+
+int *jerasure_erasures_to_erased(int k, int m, int *erasures)
+{
+  int td;
+  int t_non_erased;
+  int *erased;
+  int i, id;
+
+  td = k+m;
+  erased = talloc(int, td);
+  if (erased == NULL) return NULL;
+  t_non_erased = td;
+
+  for (i = 0; i < td; i++) erased[i] = 0;
+
+  for (i = 0; erasures[i] != -1; i++) {
+    id = erasures[i];
+
+    /* An id outside the stripe would index past the array */
+    if (id < 0 || id >= td) {
+      free(erased);
+      return NULL;
+    }
+
+    if (erased[id] == 0) {
+      erased[id] = 1;
+      t_non_erased--;
+      if (t_non_erased < k) {
+        free(erased);
+        return NULL;
+      }
+    }
+  }
+  return erased;
+}
+
+/* Frees a schedule: every row up to and including the terminating row
+   (the first one whose element 0 is negative), then the row array. */
+void jerasure_free_schedule(int **schedule)
+{
+  int **row = schedule;
+
+  while ((*row)[0] >= 0) {
+    free(*row);
+    row++;
+  }
+  free(*row);        /* the terminating row */
+  free(schedule);
+}
+
+/* Frees a schedule cache for m == 2: the schedules stored at
+   cache[e1*(k+m)+e2] for every pair e2 <= e1, then the cache array.
+   Prints an error and exits when m is not 2. */
+void jerasure_free_schedule_cache(int k, int m, int ***cache)
+{
+  int total, e1, e2;
+
+  if (m != 2) {
+    fprintf(stderr, "jerasure_free_schedule_cache(): m must equal 2\n");
+    exit(1);
+  }
+
+  total = k+m;
+  for (e1 = 0; e1 < total; e1++) {
+    for (e2 = 0; e2 <= e1; e2++) {
+      jerasure_free_schedule(cache[e1*total+e2]);
+    }
+  }
+  free(cache);
+}
+
+/* Computes device dest_id as the dot product of matrix_row (k elements)
+   with k source regions of size bytes each.
+
+   src_ids  - ids of the k source devices; NULL means data devices 0..k-1.
+              Ids below k refer to data_ptrs, the others to coding_ptrs.
+   dest_id  - id of the device to compute (same numbering).
+
+   Prints an error and exits unless w is 1, 8, 16 or 32.  If every element
+   of matrix_row is 0, the destination region is not written at all. */
+void jerasure_matrix_dotprod(int k, int w, int *matrix_row,
+ int *src_ids, int dest_id,
+ char **data_ptrs, char **coding_ptrs, int size)
+{
+ int init;
+ char *dptr, *sptr;
+ int i;
+
+ if (w != 1 && w != 8 && w != 16 && w != 32) {
+ fprintf(stderr, "ERROR: jerasure_matrix_dotprod() called and w is not 1, 8, 16 or 32\n");
+ exit(1);
+ }
+
+ /* init stays 0 until the destination has received its first term; that
+ term is stored, every later one is XORed in */
+ init = 0;
+
+ dptr = (dest_id < k) ? data_ptrs[dest_id] : coding_ptrs[dest_id-k];
+
+ /* First copy or xor any data that does not need to be multiplied by a factor */
+
+ for (i = 0; i < k; i++) {
+ if (matrix_row[i] == 1) {
+ if (src_ids == NULL) {
+ sptr = data_ptrs[i];
+ } else if (src_ids[i] < k) {
+ sptr = data_ptrs[src_ids[i]];
+ } else {
+ sptr = coding_ptrs[src_ids[i]-k];
+ }
+ if (init == 0) {
+ memcpy(dptr, sptr, size);
+ jerasure_total_memcpy_bytes += size;
+ init = 1;
+ } else {
+ galois_region_xor(sptr, dptr, dptr, size);
+ jerasure_total_xor_bytes += size;
+ }
+ }
+ }
+
+ /* Now do the data that needs to be multiplied by a factor */
+
+ for (i = 0; i < k; i++) {
+ if (matrix_row[i] != 0 && matrix_row[i] != 1) {
+ if (src_ids == NULL) {
+ sptr = data_ptrs[i];
+ } else if (src_ids[i] < k) {
+ sptr = data_ptrs[src_ids[i]];
+ } else {
+ sptr = coding_ptrs[src_ids[i]-k];
+ }
+ /* init is passed as the add flag: 0 stores the product, 1 XORs it in.
+ There is no case for w == 1, so no multiplication happens then */
+ switch (w) {
+ case 8: galois_w08_region_multiply(sptr, matrix_row[i], size, dptr, init); break;
+ case 16: galois_w16_region_multiply(sptr, matrix_row[i], size, dptr, init); break;
+ case 32: galois_w32_region_multiply(sptr, matrix_row[i], size, dptr, init); break;
+ }
+ jerasure_total_gf_bytes += size;
+ init = 1;
+ }
+ }
+}
+
+
+/* Bit-matrix counterpart of jerasure_matrix_decode: rebuilds the erased
+   devices of a k-data / m-coding stripe in place using XORs of packets.
+
+   bitmatrix   - the (m*w) x (k*w) coding bit matrix
+   row_k_ones  - 1 when one lost data device can be rebuilt from the first
+                 coding device alone
+   erasures    - ids of the erased devices, terminated by -1
+   size        - bytes per device;  packetsize - bytes per packet
+
+   Returns 0 on success.  Returns -1 when the erasure list is rejected by
+   jerasure_erasures_to_erased(), when the decoding matrix cannot be built,
+   or when memory runs out. */
+int jerasure_bitmatrix_decode(int k, int m, int w, int *bitmatrix, int row_k_ones, int *erasures,
+                            char **data_ptrs, char **coding_ptrs, int size, int packetsize)
+{
+  int i;
+  int *erased;
+  int *decoding_matrix;
+  int *dm_ids;
+  int edd, *tmpids, lastdrive;
+
+  erased = jerasure_erasures_to_erased(k, m, erasures);
+  if (erased == NULL) return -1;
+
+  /* See jerasure_matrix_decode for the logic of this routine.  This one works just like
+     it, but calls the bitmatrix ops instead */
+
+  lastdrive = k;
+
+  edd = 0;
+  for (i = 0; i < k; i++) {
+    if (erased[i]) {
+      edd++;
+      lastdrive = i;
+    }
+  }
+
+  if (row_k_ones != 1 || erased[k]) lastdrive = k;
+
+  dm_ids = NULL;
+  decoding_matrix = NULL;
+
+  if (edd > 1 || (edd > 0 && (row_k_ones != 1 || erased[k]))) {
+
+    dm_ids = talloc(int, k);
+    if (dm_ids == NULL) {
+      free(erased);
+      return -1;
+    }
+
+    decoding_matrix = talloc(int, k*k*w*w);
+    if (decoding_matrix == NULL) {
+      free(erased);
+      free(dm_ids);
+      return -1;
+    }
+
+    if (jerasure_make_decoding_bitmatrix(k, m, w, bitmatrix, erased, decoding_matrix, dm_ids) < 0) {
+      free(erased);
+      free(dm_ids);
+      free(decoding_matrix);
+      return -1;
+    }
+  }
+
+  /* Rebuild the erased data devices that come before lastdrive */
+  for (i = 0; edd > 0 && i < lastdrive; i++) {
+    if (erased[i]) {
+      jerasure_bitmatrix_dotprod(k, w, decoding_matrix+i*k*w*w, dm_ids, i, data_ptrs, coding_ptrs, size, packetsize);
+      edd--;
+    }
+  }
+
+  /* Then, if one is left, rebuild lastdrive from the first coding device */
+  if (edd > 0) {
+    tmpids = talloc(int, k);
+    if (tmpids == NULL) {
+      /* free(NULL) is a no-op, so the optional buffers need no guard */
+      free(erased);
+      free(dm_ids);
+      free(decoding_matrix);
+      return -1;
+    }
+    /* Sources: every data device except lastdrive, plus coding device 0 */
+    for (i = 0; i < k; i++) {
+      tmpids[i] = (i < lastdrive) ? i : i+1;
+    }
+    jerasure_bitmatrix_dotprod(k, w, bitmatrix, tmpids, lastdrive, data_ptrs, coding_ptrs, size, packetsize);
+    free(tmpids);
+  }
+
+  /* Finally, re-encode any erased coding devices */
+  for (i = 0; i < m; i++) {
+    if (erased[k+i]) {
+      jerasure_bitmatrix_dotprod(k, w, bitmatrix+i*k*w*w, NULL, k+i, data_ptrs, coding_ptrs, size, packetsize);
+    }
+  }
+
+  free(erased);
+  free(dm_ids);
+  free(decoding_matrix);
+
+  return 0;
+}
+
+/* set_up_ptrs_for_scheduled_decoding - build the k+m pointer table that a
+   decoding schedule operates on.
+
+   Returns a freshly allocated array (the caller frees it), or NULL when the
+   erased vector cannot be built or the table cannot be allocated. */
+static char **set_up_ptrs_for_scheduled_decoding(int k, int m, int *erasures, char **data_ptrs, char **coding_ptrs)
+{
+  int *erased;
+  char **ptrs;
+  int i, j, x;
+
+  erased = jerasure_erasures_to_erased(k, m, erasures);
+  if (erased == NULL) return NULL;
+
+  /* Set up ptrs.  With ddf = number of failed data drives and cdf = number
+     of failed coding drives, it will be as follows:
+
+       - If data drive i has not failed, then ptrs[i] = data_ptrs[i].
+       - If data drive i has failed, then ptrs[i] = coding_ptrs[j], where j is the
+         lowest unused non-failed coding drive.
+       - Elements k to k+ddf-1 are data_ptrs[] of the failed data drives.
+       - Elements k+ddf to k+ddf+cdf-1 are coding_ptrs[] of the failed coding drives.
+
+     The matching id tables (row_ids / ind_to_row) are filled in by
+     set_up_ids_for_scheduled_decoding, which walks the devices the same way.
+   */
+
+  ptrs = talloc(char *, k+m);
+  if (ptrs == NULL) {
+    free(erased);
+    return NULL;
+  }
+
+  j = k;   /* next candidate surviving coding drive */
+  x = k;   /* next free slot in the "failed drives" tail of ptrs */
+  for (i = 0; i < k; i++) {
+    if (erased[i] == 0) {
+      ptrs[i] = data_ptrs[i];
+    } else {
+      /* NOTE(review): this scan presumably cannot run off the end because
+         jerasure_erasures_to_erased rejects patterns with too few
+         survivors -- confirm against that routine. */
+      while (erased[j]) j++;
+      ptrs[i] = coding_ptrs[j-k];
+      j++;
+      ptrs[x] = data_ptrs[i];
+      x++;
+    }
+  }
+  for (i = k; i < k+m; i++) {
+    if (erased[i]) {
+      ptrs[x] = coding_ptrs[i-k];
+      x++;
+    }
+  }
+  free(erased);
+  return ptrs;
+}
+
+/* set_up_ids_for_scheduled_decoding - fill the two caller-supplied id tables
+   used when generating a decoding schedule.
+
+   row_ids[r]    = device id that occupies slot r of the pointer table
+   ind_to_row[d] = slot in which device d can be found
+
+   Slots 0..k-1 hold surviving data drives (in place) or a surviving coding
+   drive standing in for a failed data drive; the slots from k upward list
+   the failed data drives and then the failed coding drives.  Entries of
+   ind_to_row for surviving coding drives that are not used as stand-ins are
+   left untouched.
+
+   Returns 0 on success, -1 if the erased vector cannot be built. */
+static int set_up_ids_for_scheduled_decoding(int k, int m, int *erasures, int *row_ids, int *ind_to_row)
+{
+  int *erased;
+  int i, j, x;
+
+  erased = jerasure_erasures_to_erased(k, m, erasures);
+  if (erased == NULL) return -1;
+
+  /* See set_up_ptrs_for_scheduled_decoding for how these are set */
+
+  j = k;   /* next candidate surviving coding drive */
+  x = k;   /* next free slot for a failed drive */
+  for (i = 0; i < k; i++) {
+    if (erased[i] == 0) {
+      row_ids[i] = i;
+      ind_to_row[i] = i;
+    } else {
+      while (erased[j]) j++;
+      row_ids[i] = j;        /* coding drive j stands in at slot i */
+      ind_to_row[j] = i;
+      j++;
+      row_ids[x] = i;        /* failed data drive i is rebuilt into slot x */
+      ind_to_row[i] = x;
+      x++;
+    }
+  }
+  for (i = k; i < k+m; i++) {
+    if (erased[i]) {
+      row_ids[x] = i;
+      ind_to_row[i] = x;
+      x++;
+    }
+  }
+  free(erased);
+  return 0;
+}
+
+/* jerasure_generate_decoding_schedule - build a single schedule that
+   rebuilds every device listed in erasures.
+
+   The rows for failed data drives come from inverting the k*w by k*w
+   bitmatrix of the rows held in slots 0..k-1; the rows for failed coding
+   drives are their distribution rows rewritten so that they only reference
+   those same slots.
+
+   smart != 0 selects jerasure_smart_bitmatrix_to_schedule, otherwise the
+   dumb variant is used.
+
+   Returns the schedule, or NULL if an allocation fails, the id tables
+   cannot be set up, or the matrix to invert is singular.  Every temporary
+   buffer is released on every path. */
+static int **jerasure_generate_decoding_schedule(int k, int m, int w, int *bitmatrix, int *erasures, int smart)
+{
+  int i, j, x, drive, y, index, z;
+  int *decoding_matrix = NULL, *inverse = NULL, *real_decoding_matrix = NULL;
+  int *ptr;
+  int *row_ids = NULL;
+  int *ind_to_row = NULL;
+  int ddf, cdf;
+  int **schedule = NULL;
+  int *b1, *b2;
+
+  /* First, figure out the number of data drives that have failed, and the
+     number of coding drives that have failed: ddf and cdf */
+
+  ddf = 0;
+  cdf = 0;
+  for (i = 0; erasures[i] != -1; i++) {
+    if (erasures[i] < k) ddf++; else cdf++;
+  }
+
+  row_ids = talloc(int, k+m);
+  ind_to_row = talloc(int, k+m);
+  if (row_ids == NULL || ind_to_row == NULL) goto done;
+
+  if (set_up_ids_for_scheduled_decoding(k, m, erasures, row_ids, ind_to_row) < 0) goto done;
+
+  /* Now, we're going to create one decoding matrix which is going to
+     decode everything with one call.  The hope is that the scheduler
+     will do a good job.    This matrix has w*e rows, where e is the
+     number of erasures (ddf+cdf) */
+
+  real_decoding_matrix = talloc(int, k*w*(cdf+ddf)*w);
+  /* A zero-byte request may legitimately yield NULL, so only treat NULL as
+     a failure when there is something to decode. */
+  if (real_decoding_matrix == NULL && (cdf+ddf) > 0) goto done;
+
+  /* First, if any data drives have failed, then initialize the first
+     ddf*w rows of the decoding matrix from the standard decoding
+     matrix inversion */
+
+  if (ddf > 0) {
+
+    decoding_matrix = talloc(int, k*k*w*w);
+    inverse = talloc(int, k*k*w*w);
+    if (decoding_matrix == NULL || inverse == NULL) goto done;
+
+    ptr = decoding_matrix;
+    for (i = 0; i < k; i++) {
+      if (row_ids[i] == i) {
+        /* surviving data drive: its row-block is an identity block in column-block i */
+        memset(ptr, 0, k*w*w*sizeof(int));
+        for (x = 0; x < w; x++) {
+          ptr[x+i*w+x*k*w] = 1;
+        }
+      } else {
+        /* stand-in coding drive: copy its row-block of the distribution bitmatrix */
+        memcpy(ptr, bitmatrix+k*w*w*(row_ids[i]-k), k*w*w*sizeof(int));
+      }
+      ptr += (k*w*w);
+    }
+
+    /* A singular matrix cannot be decoded; fail instead of scheduling garbage */
+    if (jerasure_invert_bitmatrix(decoding_matrix, inverse, k*w) < 0) goto done;
+
+    ptr = real_decoding_matrix;
+    for (i = 0; i < ddf; i++) {
+      memcpy(ptr, inverse+k*w*w*row_ids[k+i], sizeof(int)*k*w*w);
+      ptr += (k*w*w);
+    }
+  }
+
+  /* Next, here comes the hard part.  For each coding node that needs
+     to be decoded, you start by putting its rows of the distribution
+     matrix into the decoding matrix.  If there were no failed data
+     nodes, then you're done.  However, if there have been failed
+     data nodes, then you need to modify the columns that correspond
+     to the data nodes.  You do that by first zeroing them.  Then
+     wherever there is a one in the distribution matrix, you XOR
+     in the corresponding row from the failed data node's entry in
+     the decoding matrix.  The whole process kind of makes my head
+     spin, but it works.
+   */
+
+  for (x = 0; x < cdf; x++) {
+    drive = row_ids[x+ddf+k]-k;
+    ptr = real_decoding_matrix + k*w*w*(ddf+x);
+    memcpy(ptr, bitmatrix+drive*k*w*w, sizeof(int)*k*w*w);
+
+    /* zero the column-blocks that belong to failed data drives */
+    for (i = 0; i < k; i++) {
+      if (row_ids[i] != i) {
+        for (j = 0; j < w; j++) {
+          memset(ptr+j*k*w+i*w, 0, sizeof(int)*w);
+        }
+      }
+    }
+
+    /* There's the yucky part */
+
+    index = drive*k*w*w;
+    for (i = 0; i < k; i++) {
+      if (row_ids[i] != i) {
+        /* b1: decoding rows already computed for failed data drive i */
+        b1 = real_decoding_matrix+(ind_to_row[i]-k)*k*w*w;
+        for (j = 0; j < w; j++) {
+          b2 = ptr + j*k*w;
+          for (y = 0; y < w; y++) {
+            if (bitmatrix[index+j*k*w+i*w+y]) {
+              for (z = 0; z < k*w; z++) {
+                b2[z] = b2[z] ^ b1[z+y*k*w];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  if (smart) {
+    schedule = jerasure_smart_bitmatrix_to_schedule(k, ddf+cdf, w, real_decoding_matrix);
+  } else {
+    schedule = jerasure_dumb_bitmatrix_to_schedule(k, ddf+cdf, w, real_decoding_matrix);
+  }
+
+done:
+  /* free(NULL) is a no-op, so one exit path covers every failure point */
+  free(decoding_matrix);
+  free(inverse);
+  free(row_ids);
+  free(ind_to_row);
+  free(real_decoding_matrix);
+  return schedule;
+}
+
+/* jerasure_schedule_decode_lazy - decode by generating the schedule for
+   this erasure pattern on the fly, running it over the buffers in steps of
+   packetsize*w bytes, and then discarding it.
+
+   Returns 0 on success, -1 if the pointer table or the schedule cannot be
+   built. */
+int jerasure_schedule_decode_lazy(int k, int m, int w, int *bitmatrix, int *erasures,
+                            char **data_ptrs, char **coding_ptrs, int size, int packetsize,
+                            int smart)
+{
+  char **dev;      /* working copy of the device pointers, advanced per step */
+  int **sched;
+  int d, done;
+
+  dev = set_up_ptrs_for_scheduled_decoding(k, m, erasures, data_ptrs, coding_ptrs);
+  if (dev == NULL) return -1;
+
+  sched = jerasure_generate_decoding_schedule(k, m, w, bitmatrix, erasures, smart);
+  if (sched == NULL) {
+    free(dev);
+    return -1;
+  }
+
+  done = 0;
+  while (done < size) {
+    jerasure_do_scheduled_operations(dev, sched, packetsize);
+    /* move every device pointer on to its next group of w packets */
+    for (d = 0; d < k+m; d++) {
+      dev[d] += (packetsize*w);
+    }
+    done += packetsize*w;
+  }
+
+  jerasure_free_schedule(sched);
+  free(dev);
+
+  return 0;
+}
+
+/* jerasure_schedule_decode_cache - decode one or two erasures with a
+   schedule taken from a cache built by jerasure_generate_schedule_cache.
+
+   The cache slot is erasures[0]*(k+m) + erasures[1]; a single erasure e
+   uses slot e*(k+m) + e.
+
+   Returns 0 on success (including an empty erasure list, for which there
+   is nothing to rebuild) and -1 when more than two devices are erased, an
+   id is outside 0..k+m-1, the cache has no schedule for the pattern, or
+   the pointer table cannot be built. */
+int jerasure_schedule_decode_cache(int k, int m, int w, int ***scache, int *erasures,
+                            char **data_ptrs, char **coding_ptrs, int size, int packetsize)
+{
+  int i, tdone;
+  char **ptrs;
+  int **schedule;
+  int index;
+
+  /* Nothing erased: do not read past the -1 terminator */
+  if (erasures[0] == -1) return 0;
+
+  /* The ids are used to index scache, so refuse anything out of range */
+  if (erasures[0] < 0 || erasures[0] >= k+m) return -1;
+
+  if (erasures[1] == -1) {
+    index = erasures[0]*(k+m) + erasures[0];
+  } else if (erasures[2] == -1) {
+    if (erasures[1] < 0 || erasures[1] >= k+m) return -1;
+    index = erasures[0]*(k+m) + erasures[1];
+  } else {
+    return -1;
+  }
+
+  /* A slot is NULL when schedule generation failed for that pattern */
+  schedule = scache[index];
+  if (schedule == NULL) return -1;
+
+  ptrs = set_up_ptrs_for_scheduled_decoding(k, m, erasures, data_ptrs, coding_ptrs);
+  if (ptrs == NULL) return -1;
+
+  for (tdone = 0; tdone < size; tdone += packetsize*w) {
+    jerasure_do_scheduled_operations(ptrs, schedule, packetsize);
+    for (i = 0; i < k+m; i++) ptrs[i] += (packetsize*w);
+  }
+
+  free(ptrs);
+
+  return 0;
+}
+
+/* jerasure_generate_schedule_cache - precompute the decoding schedule for
+   every single and double erasure.  This only works when m = 2; any other
+   m (or a failed allocation of the table) returns NULL.
+
+   The table is indexed by e1*(k+m)+e2.  A single erasure e is stored at
+   e*(k+m)+e.  The pair (e1,e2) and the pair (e2,e1) share one schedule
+   pointer -- the schedule is not duplicated. */
+
+int ***jerasure_generate_schedule_cache(int k, int m, int w, int *bitmatrix, int smart)
+{
+  int ***cache;
+  int lost[3];     /* -1 terminated erasure list handed to the generator */
+  int n, hi, lo;
+
+  if (m != 2) return NULL;
+
+  n = k+m;
+  cache = talloc(int **, n*(n+1));
+  if (cache == NULL) return NULL;
+
+  for (hi = 0; hi < n; hi++) {
+    lost[0] = hi;
+
+    /* every pair {hi, lo} with lo < hi, mirrored into both slots */
+    for (lo = 0; lo < hi; lo++) {
+      lost[1] = lo;
+      lost[2] = -1;
+      cache[hi*n+lo] = jerasure_generate_decoding_schedule(k, m, w, bitmatrix, lost, smart);
+      cache[lo*n+hi] = cache[hi*n+lo];
+    }
+
+    /* the single erasure {hi} lives on the diagonal */
+    lost[1] = -1;
+    cache[hi*n+hi] = jerasure_generate_decoding_schedule(k, m, w, bitmatrix, lost, smart);
+  }
+
+  return cache;
+}
+
+/* jerasure_invert_bitmatrix - invert a rows x rows matrix over GF(2) by
+   Gauss-Jordan elimination.
+
+   mat is reduced in place (and so destroyed); inv must be supplied by the
+   caller and receives the inverse.  Returns 0 on success and -1 as soon as
+   a column without a usable pivot shows the matrix to be singular. */
+int jerasure_invert_bitmatrix(int *mat, int *inv, int rows)
+{
+  int n = rows;          /* the matrix is square */
+  int piv, r, c;
+  int swap;
+
+  /* inv starts out as the identity */
+  for (r = 0; r < n; r++) {
+    for (c = 0; c < n; c++) {
+      inv[r*n+c] = (r == c);
+    }
+  }
+
+  /* Forward pass: reduce mat to upper triangular form */
+  for (piv = 0; piv < n; piv++) {
+
+    if (mat[piv*n+piv] == 0) {
+      /* look below for a row with a one in this column and swap it up */
+      r = piv+1;
+      while (r < n && mat[r*n+piv] == 0) r++;
+      if (r == n) return -1;
+      for (c = 0; c < n; c++) {
+        swap = mat[piv*n+c]; mat[piv*n+c] = mat[r*n+c]; mat[r*n+c] = swap;
+        swap = inv[piv*n+c]; inv[piv*n+c] = inv[r*n+c]; inv[r*n+c] = swap;
+      }
+    }
+
+    /* clear this column in every row below the pivot */
+    for (r = piv+1; r < n; r++) {
+      if (mat[r*n+piv] == 0) continue;
+      for (c = 0; c < n; c++) {
+        mat[r*n+c] ^= mat[piv*n+c];
+        inv[r*n+c] ^= inv[piv*n+c];
+      }
+    }
+  }
+
+  /* Backward pass: from the last pivot upward, clear the column above it */
+  for (piv = n-1; piv >= 0; piv--) {
+    for (r = 0; r < piv; r++) {
+      if (mat[r*n+piv] == 0) continue;
+      for (c = 0; c < n; c++) {
+        mat[r*n+c] ^= mat[piv*n+c];
+        inv[r*n+c] ^= inv[piv*n+c];
+      }
+    }
+  }
+  return 0;
+}
+
+/* jerasure_invertible_bitmatrix - test whether a rows x rows matrix over
+   GF(2) is invertible.
+
+   Only the forward (upper-triangular) elimination is performed, directly
+   on mat, which is therefore destroyed.  Returns 1 if every column yields
+   a pivot and 0 otherwise. */
+int jerasure_invertible_bitmatrix(int *mat, int rows)
+{
+  int n = rows;          /* the matrix is square */
+  int piv, r, c;
+  int swap;
+
+  for (piv = 0; piv < n; piv++) {
+
+    if (mat[piv*n+piv] == 0) {
+      /* find a lower row with a one in this column; none means singular */
+      r = piv+1;
+      while (r < n && mat[r*n+piv] == 0) r++;
+      if (r == n) return 0;
+      for (c = 0; c < n; c++) {
+        swap = mat[piv*n+c]; mat[piv*n+c] = mat[r*n+c]; mat[r*n+c] = swap;
+      }
+    }
+
+    /* clear this column in every row below the pivot */
+    for (r = piv+1; r < n; r++) {
+      if (mat[r*n+piv] == 0) continue;
+      for (c = 0; c < n; c++) {
+        mat[r*n+c] ^= mat[piv*n+c];
+      }
+    }
+  }
+  return 1;
+}
+
+
+/* jerasure_matrix_multiply - multiply the r1 x c1 matrix m1 by the r2 x c2
+   matrix m2 in GF(2^w).
+
+   Returns a newly allocated r1 x c2 matrix that the caller must free, or
+   NULL if it cannot be allocated.  c1 is expected to equal r2; this is not
+   checked. */
+int *jerasure_matrix_multiply(int *m1, int *m2, int r1, int c1, int r2, int c2, int w)
+{
+  int *product, i, j, k;
+
+  /* calloc hands back the zeroed accumulator the XOR sums start from */
+  product = (int *) calloc(r1*c2, sizeof(int));
+  if (product == NULL) return NULL;
+
+  for (i = 0; i < r1; i++) {
+    for (j = 0; j < c2; j++) {
+      for (k = 0; k < r2; k++) {
+        /* addition in GF(2^w) is XOR */
+        product[i*c2+j] ^= galois_single_multiply(m1[i*c1+k], m2[k*c2+j], w);
+      }
+    }
+  }
+  return product;
+}
+
+/* jerasure_get_stats - report and reset the running byte counters.
+
+   fill_in must have room for three doubles:
+     fill_in[0] = jerasure_total_xor_bytes
+     fill_in[1] = jerasure_total_gf_bytes
+     fill_in[2] = jerasure_total_memcpy_bytes
+   Each counter is set back to zero once it has been reported. */
+void jerasure_get_stats(double *fill_in)
+{
+  fill_in[0] = jerasure_total_xor_bytes;
+  jerasure_total_xor_bytes = 0;
+
+  fill_in[1] = jerasure_total_gf_bytes;
+  jerasure_total_gf_bytes = 0;
+
+  fill_in[2] = jerasure_total_memcpy_bytes;
+  jerasure_total_memcpy_bytes = 0;
+}
+
+/* jerasure_do_scheduled_operations - run one pass of a schedule.
+
+   Each operation is five ints:
+     [0] source device (a negative value ends the schedule)
+     [1] source packet
+     [2] destination device
+     [3] destination packet
+     [4] zero to copy the source packet, non-zero to XOR it into the
+         destination packet
+   Devices index ptrs; packets are packetsize-byte offsets into a device.
+   The XOR and memcpy byte counters are updated as the operations run. */
+void jerasure_do_scheduled_operations(char **ptrs, int **operations, int packetsize)
+{
+  int **cur;
+
+  for (cur = operations; (*cur)[0] >= 0; cur++) {
+    int *o = *cur;
+    char *src = ptrs[o[0]] + o[1]*packetsize;
+    char *dst = ptrs[o[2]] + o[3]*packetsize;
+
+    if (o[4] == 0) {
+      memcpy(dst, src, packetsize);
+      jerasure_total_memcpy_bytes += packetsize;
+    } else {
+      galois_region_xor(src, dst, dst, packetsize);
+      jerasure_total_xor_bytes += packetsize;
+    }
+  }
+}
+
+/* jerasure_schedule_encode - encode by running an encoding schedule over
+   the buffers in steps of packetsize*w bytes.
+
+   The schedule addresses data devices as 0..k-1 and coding devices as
+   k..k+m-1, so a private table of k+m pointers is built and advanced; the
+   caller's data_ptrs and coding_ptrs arrays are not modified. */
+void jerasure_schedule_encode(int k, int m, int w, int **schedule,
+                                   char **data_ptrs, char **coding_ptrs, int size, int packetsize)
+{
+  char **dev;
+  int d, done;
+
+  dev = talloc(char *, (k+m));
+
+  /* data devices first, coding devices right behind them */
+  for (d = 0; d < k; d++) {
+    dev[d] = data_ptrs[d];
+  }
+  for (d = 0; d < m; d++) {
+    dev[k+d] = coding_ptrs[d];
+  }
+
+  done = 0;
+  while (done < size) {
+    jerasure_do_scheduled_operations(dev, schedule, packetsize);
+    for (d = 0; d < k+m; d++) {
+      dev[d] += (packetsize*w);
+    }
+    done += packetsize*w;
+  }
+
+  free(dev);
+}
+
+/* jerasure_dumb_bitmatrix_to_schedule - turn an m*w by k*w bitmatrix into a
+   schedule in the straightforward way: for each row, copy the packet of
+   its first set bit into the destination packet and XOR in the packet of
+   every further set bit.
+
+   Returns the schedule (its last entry has [0] == -1), or NULL if an
+   allocation fails; nothing is leaked in that case. */
+int **jerasure_dumb_bitmatrix_to_schedule(int k, int m, int w, int *bitmatrix)
+{
+  int **operations;
+  int op;
+  int index, optodo, i, j;
+
+  /* at most one operation per matrix entry, plus the terminator */
+  operations = talloc(int *, k*m*w*w+1);
+  if (operations == NULL) return NULL;
+  op = 0;
+
+  index = 0;
+  for (i = 0; i < m*w; i++) {
+    optodo = 0;   /* first set bit of a row is a copy, the rest are XORs */
+    for (j = 0; j < k*w; j++) {
+      if (bitmatrix[index]) {
+        operations[op] = talloc(int, 5);
+        if (operations[op] == NULL) goto fail;
+        operations[op][4] = optodo;
+        operations[op][0] = j/w;
+        operations[op][1] = j%w;
+        operations[op][2] = k+i/w;
+        operations[op][3] = i%w;
+        optodo = 1;
+        op++;
+      }
+      index++;
+    }
+  }
+
+  operations[op] = talloc(int, 5);
+  if (operations[op] == NULL) goto fail;
+  operations[op][0] = -1;
+  return operations;
+
+fail:
+  /* entries 0..op-1 were allocated successfully */
+  while (op > 0) free(operations[--op]);
+  free(operations);
+  return NULL;
+}
+
+/* jerasure_smart_bitmatrix_to_schedule - turn an m*w by k*w bitmatrix into a
+   schedule, reusing already computed rows where that is cheaper.
+
+   Rows that are still to be scheduled sit on a doubly linked list
+   (flink/blink, head in top).  diff[i] is the cheapest known cost of row i
+   and from[i] the finished row it would start from (-1: compute it from
+   the source packets).  The cheapest row is scheduled, removed from the
+   list, and the remaining rows are re-costed against it.
+
+   A matrix with no rows yields a schedule holding only the terminator.
+   Returns the schedule (its last entry has [0] == -1), or NULL if an
+   allocation fails; nothing is leaked in that case. */
+int **jerasure_smart_bitmatrix_to_schedule(int k, int m, int w, int *bitmatrix)
+{
+  int **operations;
+  int op;
+  int i, j;
+  int *diff = NULL, *from = NULL, *b1, *flink = NULL, *blink = NULL;
+  int *ptr, no, row;
+  int optodo;
+  int bestrow = 0, bestdiff, top;
+  int nrows = m*w;
+
+  operations = talloc(int *, k*m*w*w+1);
+  if (operations == NULL) return NULL;
+  op = 0;
+
+  if (nrows > 0) {
+    diff = talloc(int, nrows);
+    from = talloc(int, nrows);
+    flink = talloc(int, nrows);
+    blink = talloc(int, nrows);
+    if (diff == NULL || from == NULL || flink == NULL || blink == NULL) goto fail;
+  }
+
+  ptr = bitmatrix;
+
+  /* Initial cost of a row is its number of set bits; link the rows in order */
+  bestdiff = k*w+1;
+  top = (nrows > 0) ? 0 : -1;   /* no rows: the list starts out empty */
+  for (i = 0; i < nrows; i++) {
+    no = 0;
+    for (j = 0; j < k*w; j++) {
+      no += *ptr;
+      ptr++;
+    }
+    diff[i] = no;
+    from[i] = -1;
+    flink[i] = i+1;
+    blink[i] = i-1;
+    if (no < bestdiff) {
+      bestdiff = no;
+      bestrow = i;
+    }
+  }
+
+  if (nrows > 0) flink[nrows-1] = -1;
+
+  while (top != -1) {
+    row = bestrow;
+
+    /* unlink row from the list of pending rows */
+    if (blink[row] == -1) {
+      top = flink[row];
+      if (top != -1) blink[top] = -1;
+    } else {
+      flink[blink[row]] = flink[row];
+      if (flink[row] != -1) {
+        blink[flink[row]] = blink[row];
+      }
+    }
+
+    ptr = bitmatrix + row*k*w;
+    if (from[row] == -1) {
+      /* compute the row directly: copy the first packet, XOR the rest */
+      optodo = 0;
+      for (j = 0; j < k*w; j++) {
+        if (ptr[j]) {
+          operations[op] = talloc(int, 5);
+          if (operations[op] == NULL) goto fail;
+          operations[op][4] = optodo;
+          operations[op][0] = j/w;
+          operations[op][1] = j%w;
+          operations[op][2] = k+row/w;
+          operations[op][3] = row%w;
+          optodo = 1;
+          op++;
+        }
+      }
+    } else {
+      /* start from the finished row from[row], then XOR in the differences */
+      operations[op] = talloc(int, 5);
+      if (operations[op] == NULL) goto fail;
+      operations[op][4] = 0;
+      operations[op][0] = k+from[row]/w;
+      operations[op][1] = from[row]%w;
+      operations[op][2] = k+row/w;
+      operations[op][3] = row%w;
+      op++;
+      b1 = bitmatrix + from[row]*k*w;
+      for (j = 0; j < k*w; j++) {
+        if (ptr[j] ^ b1[j]) {
+          operations[op] = talloc(int, 5);
+          if (operations[op] == NULL) goto fail;
+          operations[op][4] = 1;
+          operations[op][0] = j/w;
+          operations[op][1] = j%w;
+          operations[op][2] = k+row/w;
+          operations[op][3] = row%w;
+          op++;
+        }
+      }
+    }
+
+    /* Re-cost the pending rows against the row just scheduled (one copy
+       plus one XOR per differing bit) and pick the next cheapest row */
+    bestdiff = k*w+1;
+    for (i = top; i != -1; i = flink[i]) {
+      no = 1;
+      b1 = bitmatrix + i*k*w;
+      for (j = 0; j < k*w; j++) no += (ptr[j] ^ b1[j]);
+      if (no < diff[i]) {
+        from[i] = row;
+        diff[i] = no;
+      }
+      if (diff[i] < bestdiff) {
+        bestdiff = diff[i];
+        bestrow = i;
+      }
+    }
+  }
+
+  operations[op] = talloc(int, 5);
+  if (operations[op] == NULL) goto fail;
+  operations[op][0] = -1;
+  free(from);
+  free(diff);
+  free(blink);
+  free(flink);
+
+  return operations;
+
+fail:
+  /* entries 0..op-1 were allocated successfully; free(NULL) is a no-op */
+  while (op > 0) free(operations[--op]);
+  free(operations);
+  free(from);
+  free(diff);
+  free(blink);
+  free(flink);
+  return NULL;
+}
+
+/* jerasure_bitmatrix_encode - compute all m coding devices from the data
+   devices with one bitmatrix dot product per coding device.
+
+   packetsize must be a multiple of sizeof(long) and size a multiple of
+   packetsize*w; otherwise a message is printed to stderr and the process
+   exits with status 1. */
+void jerasure_bitmatrix_encode(int k, int m, int w, int *bitmatrix,
+                            char **data_ptrs, char **coding_ptrs, int size, int packetsize)
+{
+  int dev;
+  int *rowblock;   /* the k*w*w ints that describe the current coding device */
+
+  if (packetsize%sizeof(long) != 0) {
+    fprintf(stderr, "jerasure_bitmatrix_encode - packetsize(%d) %c sizeof(long) != 0\n", packetsize, '%');
+    exit(1);
+  }
+  if (size%(packetsize*w) != 0) {
+    fprintf(stderr, "jerasure_bitmatrix_encode - size(%d) %c (packetsize(%d)*w(%d))) != 0\n",
+         size, '%', packetsize, w);
+    exit(1);
+  }
+
+  rowblock = bitmatrix;
+  for (dev = 0; dev < m; dev++) {
+    jerasure_bitmatrix_dotprod(k, w, rowblock, NULL, k+dev, data_ptrs, coding_ptrs, size, packetsize);
+    rowblock += k*w*w;
+  }
+}
+
diff --git a/src/osd/ErasureCodePluginJerasure/jerasure.h b/src/osd/ErasureCodePluginJerasure/jerasure.h
new file mode 100755
index 00000000000..8cc25cad839
--- /dev/null
+++ b/src/osd/ErasureCodePluginJerasure/jerasure.h
@@ -0,0 +1,300 @@
+/* jerasure.h - header of kernel procedures
+ * James S. Plank
+
+Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure Coding Techniques
+
+Revision 1.2A
+May 24, 2011
+
+James S. Plank
+Department of Electrical Engineering and Computer Science
+University of Tennessee
+Knoxville, TN 37996
+plank@cs.utk.edu
+
+Copyright (c) 2011, James S. Plank
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ - Neither the name of the University of Tennessee nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+ */
+
+#ifndef _JERASURE_H
+#define _JERASURE_H
+
+/* This uses procedures from the Galois Field arithmetic library */
+
+#include "galois.h"
+
+/* ------------------------------------------------------------ */
+/* In all of the routines below:
+
+ k = Number of data devices
+ m = Number of coding devices
+ w = Word size
+
+ data_ptrs = An array of k pointers to data which is size bytes.
+ Size must be a multiple of sizeof(long).
+ Pointers must also be longword aligned.
+
+ coding_ptrs = An array of m pointers to coding data which is size bytes.
+
+ packetsize = The size of a coding block with bitmatrix coding.
+ When you code with a bitmatrix, you will use w packets
+ of size packetsize.
+
+ matrix = an array of k*m integers.
+ It represents an m by k matrix.
+ Element i,j is in matrix[i*k+j];
+
+ bitmatrix = an array of k*m*w*w integers.
+ It represents an mw by kw matrix.
+ Element i,j is in matrix[i*k*w+j];
+
+ erasures = an array of id's of erased devices.
+ Id's are integers between 0 and k+m-1.
+ Id's 0 to k-1 are id's of data devices.
+ Id's k to k+m-1 are id's of coding devices:
+ Coding device id = id-k.
+ If there are e erasures, erasures[e] = -1.
+
+ schedule = an array of schedule operations.
+
+ If there are m operations, then schedule[m][0] = -1.
+
+ operation = an array of 5 integers:
+
+ 0 = source device (0 - k+m-1), or -1 to mark the end of the schedule
+ 1 = source packet (0 - w-1)
+ 2 = destination device (0 - k+m-1)
+ 3 = destination packet (0 - w-1)
+ 4 = operation: 0 for copy, 1 for xor
+ */
+
+/* --------------------------------------------------------------- */
+/* Bitmatrices / schedules ---------------------------------------- */
+/*
+ - jerasure_matrix_to_bitmatrix turns a m X k matrix in GF(2^w) into a
+ wm X wk bitmatrix (in GF(2)). This is
+ explained in the Cauchy Reed-Solomon coding
+ paper.
+
+ - jerasure_dumb_bitmatrix_to_schedule turns a bitmatrix into a schedule
+ using the straightforward algorithm -- just
+ schedule the dot products defined by each
+ row of the matrix.
+
+ - jerasure_smart_bitmatrix_to_schedule turns a bitmatrix into a schedule,
+ but tries to use previous dot products to
+ calculate new ones. This is the optimization
+ explained in the original Liberation code paper.
+
+ - jerasure_generate_schedule_cache precalculates all of the schedules for the
+ given distribution bitmatrix. M must equal 2.
+
+ - jerasure_free_schedule frees a schedule that was allocated with
+ jerasure_XXX_bitmatrix_to_schedule.
+
+ - jerasure_free_schedule_cache frees a schedule cache that was created with
+ jerasure_generate_schedule_cache.
+ */
+
+int *jerasure_matrix_to_bitmatrix(int k, int m, int w, int *matrix);
+int **jerasure_dumb_bitmatrix_to_schedule(int k, int m, int w, int *bitmatrix);
+int **jerasure_smart_bitmatrix_to_schedule(int k, int m, int w, int *bitmatrix);
+int ***jerasure_generate_schedule_cache(int k, int m, int w, int *bitmatrix, int smart);
+
+void jerasure_free_schedule(int **schedule);
+void jerasure_free_schedule_cache(int k, int m, int ***cache);
+
+
+/* ------------------------------------------------------------ */
+/* Encoding - these are all straightforward. jerasure_matrix_encode only
+ works with w = 8|16|32. */
+
+void jerasure_do_parity(int k, char **data_ptrs, char *parity_ptr, int size);
+
+void jerasure_matrix_encode(int k, int m, int w, int *matrix,
+ char **data_ptrs, char **coding_ptrs, int size);
+
+void jerasure_bitmatrix_encode(int k, int m, int w, int *bitmatrix,
+ char **data_ptrs, char **coding_ptrs, int size, int packetsize);
+
+void jerasure_schedule_encode(int k, int m, int w, int **schedule,
+ char **data_ptrs, char **coding_ptrs, int size, int packetsize);
+
+/* ------------------------------------------------------------ */
+/* Decoding. -------------------------------------------------- */
+
+/* These return integers, because the matrix may not be invertible.
+
+ The parameter row_k_ones should be set to 1 if row k of the distribution
+ matrix (or rows kw to (k+1)w-1 of the distribution bitmatrix) is all ones
+ (or all identity matrices). Then you can improve the performance
+ of decoding when there is more than one failure, and the parity
+ device didn't fail. You do it by decoding all but one of the data
+ devices, and then decoding the last data device from the data devices
+ and the parity device.
+
+ jerasure_schedule_decode_lazy generates the schedule on the fly.
+
+ jerasure_matrix_decode only works when w = 8|16|32.
+
+ jerasure_make_decoding_matrix/bitmatrix make the k*k decoding matrix
+ (or wk*wk bitmatrix) by taking the rows corresponding to k
+ non-erased devices of the distribution matrix, and then
+ inverting that matrix.
+
+ You should already have allocated the decoding matrix and
+ dm_ids, which is a vector of k integers. These will be
+ filled in appropriately. dm_ids[i] is the id of element
+ i of the survivors vector. I.e. row i of the decoding matrix
+ times dm_ids equals data drive i.
+
+ Both of these routines take "erased" instead of "erasures".
+ Erased is a vector with k+m elements, which has 0 or 1 for
+ each device's id, according to whether the device is erased.
+
+ jerasure_erasures_to_erased allocates and returns erased from erasures.
+
+ */
+
+int jerasure_matrix_decode(int k, int m, int w,
+ int *matrix, int row_k_ones, int *erasures,
+ char **data_ptrs, char **coding_ptrs, int size);
+
+int jerasure_bitmatrix_decode(int k, int m, int w,
+ int *bitmatrix, int row_k_ones, int *erasures,
+ char **data_ptrs, char **coding_ptrs, int size, int packetsize);
+
+int jerasure_schedule_decode_lazy(int k, int m, int w, int *bitmatrix, int *erasures,
+ char **data_ptrs, char **coding_ptrs, int size, int packetsize,
+ int smart);
+
+int jerasure_schedule_decode_cache(int k, int m, int w, int ***scache, int *erasures,
+ char **data_ptrs, char **coding_ptrs, int size, int packetsize);
+
+int jerasure_make_decoding_matrix(int k, int m, int w, int *matrix, int *erased,
+ int *decoding_matrix, int *dm_ids);
+
+int jerasure_make_decoding_bitmatrix(int k, int m, int w, int *matrix, int *erased,
+ int *decoding_matrix, int *dm_ids);
+
+int *jerasure_erasures_to_erased(int k, int m, int *erasures);
+
+/* ------------------------------------------------------------ */
+/* These perform dot products and schedules. -------------------*/
+/*
+ src_ids is a matrix of k id's (0 - k-1 for data devices, k - k+m-1
+ for coding devices) that identify the source devices. Dest_id is
+ the id of the destination device.
+
+ jerasure_matrix_dotprod only works when w = 8|16|32.
+
+ jerasure_do_scheduled_operations executes the schedule on w*packetsize worth of
+ bytes from each device. ptrs is an array of pointers which should have as many
+ elements as the highest referenced device in the schedule.
+
+ */
+
+void jerasure_matrix_dotprod(int k, int w, int *matrix_row,
+ int *src_ids, int dest_id,
+ char **data_ptrs, char **coding_ptrs, int size);
+
+void jerasure_bitmatrix_dotprod(int k, int w, int *bitmatrix_row,
+ int *src_ids, int dest_id,
+ char **data_ptrs, char **coding_ptrs, int size, int packetsize);
+
+void jerasure_do_scheduled_operations(char **ptrs, int **schedule, int packetsize);
+
+/* ------------------------------------------------------------ */
+/* Matrix Inversion ------------------------------------------- */
+/*
+ The two matrix inversion functions work on rows*rows matrices of
+ ints. If a bitmatrix, then each int will just be zero or one.
+ Otherwise, they will be elements of gf(2^w). Obviously, you can
+ do bit matrices with jerasure_invert_matrix() and set w = 1, but
+ jerasure_invert_bitmatrix() will be more efficient.
+
+ The two invertible functions return whether a matrix is invertible.
+ They are more efficient than the inversion functions.
+
+ Mat will be destroyed when the matrix inversion or invertible
+ testing is done. Sorry.
+
+ Inv must be allocated by the caller.
+
+ The two invert_matrix functions return 0 on success, and -1 if the
+ matrix is uninvertible.
+
+ The two invertible function simply return whether the matrix is
+ invertible. (0 or 1). Mat will be destroyed.
+ */
+
+int jerasure_invert_matrix(int *mat, int *inv, int rows, int w);
+int jerasure_invert_bitmatrix(int *mat, int *inv, int rows);
+int jerasure_invertible_matrix(int *mat, int rows, int w);
+int jerasure_invertible_bitmatrix(int *mat, int rows);
+
+/* ------------------------------------------------------------ */
+/* Basic matrix operations -------------------------------------*/
+/*
+ Each of the print_matrix routines require a w. In jerasure_print_matrix,
+ this is to calculate the field width. In jerasure_print_bitmatrix, it is
+ to put spaces between the bits.
+
+ jerasure_matrix_multiply is a simple matrix multiplier in GF(2^w). It returns a r1*c2
+ matrix, which is the product of the two input matrices. It allocates
+ the product. Obviously, c1 should equal r2. However, this is not
+ validated by the procedure.
+*/
+
+void jerasure_print_matrix(int *matrix, int rows, int cols, int w);
+void jerasure_print_bitmatrix(int *matrix, int rows, int cols, int w);
+
+
+int *jerasure_matrix_multiply(int *m1, int *m2, int r1, int c1, int r2, int c2, int w);
+
+/* ------------------------------------------------------------ */
+/* Stats ------------------------------------------------------ */
+/*
+ jerasure_get_stats fills in a vector of three doubles:
+
+ fill_in[0] is the number of bytes that have been XOR'd
+ fill_in[1] is the number of bytes that have been multiplied
+ by a constant in GF(2^w)
+ fill_in[2] is the number of bytes that have been copied
+
+ When jerasure_get_stats() is called, it resets its values.
+ */
+
+void jerasure_get_stats(double *fill_in);
+
+#endif
diff --git a/src/osd/ErasureCodePluginJerasure/liberation.c b/src/osd/ErasureCodePluginJerasure/liberation.c
new file mode 100755
index 00000000000..beca10df9f4
--- /dev/null
+++ b/src/osd/ErasureCodePluginJerasure/liberation.c
@@ -0,0 +1,265 @@
+/* liberation.c
+ * James S. Plank
+
+Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure Coding Techniques
+
+Revision 1.2A
+May 24, 2011
+
+James S. Plank
+Department of Electrical Engineering and Computer Science
+University of Tennessee
+Knoxville, TN 37996
+plank@cs.utk.edu
+
+Copyright (c) 2011, James S. Plank
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ - Neither the name of the University of Tennessee nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "galois.h"
+#include "jerasure.h"
+#include "liberation.h"
+
+#define talloc(type, num) (type *) malloc(sizeof(type)*(num))
+
+/*
+ * liberation_coding_bitmatrix - build the coding bit-matrix of a
+ * Liberation code for k data devices and word size w.
+ *
+ * The result is a freshly allocated array of 2*w rows by k*w columns of
+ * ints (each 0 or 1), stored row-major:
+ *   - rows 0 .. w-1 hold k identity blocks (w x w) side by side;
+ *   - rows w .. 2w-1 hold, in block j, the identity cyclically shifted by
+ *     j columns, plus (for j > 0) one extra bit in row
+ *     (j*((w-1)/2)) % w.
+ *
+ * Returns NULL when k > w or when the allocation fails. The caller owns
+ * the matrix and releases it with free().
+ *
+ * NOTE(review): the Liberation construction presumably requires w to be
+ * prime; nothing here checks it -- confirm against callers.
+ */
+int *liberation_coding_bitmatrix(int k, int w)
+{
+  int *matrix, i, j, index;
+
+  if (k > w) return NULL;
+
+  /* calloc() hands back zeroed storage, so no separate clearing pass is
+     needed (the former bzero() call is a legacy interface declared in
+     <strings.h>, which this file does not include). */
+  matrix = (int *) calloc(2*k*w*w, sizeof(int));
+  if (matrix == NULL) return NULL;
+
+  /* Set up identity matrices */
+
+  for (i = 0; i < w; i++) {
+    index = i*k*w+i;
+    for (j = 0; j < k; j++) {
+      matrix[index] = 1;
+      index += w;
+    }
+  }
+
+  /* Set up liberation matrices */
+
+  for (j = 0; j < k; j++) {
+    index = k*w*w+j*w;
+    for (i = 0; i < w; i++) {
+      matrix[index+(j+i)%w] = 1;
+      index += (k*w);
+    }
+    if (j > 0) {
+      /* The single extra bit that distinguishes block j from a pure shift. */
+      i = (j*((w-1)/2))%w;
+      matrix[k*w*w+j*w+i*k*w+(i+j-1)%w] = 1;
+    }
+  }
+  return matrix;
+}
+
+
+/*
+ * liber8tion_coding_bitmatrix - build the coding bit-matrix of the
+ * Liber8tion code (fixed word size w = 8) for k data devices.
+ *
+ * The result is a freshly allocated array of 16 rows by 8*k columns of
+ * ints (each 0 or 1), stored row-major:
+ *   - rows 0 .. 7 hold k identity blocks (8 x 8) side by side;
+ *   - rows 8 .. 15 hold, in block j, a permutation matrix given by
+ *     perm[j], plus (for j >= 1) one extra bit given by extra[j].
+ *
+ * Returns NULL when k is negative, when k > 8, or when the allocation
+ * fails. The caller owns the matrix and releases it with free().
+ */
+int *liber8tion_coding_bitmatrix(int k)
+{
+  /* perm[j][i] is the column, inside block j of the lower half, of the
+     1 bit in row i. */
+  static const int perm[8][8] = {
+    { 0, 1, 2, 3, 4, 5, 6, 7 },
+    { 7, 3, 0, 2, 6, 1, 5, 4 },
+    { 6, 2, 4, 0, 7, 3, 1, 5 },
+    { 2, 5, 7, 6, 0, 3, 4, 1 },
+    { 5, 6, 1, 7, 2, 4, 3, 0 },
+    { 1, 2, 3, 4, 5, 6, 7, 0 },
+    { 3, 0, 6, 5, 1, 7, 4, 2 },
+    { 4, 7, 1, 5, 3, 2, 0, 6 }
+  };
+  /* extra[j] = { row, column } of the one additional 1 bit in block j.
+     Block 0 has none; its entry is never read. */
+  static const int extra[8][2] = {
+    { -1, -1 }, { 4, 7 }, { 1, 3 }, { 5, 4 },
+    { 2, 0 }, { 7, 2 }, { 6, 5 }, { 3, 1 }
+  };
+  const int w = 8;
+  int *matrix, i, j, index;
+
+  if (k < 0 || k > w) return NULL;
+
+  /* calloc() hands back zeroed storage, so no separate clearing pass is
+     needed (the former bzero() call is a legacy interface declared in
+     <strings.h>, which this file does not include). */
+  matrix = (int *) calloc(2*k*w*w, sizeof(int));
+  if (matrix == NULL) return NULL;
+
+  /* Set up identity matrices */
+
+  for (i = 0; i < w; i++) {
+    index = i*k*w+i;
+    for (j = 0; j < k; j++) {
+      matrix[index] = 1;
+      index += w;
+    }
+  }
+
+  /* Set up liber8tion matrices */
+
+  index = k*w*w;
+  for (j = 0; j < k; j++) {
+    for (i = 0; i < w; i++) {
+      matrix[index+i*k*w+j*w+perm[j][i]] = 1;
+    }
+    if (j > 0) {
+      matrix[index+extra[j][0]*k*w+j*w+extra[j][1]] = 1;
+    }
+  }
+
+  return matrix;
+}
+
+/*
+ * blaum_roth_coding_bitmatrix - build the coding bit-matrix of a
+ * Blaum-Roth code for k data devices and word size w.
+ *
+ * The result is a freshly allocated array of 2*w rows by k*w columns of
+ * ints (each 0 or 1), stored row-major. Rows 0 .. w-1 hold k identity
+ * blocks side by side; rows w .. 2w-1 hold the second coding device,
+ * whose block 0 is again the identity and whose block j > 0 is filled in
+ * by the arithmetic modulo p = w+1 below.
+ *
+ * Returns NULL when k > w or when the allocation fails. The caller owns
+ * the matrix and releases it with free().
+ *
+ * NOTE(review): the Blaum-Roth construction presumably requires w+1 to
+ * be prime; nothing here checks it -- confirm against callers.
+ */
+int *blaum_roth_coding_bitmatrix(int k, int w)
+{
+  int *matrix, i, j, index, l, m, p;
+
+  if (k > w) return NULL;
+
+  /* calloc() hands back zeroed storage, so no separate clearing pass is
+     needed (the former bzero() call is a legacy interface declared in
+     <strings.h>, which this file does not include). */
+  matrix = (int *) calloc(2*k*w*w, sizeof(int));
+  if (matrix == NULL) return NULL;
+
+  /* Set up identity matrices */
+
+  for (i = 0; i < w; i++) {
+    index = i*k*w+i;
+    for (j = 0; j < k; j++) {
+      matrix[index] = 1;
+      index += w;
+    }
+  }
+
+  /* Set up blaum_roth matrices -- Ignore identity */
+
+  p = w+1;
+  for (j = 0; j < k; j++) {
+    index = k*w*w+j*w;          /* row w, first column of block j */
+    if (j == 0) {
+      /* Block 0 of the second device is the identity. */
+      for (l = 0; l < w; l++) {
+        matrix[index+l] = 1;
+        index += k*w;
+      }
+    } else {
+      i = j;
+      /* l is the 1-based row number inside the block. */
+      for (l = 1; l <= w; l++) {
+        if (l != p-i) {
+          /* Ordinary row: a single bit at column ((l+i) mod p) - 1. */
+          m = l+i;
+          if (m >= p) m -= p;
+          m--;
+          matrix[index+m] = 1;
+        } else {
+          /* Row p-i: two bits, one at column i-1 and one at a column
+             that depends on the parity of i. */
+          matrix[index+i-1] = 1;
+          if (i%2 == 0) {
+            m = i/2;
+          } else {
+            m = (p/2) + 1 + (i/2);
+          }
+          m--;
+          matrix[index+m] = 1;
+        }
+        index += k*w;
+      }
+    }
+  }
+
+  return matrix;
+}
diff --git a/src/osd/ErasureCodePluginJerasure/liberation.h b/src/osd/ErasureCodePluginJerasure/liberation.h
new file mode 100755
index 00000000000..ee176e877c8
--- /dev/null
+++ b/src/osd/ErasureCodePluginJerasure/liberation.h
@@ -0,0 +1,56 @@
+/* liberation.h
+ * James S. Plank
+
+Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure Coding Techniques
+
+Revision 1.2A
+May 24, 2011
+
+James S. Plank
+Department of Electrical Engineering and Computer Science
+University of Tennessee
+Knoxville, TN 37996
+plank@cs.utk.edu
+
+Copyright (c) 2011, James S. Plank
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ - Neither the name of the University of Tennessee nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+ */
+
+#ifndef _LIBERATION
+#define _LIBERATION
+
+/*
+ * Bit-matrix generators for the minimal-density RAID-6 codes. Each
+ * returns a malloc-family allocation of 2*w rows by k*w columns of ints
+ * (w is fixed at 8 for liber8tion), or NULL when k > w or the
+ * allocation fails. The caller releases the matrix with free().
+ */
+extern int *liberation_coding_bitmatrix(int k, int w);
+extern int *liber8tion_coding_bitmatrix(int k);
+extern int *blaum_roth_coding_bitmatrix(int k, int w);
+
+#endif
diff --git a/src/osd/ErasureCodePluginJerasure/reed_sol.c b/src/osd/ErasureCodePluginJerasure/reed_sol.c
new file mode 100755
index 00000000000..003eb419ad0
--- /dev/null
+++ b/src/osd/ErasureCodePluginJerasure/reed_sol.c
@@ -0,0 +1,368 @@
+/* reed_sol.c
+ * James S. Plank
+
+Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure Coding Techniques
+
+Revision 1.2A
+May 24, 2011
+
+James S. Plank
+Department of Electrical Engineering and Computer Science
+University of Tennessee
+Knoxville, TN 37996
+plank@cs.utk.edu
+
+Copyright (c) 2011, James S. Plank
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ - Neither the name of the University of Tennessee nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "galois.h"
+#include "jerasure.h"
+#include "reed_sol.h"
+
+#define talloc(type, num) (type *) malloc(sizeof(type)*(num))
+
+/*
+ * reed_sol_r6_coding_matrix - build the 2 x k RAID-6 coding matrix.
+ *
+ * Row 0 is all ones. Row 1 is 1, 2, 4, ... : each entry is the previous
+ * one multiplied by 2 in GF(2^w).
+ *
+ * Returns NULL when w is not 8, 16 or 32, when k < 1, or when the
+ * allocation fails. The caller releases the matrix with free().
+ */
+int *reed_sol_r6_coding_matrix(int k, int w)
+{
+  int *matrix;
+  int i, tmp;
+
+  if (w != 8 && w != 16 && w != 32) return NULL;
+
+  /* With k == 0 the allocation below would be empty and the store to
+     matrix[k] would land outside it. */
+  if (k < 1) return NULL;
+
+  matrix = talloc(int, 2*k);
+  if (matrix == NULL) return NULL;
+
+  for (i = 0; i < k; i++) matrix[i] = 1;
+  matrix[k] = 1;
+  tmp = 1;
+  for (i = 1; i < k; i++) {
+    tmp = galois_single_multiply(tmp, 2, w);
+    matrix[k+i] = tmp;
+  }
+  return matrix;
+}
+
+/*
+ * reed_sol_vandermonde_coding_matrix - return the m x k coding matrix
+ * taken from the bottom m rows of the (k+m) x k distribution matrix
+ * produced by reed_sol_big_vandermonde_distribution_matrix().
+ *
+ * Returns NULL when the distribution matrix cannot be built or when the
+ * allocation fails. The caller releases the result with free().
+ */
+int *reed_sol_vandermonde_coding_matrix(int k, int m, int w)
+{
+  int *full, *coding;
+  int n;
+
+  full = reed_sol_big_vandermonde_distribution_matrix(k+m, k, w);
+  if (full == NULL) return NULL;
+
+  coding = talloc(int, m*k);
+  if (coding != NULL) {
+    /* Skip the first k rows (k*k entries) and keep the rest. */
+    for (n = 0; n < m*k; n++) coding[n] = full[k*k+n];
+  }
+
+  free(full);
+  return coding;
+}
+
+/* Value XORed into a word whose top bit was shifted out; computed on
+   first use. -1 means "not yet computed". */
+static int prim32 = -1;
+
+#define rgw32_mask(v) ((v) & 0x80000000)
+
+/*
+ * reed_sol_galois_w32_region_multby_2 - multiply every 32-bit word of
+ * region by 2 in GF(2^32), in place.
+ *
+ * The words are handled as unsigned int, matching the w08/w16 variants:
+ * left-shifting a signed int whose top bit is set is undefined behavior
+ * in C.
+ *
+ * The loop runs while the word pointer is below region+nbytes, so a
+ * final partial word would be processed as a whole word.
+ * NOTE(review): callers presumably pass an int-aligned region and an
+ * nbytes that is a multiple of sizeof(int) -- confirm.
+ * NOTE(review): the first-use initialization of prim32 is not
+ * synchronized.
+ */
+void reed_sol_galois_w32_region_multby_2(char *region, int nbytes)
+{
+  unsigned int *l1;
+  unsigned int *ltop;
+
+  /* Build the top-bit operand as unsigned; (1 << 31) overflows int. */
+  if (prim32 == -1) prim32 = galois_single_multiply((int) (1U << 31), 2, 32);
+
+  ltop = (unsigned int *) (region + nbytes);
+
+  for (l1 = (unsigned int *) region; l1 < ltop; l1++) {
+    *l1 = ((*l1) << 1) ^ (rgw32_mask(*l1) ? (unsigned int) prim32 : 0U);
+  }
+}
+
+/* Per-byte constants replicated across a word; built on first use.
+   prim08 == -1 means "not yet built". */
+static int prim08 = -1;
+static int mask08_1 = -1;
+static int mask08_2 = -1;
+
+/*
+ * reed_sol_galois_w08_region_multby_2 - multiply every byte of region by
+ * 2 in GF(2^8), in place, one unsigned int word at a time.
+ *
+ * The loop runs while the word pointer is below region+nbytes.
+ */
+void reed_sol_galois_w08_region_multby_2(char *region, int nbytes)
+{
+  unsigned int *word;
+  unsigned int *end;
+  unsigned int pattern, shifted, high;
+
+  if (prim08 == -1) {
+    /* Value to XOR into a byte whose top bit was shifted out,
+       copied into every byte of the word. */
+    pattern = galois_single_multiply((1 << 7), 2, 8);
+    prim08 = 0;
+    for (; pattern != 0; pattern = (pattern << 8)) prim08 |= pattern;
+
+    /* Bits 1..7 of every byte: keeps a shifted bit from leaking
+       into the neighbouring byte. */
+    mask08_1 = 0;
+    for (pattern = (1 << 8) - 2; pattern != 0; pattern = (pattern << 8)) mask08_1 |= pattern;
+
+    /* Top bit of every byte. */
+    mask08_2 = 0;
+    for (pattern = (1 << 7); pattern != 0; pattern = (pattern << 8)) mask08_2 |= pattern;
+  }
+
+  end = (unsigned int *) (region + nbytes);
+
+  for (word = (unsigned int *) region; word < end; word++) {
+    shifted = ((*word) << 1) & mask08_1;
+    high = (*word) & mask08_2;
+    /* Turn each set top bit into a full 0xff byte selector. */
+    high = ((high << 1) - (high >> 7));
+    *word = (shifted ^ (high & prim08));
+  }
+}
+
+/* Per-halfword constants replicated across a word; built on first use.
+   prim16 == -1 means "not yet built". */
+static int prim16 = -1;
+static int mask16_1 = -1;
+static int mask16_2 = -1;
+
+/*
+ * reed_sol_galois_w16_region_multby_2 - multiply every 16-bit value of
+ * region by 2 in GF(2^16), in place, one unsigned int word at a time.
+ *
+ * The loop runs while the word pointer is below region+nbytes.
+ */
+void reed_sol_galois_w16_region_multby_2(char *region, int nbytes)
+{
+  unsigned int *word;
+  unsigned int *end;
+  unsigned int pattern, shifted, high;
+
+  if (prim16 == -1) {
+    /* Value to XOR into a halfword whose top bit was shifted out,
+       copied into every halfword of the word. */
+    pattern = galois_single_multiply((1 << 15), 2, 16);
+    prim16 = 0;
+    for (; pattern != 0; pattern = (pattern << 16)) prim16 |= pattern;
+
+    /* Bits 1..15 of every halfword: keeps a shifted bit from
+       leaking into the neighbouring halfword. */
+    mask16_1 = 0;
+    for (pattern = (1 << 16) - 2; pattern != 0; pattern = (pattern << 16)) mask16_1 |= pattern;
+
+    /* Top bit of every halfword. */
+    mask16_2 = 0;
+    for (pattern = (1 << 15); pattern != 0; pattern = (pattern << 16)) mask16_2 |= pattern;
+  }
+
+  end = (unsigned int *) (region + nbytes);
+
+  for (word = (unsigned int *) region; word < end; word++) {
+    shifted = ((*word) << 1) & mask16_1;
+    high = (*word) & mask16_2;
+    /* Turn each set top bit into a full 0xffff halfword selector. */
+    high = ((high << 1) - (high >> 15));
+    *word = (shifted ^ (high & prim16));
+  }
+}
+
+/*
+ * reed_sol_r6_encode - compute the two RAID-6 coding regions.
+ *
+ * coding_ptrs[0] receives the XOR of all k data regions.
+ * coding_ptrs[1] receives the sum of (2^j)*Dj, accumulated from the
+ * last data region backwards: multiply the accumulator by 2, then XOR
+ * in the next region.
+ *
+ * Returns 1 on success. Returns 0 when a multiply-by-2 step is needed
+ * and w is not 8, 16 or 32; by then coding_ptrs[0] is complete and
+ * coding_ptrs[1] holds a copy of data_ptrs[k-1].
+ */
+int reed_sol_r6_encode(int k, int w, char **data_ptrs, char **coding_ptrs, int size)
+{
+  void (*multby_2)(char *, int);
+  int d;
+
+  /* Pick the region multiplier once instead of on every pass. */
+  if (w == 8) {
+    multby_2 = reed_sol_galois_w08_region_multby_2;
+  } else if (w == 16) {
+    multby_2 = reed_sol_galois_w16_region_multby_2;
+  } else if (w == 32) {
+    multby_2 = reed_sol_galois_w32_region_multby_2;
+  } else {
+    multby_2 = NULL;
+  }
+
+  /* First, put the XOR into coding region 0 */
+
+  memcpy(coding_ptrs[0], data_ptrs[0], size);
+
+  d = 1;
+  while (d < k) {
+    galois_region_xor(coding_ptrs[0], data_ptrs[d], coding_ptrs[0], size);
+    d++;
+  }
+
+  /* Next, put the sum of (2^j)*Dj into coding region 1 */
+
+  memcpy(coding_ptrs[1], data_ptrs[k-1], size);
+
+  for (d = k-2; d >= 0; d--) {
+    if (multby_2 == NULL) return 0;
+    multby_2(coding_ptrs[1], size);
+    galois_region_xor(coding_ptrs[1], data_ptrs[d], coding_ptrs[1], size);
+  }
+  return 1;
+}
+
+/*
+ * reed_sol_extended_vandermonde_matrix - build a rows x cols extended
+ * Vandermonde matrix over GF(2^w), stored row-major.
+ *
+ *   row 0        = 1, 0, 0, ..., 0
+ *   row i        = 1, i, i^2, ..., i^(cols-1)   for 0 < i < rows-1
+ *   row rows-1   = 0, 0, ..., 0, 1
+ *
+ * Returns NULL when rows or cols is less than 1, when w < 30 and 2^w is
+ * smaller than rows or cols, or when the allocation fails. The caller
+ * releases the matrix with free().
+ */
+int *reed_sol_extended_vandermonde_matrix(int rows, int cols, int w)
+{
+  int *vdm;
+  int i, j, k;
+
+  /* An empty matrix has no element 0 to store into; without this check
+     the writes below would land outside the allocation. */
+  if (rows < 1 || cols < 1) return NULL;
+
+  if (w < 30 && (1 << w) < rows) return NULL;
+  if (w < 30 && (1 << w) < cols) return NULL;
+
+  vdm = talloc(int, rows*cols);
+  if (vdm == NULL) { return NULL; }
+
+  /* First row: 1 followed by zeros. */
+  vdm[0] = 1;
+  for (j = 1; j < cols; j++) vdm[j] = 0;
+  if (rows == 1) return vdm;
+
+  /* Last row: zeros followed by 1. */
+  i = (rows-1)*cols;
+  for (j = 0; j < cols-1; j++) vdm[i+j] = 0;
+  vdm[i+j] = 1;
+  if (rows == 2) return vdm;
+
+  /* Middle rows: successive powers of the row number. */
+  for (i = 1; i < rows-1; i++) {
+    k = 1;
+    for (j = 0; j < cols; j++) {
+      vdm[i*cols+j] = k;
+      k = galois_single_multiply(k, i, w);
+    }
+  }
+  return vdm;
+}
+
+/*
+ * reed_sol_big_vandermonde_distribution_matrix - turn the rows x cols
+ * extended Vandermonde matrix into a distribution matrix, in place.
+ *
+ * For each i in 1 .. cols-1 the first loop brings a non-zero element to
+ * position (i,i), scales column i so that element becomes 1, and clears
+ * the other elements of row i with column operations. The two loops that
+ * follow scale columns so that row `cols` is all ones, and then scale
+ * each later row so that its first element is 1.
+ *
+ * Returns NULL when cols >= rows or when the Vandermonde matrix cannot
+ * be built. The caller releases the result with free().
+ *
+ * NOTE(review): the "couldn't make matrix" path calls exit(1) from
+ * library code without freeing dist -- confirm this is acceptable for
+ * the embedding process.
+ */
+int *reed_sol_big_vandermonde_distribution_matrix(int rows, int cols, int w)
+{
+ int *dist;
+ int i, j, k;
+ int sindex, srindex, siindex, tmp;
+
+ if (cols >= rows) return NULL;
+
+ dist = reed_sol_extended_vandermonde_matrix(rows, cols, w);
+ if (dist == NULL) return NULL;
+
+ /* sindex tracks the start of row i (i*cols) throughout this loop. */
+ sindex = 0;
+ for (i = 1; i < cols; i++) {
+ sindex += cols;
+
+ /* Find an appropriate row -- where i,i != 0 */
+ /* Scan down column i, starting at row i, for a non-zero element. */
+ srindex = sindex+i;
+ for (j = i; j < rows && dist[srindex] == 0; j++) srindex += cols;
+ if (j >= rows) { /* This should never happen if rows/w are correct */
+ fprintf(stderr, "reed_sol_big_vandermonde_distribution_matrix(%d,%d,%d) - couldn't make matrix\n",
+ rows, cols, w);
+ exit(1);
+ }
+
+ /* If necessary, swap rows */
+ if (j != i) {
+ /* srindex pointed at element (j,i); step back to the start of row j. */
+ srindex -= i;
+ for (k = 0; k < cols; k++) {
+ tmp = dist[srindex+k];
+ dist[srindex+k] = dist[sindex+k];
+ dist[sindex+k] = tmp;
+ }
+ }
+
+ /* If element i,i is not equal to 1, multiply all of column i by the
+ inverse of element i,i. */
+
+ if (dist[sindex+i] != 1) {
+ tmp = galois_single_divide(1, dist[sindex+i], w);
+ srindex = i;
+ for (j = 0; j < rows; j++) {
+ dist[srindex] = galois_single_multiply(tmp, dist[srindex], w);
+ srindex += cols;
+ }
+ }
+
+ /* Now, for each element in row i that is not in column i, you need
+ to make it zero. Suppose that this is column j, and the element
+ at i,j = e. Then you want to replace all of column j with
+ (col-j + col-i*e). Note, that in row i, col-i = 1 and col-j = e.
+ So (e + 1e) = 0, which is indeed what we want. */
+
+ for (j = 0; j < cols; j++) {
+ tmp = dist[sindex+j];
+ if (j != i && tmp != 0) {
+ /* srindex walks down column j, siindex walks down column i. */
+ srindex = j;
+ siindex = i;
+ for (k = 0; k < rows; k++) {
+ dist[srindex] = dist[srindex] ^ galois_single_multiply(tmp, dist[siindex], w);
+ srindex += cols;
+ siindex += cols;
+ }
+ }
+ }
+ }
+ /* We desire to have row `cols` (the first row below the top
+ cols x cols square) be all ones. To do that, multiply each column j,
+ from row `cols` downwards, by the inverse of its element in row `cols`.
+ NOTE(review): a zero element here would be passed to
+ galois_single_divide as the divisor -- presumably cannot happen;
+ confirm. */
+
+ sindex = cols*cols;
+ for (j = 0; j < cols; j++) {
+ tmp = dist[sindex];
+ if (tmp != 1) {
+ tmp = galois_single_divide(1, tmp, w);
+ srindex = sindex;
+ for (i = cols; i < rows; i++) {
+ dist[srindex] = galois_single_multiply(tmp, dist[srindex], w);
+ srindex += cols;
+ }
+ }
+ sindex++;
+ }
+
+ /* Finally, we'd like the first column of each remaining row (rows
+ cols+1 and below) to be one. To do that, we multiply the row by the
+ inverse of its first element. */
+
+ sindex = cols*(cols+1);
+ for (i = cols+1; i < rows; i++) {
+ tmp = dist[sindex];
+ if (tmp != 1) {
+ tmp = galois_single_divide(1, tmp, w);
+ for (j = 0; j < cols; j++) dist[sindex+j] = galois_single_multiply(dist[sindex+j], tmp, w);
+ }
+ sindex += cols;
+ }
+
+ return dist;
+}
+
diff --git a/src/osd/ErasureCodePluginJerasure/reed_sol.h b/src/osd/ErasureCodePluginJerasure/reed_sol.h
new file mode 100755
index 00000000000..741c3177432
--- /dev/null
+++ b/src/osd/ErasureCodePluginJerasure/reed_sol.h
@@ -0,0 +1,59 @@
+/* reed_sol.h
+ * James S. Plank
+
+Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure Coding Techniques
+
+Revision 1.2A
+May 24, 2011
+
+James S. Plank
+Department of Electrical Engineering and Computer Science
+University of Tennessee
+Knoxville, TN 37996
+plank@cs.utk.edu
+
+Copyright (c) 2011, James S. Plank
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ - Neither the name of the University of Tennessee nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+ */
+
+#ifndef _REED_SOL_H
+#define _REED_SOL_H
+
+/* Matrix builders. Each returns a malloc-allocated int array that the
+   caller releases with free(), or NULL on failure. */
+extern int *reed_sol_vandermonde_coding_matrix(int k, int m, int w);
+extern int *reed_sol_extended_vandermonde_matrix(int rows, int cols, int w);
+extern int *reed_sol_big_vandermonde_distribution_matrix(int rows, int cols, int w);
+
+/* RAID-6 (two coding devices) helpers; w must be 8, 16 or 32. */
+extern int reed_sol_r6_encode(int k, int w, char **data_ptrs, char **coding_ptrs, int size);
+extern int *reed_sol_r6_coding_matrix(int k, int w);
+
+/* In-place multiplication of a whole region by 2 in GF(2^w). */
+extern void reed_sol_galois_w08_region_multby_2(char *region, int nbytes);
+extern void reed_sol_galois_w16_region_multby_2(char *region, int nbytes);
+extern void reed_sol_galois_w32_region_multby_2(char *region, int nbytes);
+
+#endif
diff --git a/src/osd/Makefile.am b/src/osd/Makefile.am
new file mode 100644
index 00000000000..cae02015fce
--- /dev/null
+++ b/src/osd/Makefile.am
@@ -0,0 +1,44 @@
+## erasure code plugins
+## Plugins are installed under $(libdir)/erasure-code. The list starts
+## empty here; the included plugin Makefile.am presumably appends to it.
+erasure_codelibdir = $(libdir)/erasure-code
+erasure_codelib_LTLIBRARIES =
+
+include osd/ErasureCodePluginJerasure/Makefile.am
+
+## libosd: the OSD sources built into a libtool library that is listed
+## in noinst_LTLIBRARIES below.
+libosd_la_SOURCES = \
+ osd/ErasureCodePlugin.cc \
+ osd/PG.cc \
+ osd/PGLog.cc \
+ osd/ReplicatedPG.cc \
+ osd/ReplicatedBackend.cc \
+ osd/Ager.cc \
+ osd/OSD.cc \
+ osd/OSDCap.cc \
+ osd/Watch.cc \
+ osd/ClassHandler.cc \
+ osd/OpRequest.cc \
+ common/TrackedOp.cc \
+ osd/SnapMapper.cc \
+ osd/osd_types.cc \
+ objclass/class_api.cc
+libosd_la_LIBADD = $(LIBOSDC) $(LIBOS)
+noinst_LTLIBRARIES += libosd.la
+
+## Headers listed in noinst_HEADERS.
+noinst_HEADERS += \
+ osd/Ager.h \
+ osd/ClassHandler.h \
+ osd/ErasureCodeInterface.h \
+ osd/ErasureCodePlugin.h \
+ osd/OSD.h \
+ osd/OSDCap.h \
+ osd/OSDMap.h \
+ osd/ObjectVersioner.h \
+ osd/OpRequest.h \
+ osd/SnapMapper.h \
+ osd/PG.h \
+ osd/PGLog.h \
+ osd/ReplicatedPG.h \
+ osd/PGBackend.h \
+ osd/ReplicatedBackend.h \
+ osd/Watch.h \
+ osd/osd_types.h
+
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 3e19f5634c1..b2aa2ebbcd2 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -33,6 +33,7 @@
#include "OSD.h"
#include "OSDMap.h"
#include "Watch.h"
+#include "osdc/Objecter.h"
#include "common/ceph_argparse.h"
#include "common/version.h"
@@ -133,9 +134,9 @@ static ostream& _prefix(std::ostream* _dout, int whoami, OSDMapRef osdmap) {
<< " ";
}
-const coll_t coll_t::META_COLL("meta");
-
-static CompatSet get_osd_compat_set() {
+//Initial features in new superblock.
+//Features here are also automatically upgraded
+CompatSet OSD::get_osd_initial_compat_set() {
CompatSet::FeatureSet ceph_osd_feature_compat;
CompatSet::FeatureSet ceph_osd_feature_ro_compat;
CompatSet::FeatureSet ceph_osd_feature_incompat;
@@ -153,8 +154,17 @@ static CompatSet get_osd_compat_set() {
ceph_osd_feature_incompat);
}
+//Features are added here that this OSD supports.
+//Starts from the initial superblock feature set returned by
+//get_osd_initial_compat_set() and adds the incompat features that are
+//only ever set by running code.
+CompatSet OSD::get_osd_compat_set() {
+ CompatSet compat = get_osd_initial_compat_set();
+ //Any features here can be set in code, but not in initial superblock
+ compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
+ return compat;
+}
+
OSDService::OSDService(OSD *osd) :
osd(osd),
+ cct(osd->cct),
whoami(osd->whoami), store(osd->store), clog(osd->clog),
pg_recovery_stats(osd->pg_recovery_stats),
infos_oid(OSD::make_infos_oid()),
@@ -170,26 +180,33 @@ OSDService::OSDService(OSD *osd) :
scrub_wq(osd->scrub_wq),
scrub_finalize_wq(osd->scrub_finalize_wq),
rep_scrub_wq(osd->rep_scrub_wq),
+ push_wq("push_wq", cct->_conf->osd_recovery_thread_timeout, &osd->recovery_tp),
class_handler(osd->class_handler),
publish_lock("OSDService::publish_lock"),
pre_publish_lock("OSDService::pre_publish_lock"),
sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
scrubs_active(0),
+ objecter_lock("OSD::objecter_lock"),
+ objecter_timer(osd->client_messenger->cct, objecter_lock),
+ objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, &objecter_osdmap,
+ objecter_lock, objecter_timer)),
+ objecter_finisher(osd->client_messenger->cct),
+ objecter_dispatcher(this),
watch_lock("OSD::watch_lock"),
watch_timer(osd->client_messenger->cct, watch_lock),
next_notif_id(0),
backfill_request_lock("OSD::backfill_request_lock"),
- backfill_request_timer(g_ceph_context, backfill_request_lock, false),
+ backfill_request_timer(cct, backfill_request_lock, false),
last_tid(0),
tid_lock("OSDService::tid_lock"),
- reserver_finisher(g_ceph_context),
- local_reserver(&reserver_finisher, g_conf->osd_max_backfills),
- remote_reserver(&reserver_finisher, g_conf->osd_max_backfills),
+ reserver_finisher(cct),
+ local_reserver(&reserver_finisher, cct->_conf->osd_max_backfills),
+ remote_reserver(&reserver_finisher, cct->_conf->osd_max_backfills),
pg_temp_lock("OSDService::pg_temp_lock"),
map_cache_lock("OSDService::map_lock"),
- map_cache(g_conf->osd_map_cache_size),
- map_bl_cache(g_conf->osd_map_cache_size),
- map_bl_inc_cache(g_conf->osd_map_cache_size),
+ map_cache(cct->_conf->osd_map_cache_size),
+ map_bl_cache(cct->_conf->osd_map_cache_size),
+ map_bl_inc_cache(cct->_conf->osd_map_cache_size),
in_progress_split_lock("OSDService::in_progress_split_lock"),
full_status_lock("OSDService::full_status_lock"),
cur_state(NONE),
@@ -202,6 +219,11 @@ OSDService::OSDService(OSD *osd) :
#endif
{}
+// Releases the Objecter that the constructor's initializer list
+// allocates with new.
+OSDService::~OSDService()
+{
+ delete objecter;
+}
+
void OSDService::_start_split(pg_t parent, const set<pg_t> &children)
{
for (set<pg_t>::const_iterator i = children.begin();
@@ -385,6 +407,15 @@ void OSDService::shutdown()
Mutex::Locker l(watch_lock);
watch_timer.shutdown();
}
+
+ {
+ Mutex::Locker l(objecter_lock);
+ objecter_timer.shutdown();
+ objecter->shutdown_locked();
+ }
+ objecter->shutdown_unlocked();
+ objecter_finisher.stop();
+
{
Mutex::Locker l(backfill_request_lock);
backfill_request_timer.shutdown();
@@ -396,16 +427,25 @@ void OSDService::shutdown()
void OSDService::init()
{
reserver_finisher.start();
+ {
+ objecter_finisher.start();
+ objecter->init_unlocked();
+ Mutex::Locker l(objecter_lock);
+ objecter_timer.init();
+ objecter->set_client_incarnation(0);
+ objecter->init_locked();
+ objecter->unset_honor_cache_redirects();
+ }
watch_timer.init();
}
-ObjectStore *OSD::create_object_store(const std::string &dev, const std::string &jdev)
+ObjectStore *OSD::create_object_store(CephContext *cct, const std::string &dev, const std::string &jdev)
{
struct stat st;
if (::stat(dev.c_str(), &st) != 0)
return 0;
- if (g_conf->filestore)
+ if (cct->_conf->filestore)
return new FileStore(dev, jdev);
if (S_ISDIR(st.st_mode))
@@ -421,7 +461,7 @@ int OSD::convert_collection(ObjectStore *store, coll_t cid)
{
coll_t tmp0("convertfs_temp");
coll_t tmp1("convertfs_temp1");
- vector<hobject_t> objects;
+ vector<ghobject_t> objects;
map<string, bufferptr> aset;
int r = store->collection_getattrs(cid, aset);
@@ -441,10 +481,10 @@ int OSD::convert_collection(ObjectStore *store, coll_t cid)
store->apply_transaction(t);
}
- hobject_t next;
+ ghobject_t next;
while (!next.is_max()) {
objects.clear();
- hobject_t start = next;
+ ghobject_t start = next;
r = store->collection_list_partial(cid, start,
200, 300, 0,
&objects, &next);
@@ -452,7 +492,7 @@ int OSD::convert_collection(ObjectStore *store, coll_t cid)
return r;
ObjectStore::Transaction t;
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
t.collection_add(tmp0, cid, *i);
@@ -552,20 +592,20 @@ int OSD::convertfs(const std::string &dev, const std::string &jdev)
return r;
}
-int OSD::mkfs(const std::string &dev, const std::string &jdev, uuid_d fsid, int whoami)
+int OSD::mkfs(CephContext *cct, const std::string &dev, const std::string &jdev, uuid_d fsid, int whoami)
{
int ret;
ObjectStore *store = NULL;
try {
- store = create_object_store(dev, jdev);
+ store = create_object_store(cct, dev, jdev);
if (!store) {
ret = -ENOENT;
goto out;
}
// if we are fed a uuid for this osd, use it.
- store->set_fsid(g_conf->osd_uuid);
+ store->set_fsid(cct->_conf->osd_uuid);
ret = store->mkfs();
if (ret) {
@@ -580,15 +620,15 @@ int OSD::mkfs(const std::string &dev, const std::string &jdev, uuid_d fsid, int
}
// age?
- if (g_conf->osd_age_time != 0) {
- if (g_conf->osd_age_time >= 0) {
- dout(0) << "aging..." << dendl;
- Ager ager(store);
- ager.age(g_conf->osd_age_time,
- g_conf->osd_age,
- g_conf->osd_age - .05,
- 50000,
- g_conf->osd_age - .05);
+ if (cct->_conf->osd_age_time != 0) {
+ if (cct->_conf->osd_age_time >= 0) {
+ dout(0) << "aging..." << dendl;
+ Ager ager(cct, store);
+ ager.age(cct->_conf->osd_age_time,
+ cct->_conf->osd_age,
+ cct->_conf->osd_age - .05,
+ 50000,
+ cct->_conf->osd_age - .05);
}
}
@@ -618,16 +658,16 @@ int OSD::mkfs(const std::string &dev, const std::string &jdev, uuid_d fsid, int
sb.cluster_fsid = fsid;
sb.osd_fsid = store->get_fsid();
sb.whoami = whoami;
- sb.compat_features = get_osd_compat_set();
+ sb.compat_features = get_osd_initial_compat_set();
// benchmark?
- if (g_conf->osd_auto_weight) {
+ if (cct->_conf->osd_auto_weight) {
bufferlist bl;
bufferptr bp(1048576);
bp.zero();
bl.push_back(bp);
dout(0) << "testing disk bandwidth..." << dendl;
- utime_t start = ceph_clock_now(g_ceph_context);
+ utime_t start = ceph_clock_now(cct);
object_t oid("disk_bw_test");
for (int i=0; i<1000; i++) {
ObjectStore::Transaction *t = new ObjectStore::Transaction;
@@ -635,7 +675,7 @@ int OSD::mkfs(const std::string &dev, const std::string &jdev, uuid_d fsid, int
store->queue_transaction(NULL, t);
}
store->sync();
- utime_t end = ceph_clock_now(g_ceph_context);
+ utime_t end = ceph_clock_now(cct);
end -= start;
dout(0) << "measured " << (1000.0 / (double)end) << " mb/sec" << dendl;
ObjectStore::Transaction tr;
@@ -673,7 +713,7 @@ int OSD::mkfs(const std::string &dev, const std::string &jdev, uuid_d fsid, int
goto umount_store;
}
- ret = write_meta(dev, "ready", "ready\n", 6);
+ ret = safe_write_file(dev.c_str(), "ready", "ready\n", 6);
if (ret) {
derr << "OSD::mkfs: failed to write ready file: error " << ret << dendl;
goto umount_store;
@@ -697,17 +737,17 @@ out:
return ret;
}
-int OSD::mkjournal(const std::string &dev, const std::string &jdev)
+int OSD::mkjournal(CephContext *cct, const std::string &dev, const std::string &jdev)
{
- ObjectStore *store = create_object_store(dev, jdev);
+ ObjectStore *store = create_object_store(cct, dev, jdev);
if (!store)
return -ENOENT;
return store->mkjournal();
}
-int OSD::flushjournal(const std::string &dev, const std::string &jdev)
+int OSD::flushjournal(CephContext *cct, const std::string &dev, const std::string &jdev)
{
- ObjectStore *store = create_object_store(dev, jdev);
+ ObjectStore *store = create_object_store(cct, dev, jdev);
if (!store)
return -ENOENT;
int err = store->mount();
@@ -719,9 +759,9 @@ int OSD::flushjournal(const std::string &dev, const std::string &jdev)
return err;
}
-int OSD::dump_journal(const std::string &dev, const std::string &jdev, ostream& out)
+int OSD::dump_journal(CephContext *cct, const std::string &dev, const std::string &jdev, ostream& out)
{
- ObjectStore *store = create_object_store(dev, jdev);
+ ObjectStore *store = create_object_store(cct, dev, jdev);
if (!store)
return -ENOENT;
int err = store->dump_journal(out);
@@ -729,103 +769,19 @@ int OSD::dump_journal(const std::string &dev, const std::string &jdev, ostream&
return err;
}
-int OSD::write_meta(const std::string &base, const std::string &file,
- const char *val, size_t vallen)
-{
- int ret;
- char fn[PATH_MAX];
- char tmp[PATH_MAX];
- int fd;
-
- // does the file already have correct content?
- char oldval[80];
- ret = read_meta(base, file, oldval, sizeof(oldval));
- if (ret == (int)vallen && memcmp(oldval, val, vallen) == 0)
- return 0; // yes.
-
- snprintf(fn, sizeof(fn), "%s/%s", base.c_str(), file.c_str());
- snprintf(tmp, sizeof(tmp), "%s/%s.tmp", base.c_str(), file.c_str());
- fd = ::open(tmp, O_WRONLY|O_CREAT|O_TRUNC, 0644);
- if (fd < 0) {
- ret = errno;
- derr << "write_meta: error opening '" << tmp << "': "
- << cpp_strerror(ret) << dendl;
- return -ret;
- }
- ret = safe_write(fd, val, vallen);
- if (ret) {
- derr << "write_meta: failed to write to '" << tmp << "': "
- << cpp_strerror(ret) << dendl;
- TEMP_FAILURE_RETRY(::close(fd));
- return ret;
- }
-
- ret = ::fsync(fd);
- TEMP_FAILURE_RETRY(::close(fd));
- if (ret) {
- ::unlink(tmp);
- derr << "write_meta: failed to fsync to '" << tmp << "': "
- << cpp_strerror(ret) << dendl;
- return ret;
- }
- ret = ::rename(tmp, fn);
- if (ret) {
- ::unlink(tmp);
- derr << "write_meta: failed to rename '" << tmp << "' to '" << fn << "': "
- << cpp_strerror(ret) << dendl;
- return ret;
- }
-
- fd = ::open(base.c_str(), O_RDONLY);
- if (fd < 0) {
- ret = errno;
- derr << "write_meta: failed to open dir '" << base << "': "
- << cpp_strerror(ret) << dendl;
- return -ret;
- }
- ::fsync(fd);
- TEMP_FAILURE_RETRY(::close(fd));
-
- return 0;
-}
-
-int OSD::read_meta(const std::string &base, const std::string &file,
- char *val, size_t vallen)
-{
- char fn[PATH_MAX];
- int fd, len;
-
- snprintf(fn, sizeof(fn), "%s/%s", base.c_str(), file.c_str());
- fd = ::open(fn, O_RDONLY);
- if (fd < 0) {
- int err = errno;
- return -err;
- }
- len = safe_read(fd, val, vallen);
- if (len < 0) {
- TEMP_FAILURE_RETRY(::close(fd));
- return len;
- }
- // close sometimes returns errors, but only after write()
- TEMP_FAILURE_RETRY(::close(fd));
-
- val[len] = 0;
- return len;
-}
-
int OSD::write_meta(const std::string &base, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
{
char val[80];
snprintf(val, sizeof(val), "%s\n", CEPH_OSD_ONDISK_MAGIC);
- write_meta(base, "magic", val, strlen(val));
+ safe_write_file(base.c_str(), "magic", val, strlen(val));
snprintf(val, sizeof(val), "%d\n", whoami);
- write_meta(base, "whoami", val, strlen(val));
+ safe_write_file(base.c_str(), "whoami", val, strlen(val));
cluster_fsid.print(val);
strcat(val, "\n");
- write_meta(base, "ceph_fsid", val, strlen(val));
+ safe_write_file(base.c_str(), "ceph_fsid", val, strlen(val));
return 0;
}
@@ -835,24 +791,24 @@ int OSD::peek_meta(const std::string &dev, std::string& magic,
{
char val[80] = { 0 };
- if (read_meta(dev, "magic", val, sizeof(val)) < 0)
+ if (safe_read_file(dev.c_str(), "magic", val, sizeof(val)) < 0)
return -errno;
int l = strlen(val);
if (l && val[l-1] == '\n')
val[l-1] = 0;
magic = val;
- if (read_meta(dev, "whoami", val, sizeof(val)) < 0)
+ if (safe_read_file(dev.c_str(), "whoami", val, sizeof(val)) < 0)
return -errno;
whoami = atoi(val);
- if (read_meta(dev, "ceph_fsid", val, sizeof(val)) < 0)
+ if (safe_read_file(dev.c_str(), "ceph_fsid", val, sizeof(val)) < 0)
return -errno;
if (strlen(val) > 36)
val[36] = 0;
cluster_fsid.parse(val);
- if (read_meta(dev, "fsid", val, sizeof(val)) < 0)
+ if (safe_read_file(dev.c_str(), "fsid", val, sizeof(val)) < 0)
osd_fsid = uuid_d();
else {
if (strlen(val) > 36)
@@ -878,40 +834,42 @@ int OSD::peek_journal_fsid(string path, uuid_d& fsid)
// cons/des
-OSD::OSD(int id, Messenger *internal_messenger, Messenger *external_messenger,
+OSD::OSD(CephContext *cct_, int id, Messenger *internal_messenger, Messenger *external_messenger,
Messenger *hb_clientm,
Messenger *hb_front_serverm,
Messenger *hb_back_serverm,
+ Messenger *osdc_messenger,
MonClient *mc,
const std::string &dev, const std::string &jdev) :
- Dispatcher(external_messenger->cct),
+ Dispatcher(cct_),
osd_lock("OSD::osd_lock"),
- tick_timer(external_messenger->cct, osd_lock),
- authorize_handler_cluster_registry(new AuthAuthorizeHandlerRegistry(external_messenger->cct,
+ tick_timer(cct, osd_lock),
+ authorize_handler_cluster_registry(new AuthAuthorizeHandlerRegistry(cct,
cct->_conf->auth_supported.length() ?
cct->_conf->auth_supported :
cct->_conf->auth_cluster_required)),
- authorize_handler_service_registry(new AuthAuthorizeHandlerRegistry(external_messenger->cct,
+ authorize_handler_service_registry(new AuthAuthorizeHandlerRegistry(cct,
cct->_conf->auth_supported.length() ?
cct->_conf->auth_supported :
cct->_conf->auth_service_required)),
cluster_messenger(internal_messenger),
client_messenger(external_messenger),
+ objecter_messenger(osdc_messenger),
monc(mc),
logger(NULL),
recoverystate_perf(NULL),
store(NULL),
- clog(external_messenger->cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
+ clog(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
whoami(id),
dev_path(dev), journal_path(jdev),
dispatch_running(false),
asok_hook(NULL),
osd_compat(get_osd_compat_set()),
state(STATE_INITIALIZING), boot_epoch(0), up_epoch(0), bind_epoch(0),
- op_tp(external_messenger->cct, "OSD::op_tp", g_conf->osd_op_threads, "osd_op_threads"),
- recovery_tp(external_messenger->cct, "OSD::recovery_tp", g_conf->osd_recovery_threads, "osd_recovery_threads"),
- disk_tp(external_messenger->cct, "OSD::disk_tp", g_conf->osd_disk_threads, "osd_disk_threads"),
- command_tp(external_messenger->cct, "OSD::command_tp", 1),
+ op_tp(cct, "OSD::op_tp", cct->_conf->osd_op_threads, "osd_op_threads"),
+ recovery_tp(cct, "OSD::recovery_tp", cct->_conf->osd_recovery_threads, "osd_recovery_threads"),
+ disk_tp(cct, "OSD::disk_tp", cct->_conf->osd_disk_threads, "osd_disk_threads"),
+ command_tp(cct, "OSD::command_tp", 1),
paused_recovery(false),
heartbeat_lock("OSD::heartbeat_lock"),
heartbeat_stop(false), heartbeat_need_update(true), heartbeat_epoch(0),
@@ -922,32 +880,37 @@ OSD::OSD(int id, Messenger *internal_messenger, Messenger *external_messenger,
heartbeat_dispatcher(this),
stat_lock("OSD::stat_lock"),
finished_lock("OSD::finished_lock"),
+ op_tracker(cct),
test_ops_hook(NULL),
- op_wq(this, g_conf->osd_op_thread_timeout, &op_tp),
- peering_wq(this, g_conf->osd_op_thread_timeout, &op_tp),
+ op_wq(this, cct->_conf->osd_op_thread_timeout, &op_tp),
+ peering_wq(this, cct->_conf->osd_op_thread_timeout, &op_tp),
map_lock("OSD::map_lock"),
peer_map_epoch_lock("OSD::peer_map_epoch_lock"),
- debug_drop_pg_create_probability(g_conf->osd_debug_drop_pg_create_probability),
- debug_drop_pg_create_duration(g_conf->osd_debug_drop_pg_create_duration),
+ debug_drop_pg_create_probability(cct->_conf->osd_debug_drop_pg_create_probability),
+ debug_drop_pg_create_duration(cct->_conf->osd_debug_drop_pg_create_duration),
debug_drop_pg_create_left(-1),
outstanding_pg_stats(false),
up_thru_wanted(0), up_thru_pending(0),
pg_stat_queue_lock("OSD::pg_stat_queue_lock"),
osd_stat_updated(false),
pg_stat_tid(0), pg_stat_tid_flushed(0),
- command_wq(this, g_conf->osd_command_thread_timeout, &command_tp),
+ command_wq(this, cct->_conf->osd_command_thread_timeout, &command_tp),
recovery_ops_active(0),
- recovery_wq(this, g_conf->osd_recovery_thread_timeout, &recovery_tp),
+ recovery_wq(this, cct->_conf->osd_recovery_thread_timeout, &recovery_tp),
replay_queue_lock("OSD::replay_queue_lock"),
- snap_trim_wq(this, g_conf->osd_snap_trim_thread_timeout, &disk_tp),
- scrub_wq(this, g_conf->osd_scrub_thread_timeout, &disk_tp),
- scrub_finalize_wq(this, g_conf->osd_scrub_finalize_thread_timeout, &op_tp),
- rep_scrub_wq(this, g_conf->osd_scrub_thread_timeout, &disk_tp),
- remove_wq(store, g_conf->osd_remove_thread_timeout, &disk_tp),
+ snap_trim_wq(this, cct->_conf->osd_snap_trim_thread_timeout, &disk_tp),
+ scrub_wq(this, cct->_conf->osd_scrub_thread_timeout, &disk_tp),
+ scrub_finalize_wq(this, cct->_conf->osd_scrub_finalize_thread_timeout, &op_tp),
+ rep_scrub_wq(this, cct->_conf->osd_scrub_thread_timeout, &disk_tp),
+ remove_wq(store, cct->_conf->osd_remove_thread_timeout, &disk_tp),
next_removal_seq(0),
service(this)
{
monc->set_messenger(client_messenger);
+ op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
+ cct->_conf->osd_op_log_threshold);
+ op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
+ cct->_conf->osd_op_history_duration);
}
OSD::~OSD()
@@ -955,8 +918,8 @@ OSD::~OSD()
delete authorize_handler_cluster_registry;
delete authorize_handler_service_registry;
delete class_handler;
- g_ceph_context->get_perfcounters_collection()->remove(recoverystate_perf);
- g_ceph_context->get_perfcounters_collection()->remove(logger);
+ cct->get_perfcounters_collection()->remove(recoverystate_perf);
+ cct->get_perfcounters_collection()->remove(logger);
delete recoverystate_perf;
delete logger;
delete store;
@@ -979,7 +942,7 @@ int OSD::pre_init()
return 0;
assert(!store);
- store = create_object_store(dev_path, journal_path);
+ store = create_object_store(cct, dev_path, journal_path);
if (!store) {
derr << "OSD::pre_init: unable to create object store" << dendl;
return -ENODEV;
@@ -991,7 +954,7 @@ int OSD::pre_init()
return -EBUSY;
}
- g_conf->add_observer(this);
+ cct->_conf->add_observer(this);
return 0;
}
@@ -1108,6 +1071,7 @@ public:
int OSD::init()
{
+ CompatSet initial, diff;
Mutex::Locker lock(osd_lock);
if (is_stopping())
return 0;
@@ -1132,9 +1096,48 @@ int OSD::init()
r = read_superblock();
if (r < 0) {
derr << "OSD::init() : unable to read osd superblock" << dendl;
- store->umount();
- delete store;
- return -EINVAL;
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (osd_compat.compare(superblock.compat_features) < 0) {
+ derr << "The disk uses features unsupported by the executable." << dendl;
+ derr << " ondisk features " << superblock.compat_features << dendl;
+ derr << " daemon features " << osd_compat << dendl;
+
+ if (osd_compat.writeable(superblock.compat_features)) {
+ CompatSet diff = osd_compat.unsupported(superblock.compat_features);
+ derr << "it is still writeable, though. Missing features: " << diff << dendl;
+ r = -EOPNOTSUPP;
+ goto out;
+ }
+ else {
+ CompatSet diff = osd_compat.unsupported(superblock.compat_features);
+ derr << "Cannot write to disk! Missing features: " << diff << dendl;
+ r = -EOPNOTSUPP;
+ goto out;
+ }
+ }
+
+ assert_warn(whoami == superblock.whoami);
+ if (whoami != superblock.whoami) {
+ derr << "OSD::init: superblock says osd"
+ << superblock.whoami << " but i am osd." << whoami << dendl;
+ r = -EINVAL;
+ goto out;
+ }
+
+ initial = get_osd_initial_compat_set();
+ diff = superblock.compat_features.unsupported(initial);
+ if (superblock.compat_features.merge(initial)) {
+ // We need to persist the new compat_set before we
+ // do anything else
+ dout(5) << "Upgrading superblock adding: " << diff << dendl;
+ ObjectStore::Transaction t;
+ write_superblock(t);
+ r = store->apply_transaction(t);
+ if (r < 0)
+ goto out;
}
// make sure info object exists
@@ -1144,7 +1147,7 @@ int OSD::init()
t.touch(coll_t::META_COLL, service.infos_oid);
r = store->apply_transaction(t);
if (r < 0)
- return r;
+ goto out;
}
// make sure snap mapper object exists
@@ -1154,25 +1157,13 @@ int OSD::init()
t.touch(coll_t::META_COLL, OSD::make_snapmapper_oid());
r = store->apply_transaction(t);
if (r < 0)
- return r;
- }
-
- if (osd_compat.compare(superblock.compat_features) != 0) {
- // We need to persist the new compat_set before we
- // do anything else
- dout(5) << "Upgrading superblock compat_set" << dendl;
- superblock.compat_features = osd_compat;
- ObjectStore::Transaction t;
- write_superblock(t);
- r = store->apply_transaction(t);
- if (r < 0)
- return r;
+ goto out;
}
- class_handler = new ClassHandler();
+ class_handler = new ClassHandler(cct);
cls_initialize(class_handler);
- if (g_conf->osd_open_classes_on_start) {
+ if (cct->_conf->osd_open_classes_on_start) {
int r = class_handler->open_all_classes();
if (r)
dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
@@ -1182,7 +1173,8 @@ int OSD::init()
assert_warn(!osdmap);
if (osdmap) {
derr << "OSD::init: unable to read current osdmap" << dendl;
- return -EINVAL;
+ r = -EINVAL;
+ goto out;
}
osdmap = get_map(superblock.current_epoch);
check_osdmap_features();
@@ -1195,12 +1187,6 @@ int OSD::init()
load_pgs();
dout(2) << "superblock: i am osd." << superblock.whoami << dendl;
- assert_warn(whoami == superblock.whoami);
- if (whoami != superblock.whoami) {
- derr << "OSD::init: logic error: superblock says osd"
- << superblock.whoami << " but i am osd." << whoami << dendl;
- return -EINVAL;
- }
create_logger();
@@ -1212,10 +1198,12 @@ int OSD::init()
hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
+ objecter_messenger->add_dispatcher_head(&service.objecter_dispatcher);
+
monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD);
r = monc->init();
if (r < 0)
- return r;
+ goto out;
// tell monc about log_client so it will know about mon session resets
monc->set_log_client(&clog);
@@ -1229,8 +1217,51 @@ int OSD::init()
heartbeat_thread.create();
// tick
- tick_timer.add_event_after(g_conf->osd_heartbeat_interval, new C_Tick(this));
+ tick_timer.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick(this));
+
+ service.init();
+ service.publish_map(osdmap);
+ service.publish_superblock(superblock);
+
+ osd_lock.Unlock();
+
+ r = monc->authenticate();
+ if (r < 0) {
+ osd_lock.Lock(); // locker is going to unlock this on function exit
+ if (is_stopping())
+ r = 0;
+ goto monout;
+ }
+
+ while (monc->wait_auth_rotating(30.0) < 0) {
+ derr << "unable to obtain rotating service keys; retrying" << dendl;
+ }
+
+ osd_lock.Lock();
+ if (is_stopping())
+ return 0;
+ dout(10) << "ensuring pgs have consumed prior maps" << dendl;
+ consume_map();
+ peering_wq.drain();
+
+ dout(10) << "done with init, starting boot process" << dendl;
+ state = STATE_BOOTING;
+ start_boot();
+
+ return 0;
+monout:
+ monc->shutdown();
+
+out:
+ store->umount();
+ delete store;
+ return r;
+}
+
+void OSD::final_init()
+{
+ int r;
AdminSocket *admin_socket = cct->get_admin_socket();
asok_hook = new OSDSocketHook(this);
r = admin_socket->register_command("dump_ops_in_flight",
@@ -1323,47 +1354,13 @@ int OSD::init()
test_ops_hook,
"inject metadata error");
assert(r == 0);
-
- service.init();
- service.publish_map(osdmap);
- service.publish_superblock(superblock);
-
- osd_lock.Unlock();
-
- r = monc->authenticate();
- if (r < 0) {
- monc->shutdown();
- store->umount();
- osd_lock.Lock(); // locker is going to unlock this on function exit
- if (is_stopping())
- return 0;
- return r;
- }
-
- while (monc->wait_auth_rotating(30.0) < 0) {
- derr << "unable to obtain rotating service keys; retrying" << dendl;
- }
-
- osd_lock.Lock();
- if (is_stopping())
- return 0;
-
- dout(10) << "ensuring pgs have consumed prior maps" << dendl;
- consume_map();
- peering_wq.drain();
-
- dout(10) << "done with init, starting boot process" << dendl;
- state = STATE_BOOTING;
- start_boot();
-
- return 0;
}
void OSD::create_logger()
{
dout(10) << "create_logger" << dendl;
- PerfCountersBuilder osd_plb(g_ceph_context, "osd", l_osd_first, l_osd_last);
+ PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);
osd_plb.add_u64(l_osd_opq, "opq"); // op queue length (waiting to be processed yet)
osd_plb.add_u64(l_osd_op_wip, "op_wip"); // rep ops currently being processed (primary)
@@ -1423,15 +1420,19 @@ void OSD::create_logger()
osd_plb.add_u64_counter(l_osd_waiting_for_map,
"messages_delayed_for_map"); // dup osdmap epochs
+ osd_plb.add_u64(l_osd_stat_bytes, "stat_bytes");
+ osd_plb.add_u64(l_osd_stat_bytes_used, "stat_bytes_used");
+ osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail");
+
logger = osd_plb.create_perf_counters();
- g_ceph_context->get_perfcounters_collection()->add(logger);
+ cct->get_perfcounters_collection()->add(logger);
}
void OSD::create_recoverystate_perf()
{
dout(10) << "create_recoverystate_perf" << dendl;
- PerfCountersBuilder rs_perf(g_ceph_context, "recoverystate_perf", rs_first, rs_last);
+ PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
rs_perf.add_time_avg(rs_initial_latency, "initial_latency");
rs_perf.add_time_avg(rs_started_latency, "started_latency");
@@ -1464,12 +1465,12 @@ void OSD::create_recoverystate_perf()
rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency");
recoverystate_perf = rs_perf.create_perf_counters();
- g_ceph_context->get_perfcounters_collection()->add(recoverystate_perf);
+ cct->get_perfcounters_collection()->add(recoverystate_perf);
}
void OSD::suicide(int exitcode)
{
- if (g_conf->filestore_blackhole) {
+ if (cct->_conf->filestore_blackhole) {
derr << " filestore_blackhole=true, doing abbreviated shutdown" << dendl;
_exit(exitcode);
}
@@ -1509,11 +1510,11 @@ int OSD::shutdown()
heartbeat_lock.Unlock();
// Debugging
- g_ceph_context->_conf->set_val("debug_osd", "100");
- g_ceph_context->_conf->set_val("debug_journal", "100");
- g_ceph_context->_conf->set_val("debug_filestore", "100");
- g_ceph_context->_conf->set_val("debug_ms", "100");
- g_ceph_context->_conf->apply_changes(NULL);
+ cct->_conf->set_val("debug_osd", "100");
+ cct->_conf->set_val("debug_journal", "100");
+ cct->_conf->set_val("debug_filestore", "100");
+ cct->_conf->set_val("debug_ms", "100");
+ cct->_conf->apply_changes(NULL);
// Shutdown PGs
for (hash_map<pg_t, PG*>::iterator p = pg_map.begin();
@@ -1629,7 +1630,7 @@ int OSD::shutdown()
#ifdef PG_DEBUG_REFS
service.dump_live_pgids();
#endif
- g_conf->remove_observer(this);
+ cct->_conf->remove_observer(this);
monc->shutdown();
osd_lock.Unlock();
@@ -1642,6 +1643,7 @@ int OSD::shutdown()
client_messenger->shutdown();
cluster_messenger->shutdown();
hbclient_messenger->shutdown();
+ objecter_messenger->shutdown();
hb_front_server_messenger->shutdown();
hb_back_server_messenger->shutdown();
peering_wq.clear();
@@ -1673,28 +1675,6 @@ int OSD::read_superblock()
::decode(superblock, p);
dout(10) << "read_superblock " << superblock << dendl;
- if (osd_compat.compare(superblock.compat_features) < 0) {
- derr << "The disk uses features unsupported by the executable." << dendl;
- derr << " ondisk features " << superblock.compat_features << dendl;
- derr << " daemon features " << osd_compat << dendl;
-
- if (osd_compat.writeable(superblock.compat_features)) {
- derr << "it is still writeable, though. Missing features:" << dendl;
- CompatSet diff = osd_compat.unsupported(superblock.compat_features);
- return -EOPNOTSUPP;
- }
- else {
- derr << "Cannot write to disk! Missing features:" << dendl;
- CompatSet diff = osd_compat.unsupported(superblock.compat_features);
- return -EOPNOTSUPP;
- }
- }
-
- if (whoami != superblock.whoami) {
- derr << "read_superblock superblock says osd." << superblock.whoami
- << ", but i (think i) am osd." << whoami << dendl;
- return -1;
- }
return 0;
}
@@ -1709,17 +1689,17 @@ void OSD::recursive_remove_collection(ObjectStore *store, coll_t tmp)
make_snapmapper_oid());
SnapMapper mapper(&driver, 0, 0, 0);
- vector<hobject_t> objects;
+ vector<ghobject_t> objects;
store->collection_list(tmp, objects);
// delete them.
ObjectStore::Transaction t;
unsigned removed = 0;
- for (vector<hobject_t>::iterator p = objects.begin();
+ for (vector<ghobject_t>::iterator p = objects.begin();
p != objects.end();
++p, removed++) {
OSDriver::OSTransaction _t(driver.get_transaction(&t));
- int r = mapper.remove_oid(*p, &_t);
+ int r = mapper.remove_oid(p->hobj, &_t);
if (r != 0 && r != -ENOENT)
assert(0);
t.collection_remove(tmp, *p);
@@ -2185,7 +2165,7 @@ void OSD::build_past_intervals_parallel()
pg->unlock();
// don't let the transaction get too big
- if (++num >= g_conf->osd_target_transaction_size) {
+ if (++num >= cct->_conf->osd_target_transaction_size) {
store->apply_transaction(t);
t = ObjectStore::Transaction();
num = 0;
@@ -2478,14 +2458,14 @@ void OSD::project_pg_history(pg_t pgid, pg_history_t& h, epoch_t from,
float OSDService::get_full_ratio()
{
- float full_ratio = g_conf->osd_failsafe_full_ratio;
+ float full_ratio = cct->_conf->osd_failsafe_full_ratio;
if (full_ratio > 1.0) full_ratio /= 100.0;
return full_ratio;
}
float OSDService::get_nearfull_ratio()
{
- float nearfull_ratio = g_conf->osd_failsafe_nearfull_ratio;
+ float nearfull_ratio = cct->_conf->osd_failsafe_nearfull_ratio;
if (nearfull_ratio > 1.0) nearfull_ratio /= 100.0;
return nearfull_ratio;
}
@@ -2497,7 +2477,12 @@ void OSDService::check_nearfull_warning(const osd_stat_t &osd_stat)
time_t now = ceph_clock_gettime(NULL);
- float ratio = ((float)osd_stat.kb_used) / ((float)osd_stat.kb);
+ // We base ratio on kb_avail rather than kb_used because they can
+ // differ significantly e.g. on btrfs volumes with a large number of
+ // chunks reserved for metadata, and for our purposes (avoiding
+ // completely filling the disk) it's far more important to know how
+ // much space is available to use than how much we've already used.
+ float ratio = ((float)(osd_stat.kb - osd_stat.kb_avail)) / ((float)osd_stat.kb);
float nearfull_ratio = get_nearfull_ratio();
float full_ratio = get_full_ratio();
cur_ratio = ratio;
@@ -2513,7 +2498,7 @@ void OSDService::check_nearfull_warning(const osd_stat_t &osd_stat)
if (cur_state != new_state) {
cur_state = new_state;
- } else if (now - last_msg < g_conf->osd_op_complaint_time) {
+ } else if (now - last_msg < cct->_conf->osd_op_complaint_time) {
return;
}
last_msg = now;
@@ -2535,7 +2520,7 @@ bool OSDService::too_full_for_backfill(double *_ratio, double *_max_ratio)
{
Mutex::Locker l(full_status_lock);
double max_ratio;
- max_ratio = g_conf->osd_backfill_full_ratio;
+ max_ratio = cct->_conf->osd_backfill_full_ratio;
if (_ratio)
*_ratio = cur_ratio;
if (_max_ratio)
@@ -2550,9 +2535,17 @@ void OSD::update_osd_stat()
struct statfs stbuf;
store->statfs(&stbuf);
- osd_stat.kb = stbuf.f_blocks * stbuf.f_bsize / 1024;
- osd_stat.kb_used = (stbuf.f_blocks - stbuf.f_bfree) * stbuf.f_bsize / 1024;
- osd_stat.kb_avail = stbuf.f_bavail * stbuf.f_bsize / 1024;
+ uint64_t bytes = stbuf.f_blocks * stbuf.f_bsize;
+ uint64_t used = (stbuf.f_blocks - stbuf.f_bfree) * stbuf.f_bsize;
+ uint64_t avail = stbuf.f_bavail * stbuf.f_bsize;
+
+ osd_stat.kb = bytes >> 10;
+ osd_stat.kb_used = used >> 10;
+ osd_stat.kb_avail = avail >> 10;
+
+ logger->set(l_osd_stat_bytes, bytes);
+ logger->set(l_osd_stat_bytes_used, used);
+ logger->set(l_osd_stat_bytes_avail, avail);
osd_stat.hb_in.clear();
for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin(); p != heartbeat_peers.end(); ++p)
@@ -2630,13 +2623,13 @@ void OSD::maybe_update_heartbeat_peers()
assert(osd_lock.is_locked());
if (is_waiting_for_healthy()) {
- utime_t now = ceph_clock_now(g_ceph_context);
+ utime_t now = ceph_clock_now(cct);
if (last_heartbeat_resample == utime_t()) {
last_heartbeat_resample = now;
heartbeat_need_update = true;
} else if (!heartbeat_need_update) {
utime_t dur = now - last_heartbeat_resample;
- if (dur > g_conf->osd_heartbeat_grace) {
+ if (dur > cct->_conf->osd_heartbeat_grace) {
dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
heartbeat_need_update = true;
last_heartbeat_resample = now;
@@ -2709,7 +2702,7 @@ void OSD::maybe_update_heartbeat_peers()
// too few?
int start = osdmap->get_next_up_osd_after(whoami);
for (int n = start; n >= 0; ) {
- if ((int)heartbeat_peers.size() >= g_conf->osd_heartbeat_min_peers)
+ if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
break;
if (!extras.count(n) && !want.count(n) && n != whoami) {
dout(10) << " adding random peer osd." << n << dendl;
@@ -2723,7 +2716,7 @@ void OSD::maybe_update_heartbeat_peers()
// too many?
for (set<int>::iterator p = extras.begin();
- (int)heartbeat_peers.size() > g_conf->osd_heartbeat_min_peers && p != extras.end();
+ (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
++p) {
if (want.count(*p))
continue;
@@ -2773,7 +2766,7 @@ void OSD::handle_osd_ping(MOSDPing *m)
case MOSDPing::PING:
{
- if (g_conf->osd_debug_drop_ping_probability > 0) {
+ if (cct->_conf->osd_debug_drop_ping_probability > 0) {
if (debug_heartbeat_drops_remaining.count(from)) {
if (debug_heartbeat_drops_remaining[from] == 0) {
debug_heartbeat_drops_remaining.erase(from);
@@ -2784,10 +2777,10 @@ void OSD::handle_osd_ping(MOSDPing *m)
<< " remaining to drop" << dendl;
break;
}
- } else if (g_conf->osd_debug_drop_ping_probability >
+ } else if (cct->_conf->osd_debug_drop_ping_probability >
((((double)(rand()%100))/100.0))) {
debug_heartbeat_drops_remaining[from] =
- g_conf->osd_debug_drop_ping_duration;
+ cct->_conf->osd_debug_drop_ping_duration;
dout(5) << "Dropping heartbeat from " << from
<< ", " << debug_heartbeat_drops_remaining[from]
<< " remaining to drop" << dendl;
@@ -2795,7 +2788,7 @@ void OSD::handle_osd_ping(MOSDPing *m)
}
}
- if (!g_ceph_context->get_heartbeat_map()->is_healthy()) {
+ if (!cct->get_heartbeat_map()->is_healthy()) {
dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl;
break;
}
@@ -2863,8 +2856,8 @@ void OSD::handle_osd_ping(MOSDPing *m)
}
}
- utime_t cutoff = ceph_clock_now(g_ceph_context);
- cutoff -= g_conf->osd_heartbeat_grace;
+ utime_t cutoff = ceph_clock_now(cct);
+ cutoff -= cct->_conf->osd_heartbeat_grace;
if (i->second.is_healthy(cutoff)) {
// Cancel false reports
if (failure_queue.count(from)) {
@@ -2900,11 +2893,11 @@ void OSD::heartbeat_entry()
while (!heartbeat_stop) {
heartbeat();
- double wait = .5 + ((float)(rand() % 10)/10.0) * (float)g_conf->osd_heartbeat_interval;
+ double wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
utime_t w;
w.set_from_double(wait);
dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
- heartbeat_cond.WaitInterval(g_ceph_context, heartbeat_lock, w);
+ heartbeat_cond.WaitInterval(cct, heartbeat_lock, w);
if (is_stopping())
return;
dout(30) << "heartbeat_entry woke up" << dendl;
@@ -2914,16 +2907,16 @@ void OSD::heartbeat_entry()
void OSD::heartbeat_check()
{
assert(heartbeat_lock.is_locked());
- utime_t now = ceph_clock_now(g_ceph_context);
+ utime_t now = ceph_clock_now(cct);
double age = hbclient_messenger->get_dispatch_queue_max_age(now);
- if (age > (g_conf->osd_heartbeat_grace / 2)) {
+ if (age > (cct->_conf->osd_heartbeat_grace / 2)) {
derr << "skipping heartbeat_check, hbqueue max age: " << age << dendl;
return; // hb dispatch is too backed up for our hb status to be meaningful
}
// check for incoming heartbeats (move me elsewhere?)
utime_t cutoff = now;
- cutoff -= g_conf->osd_heartbeat_grace;
+ cutoff -= cct->_conf->osd_heartbeat_grace;
for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
p != heartbeat_peers.end();
++p) {
@@ -2972,7 +2965,7 @@ void OSD::heartbeat()
dout(5) << "heartbeat: " << osd_stat << dendl;
- utime_t now = ceph_clock_now(g_ceph_context);
+ utime_t now = ceph_clock_now(cct);
// send heartbeats
for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
@@ -3005,7 +2998,7 @@ void OSD::heartbeat()
// hmm.. am i all alone?
dout(30) << "heartbeat lonely?" << dendl;
if (heartbeat_peers.empty()) {
- if (now - last_mon_heartbeat > g_conf->osd_mon_heartbeat_interval && is_active()) {
+ if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
last_mon_heartbeat = now;
dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
monc->sub_want("osdmap", osdmap->get_epoch() + 1, CEPH_SUBSCRIBE_ONETIME);
@@ -3083,19 +3076,19 @@ void OSD::tick()
heartbeat_lock.Unlock();
// mon report?
- utime_t now = ceph_clock_now(g_ceph_context);
+ utime_t now = ceph_clock_now(cct);
if (outstanding_pg_stats &&
- (now - g_conf->osd_mon_ack_timeout) > last_pg_stats_ack) {
+ (now - cct->_conf->osd_mon_ack_timeout) > last_pg_stats_ack) {
dout(1) << "mon hasn't acked PGStats in " << now - last_pg_stats_ack
<< " seconds, reconnecting elsewhere" << dendl;
monc->reopen_session();
- last_pg_stats_ack = ceph_clock_now(g_ceph_context); // reset clock
+ last_pg_stats_ack = ceph_clock_now(cct); // reset clock
last_pg_stats_sent = utime_t();
}
- if (now - last_pg_stats_sent > g_conf->osd_mon_report_interval_max) {
+ if (now - last_pg_stats_sent > cct->_conf->osd_mon_report_interval_max) {
osd_stat_updated = true;
do_mon_report();
- } else if (now - last_mon_report > g_conf->osd_mon_report_interval_min) {
+ } else if (now - last_mon_report > cct->_conf->osd_mon_report_interval_min) {
do_mon_report();
}
@@ -3175,7 +3168,7 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
string poolstr;
- cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+ cmd_getval(service->cct, cmdmap, "pool", poolstr);
pool = curmap->const_lookup_pg_pool_name(poolstr.c_str());
//If we can't find it by name then maybe id specified
if (pool < 0 && isdigit(poolstr[0]))
@@ -3186,7 +3179,7 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
}
r = -1;
string objname, nspace;
- cmd_getval(g_ceph_context, cmdmap, "objname", objname);
+ cmd_getval(service->cct, cmdmap, "objname", objname);
std::size_t found = objname.find_first_of('/');
if (found != string::npos) {
nspace = objname.substr(0, found);
@@ -3208,8 +3201,8 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
map<string, bufferlist> newattrs;
bufferlist val;
string key, valstr;
- cmd_getval(g_ceph_context, cmdmap, "key", key);
- cmd_getval(g_ceph_context, cmdmap, "val", valstr);
+ cmd_getval(service->cct, cmdmap, "key", key);
+ cmd_getval(service->cct, cmdmap, "val", valstr);
val.append(valstr);
newattrs[key] = val;
@@ -3222,7 +3215,7 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
} else if (command == "rmomapkey") {
string key;
set<string> keys;
- cmd_getval(g_ceph_context, cmdmap, "key", key);
+ cmd_getval(service->cct, cmdmap, "key", key);
keys.insert(key);
t.omap_rmkeys(coll_t(pgid), obj, keys);
@@ -3235,7 +3228,7 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
bufferlist newheader;
string headerstr;
- cmd_getval(g_ceph_context, cmdmap, "header", headerstr);
+ cmd_getval(service->cct, cmdmap, "header", headerstr);
newheader.append(headerstr);
t.omap_setheader(coll_t(pgid), obj, newheader);
r = store->apply_transaction(t);
@@ -3259,7 +3252,7 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
}
} else if (command == "truncobj") {
int64_t trunclen;
- cmd_getval(g_ceph_context, cmdmap, "len", trunclen);
+ cmd_getval(service->cct, cmdmap, "len", trunclen);
t.truncate(coll_t(pgid), obj, trunclen);
r = store->apply_transaction(t);
if (r < 0)
@@ -3281,15 +3274,16 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
// =========================================
bool remove_dir(
+ CephContext *cct,
ObjectStore *store, SnapMapper *mapper,
OSDriver *osdriver,
ObjectStore::Sequencer *osr,
coll_t coll, DeletingStateRef dstate)
{
- vector<hobject_t> olist;
+ vector<ghobject_t> olist;
int64_t num = 0;
ObjectStore::Transaction *t = new ObjectStore::Transaction;
- hobject_t next;
+ ghobject_t next;
while (!next.is_max()) {
store->collection_list_partial(
coll,
@@ -3299,16 +3293,16 @@ bool remove_dir(
0,
&olist,
&next);
- for (vector<hobject_t>::iterator i = olist.begin();
+ for (vector<ghobject_t>::iterator i = olist.begin();
i != olist.end();
++i, ++num) {
OSDriver::OSTransaction _t(osdriver->get_transaction(t));
- int r = mapper->remove_oid(*i, &_t);
+ int r = mapper->remove_oid(i->hobj, &_t);
if (r != 0 && r != -ENOENT) {
assert(0);
}
t->remove(coll, *i);
- if (num >= g_conf->osd_target_transaction_size) {
+ if (num >= cct->_conf->osd_target_transaction_size) {
C_SaferCond waiter;
store->queue_transaction(osr, t, &waiter);
bool cont = dstate->pause_clearing();
@@ -3346,16 +3340,16 @@ void OSD::RemoveWQ::_process(pair<PGRef, DeletingStateRef> item)
if (!item.second->start_clearing())
return;
- if (pg->have_temp_coll()) {
+ list<coll_t> colls_to_remove;
+ pg->get_colls(&colls_to_remove);
+ for (list<coll_t>::iterator i = colls_to_remove.begin();
+ i != colls_to_remove.end();
+ ++i) {
bool cont = remove_dir(
- store, &mapper, &driver, pg->osr.get(), pg->get_temp_coll(), item.second);
+ pg->cct, store, &mapper, &driver, pg->osr.get(), *i, item.second);
if (!cont)
return;
}
- bool cont = remove_dir(
- store, &mapper, &driver, pg->osr.get(), coll, item.second);
- if (!cont)
- return;
if (!item.second->start_deleting())
return;
@@ -3366,9 +3360,12 @@ void OSD::RemoveWQ::_process(pair<PGRef, DeletingStateRef> item)
OSD::make_infos_oid(),
pg->log_oid,
t);
- if (pg->have_temp_coll())
- t->remove_collection(pg->get_temp_coll());
- t->remove_collection(coll);
+
+ for (list<coll_t>::iterator i = colls_to_remove.begin();
+ i != colls_to_remove.end();
+ ++i) {
+ t->remove_collection(*i);
+ }
// We need the sequencer to stick around until the op is complete
store->queue_transaction(
@@ -3389,7 +3386,7 @@ void OSD::do_mon_report()
{
dout(7) << "do_mon_report" << dendl;
- utime_t now(ceph_clock_now(g_ceph_context));
+ utime_t now(ceph_clock_now(cct));
last_mon_report = now;
// do any pending reports
@@ -3412,7 +3409,7 @@ void OSD::ms_handle_connect(Connection *con)
send_alive();
service.send_pg_temp();
send_failures();
- send_pg_stats(ceph_clock_now(g_ceph_context));
+ send_pg_stats(ceph_clock_now(cct));
monc->sub_want("osd_pg_creates", 0, CEPH_SUBSCRIBE_ONETIME);
monc->renew_subs();
@@ -3473,7 +3470,7 @@ void OSD::_maybe_boot(epoch_t oldest, epoch_t newest)
// send pings sooner rather than later
heartbeat_kick();
} else if (osdmap->get_epoch() >= oldest - 1 &&
- osdmap->get_epoch() + g_conf->osd_map_message_max > newest) {
+ osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
_send_boot();
return;
}
@@ -3495,15 +3492,15 @@ void OSD::start_waiting_for_healthy()
bool OSD::_is_healthy()
{
- if (!g_ceph_context->get_heartbeat_map()->is_healthy()) {
+ if (!cct->get_heartbeat_map()->is_healthy()) {
dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
return false;
}
if (is_waiting_for_healthy()) {
Mutex::Locker l(heartbeat_lock);
- utime_t cutoff = ceph_clock_now(g_ceph_context);
- cutoff -= g_conf->osd_heartbeat_grace;
+ utime_t cutoff = ceph_clock_now(cct);
+ cutoff -= cct->_conf->osd_heartbeat_grace;
int num = 0, up = 0;
for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
p != heartbeat_peers.end();
@@ -3512,7 +3509,7 @@ bool OSD::_is_healthy()
++up;
++num;
}
- if ((float)up < (float)num * g_conf->osd_heartbeat_min_healthy_ratio) {
+ if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than 1/3)" << dendl;
return false;
}
@@ -3569,7 +3566,7 @@ void OSD::queue_want_up_thru(epoch_t want)
up_thru_wanted = want;
// expedite, a bit. WARNING this will somewhat delay other mon queries.
- last_mon_report = ceph_clock_now(g_ceph_context);
+ last_mon_report = ceph_clock_now(cct);
send_alive();
} else {
dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
@@ -3664,7 +3661,7 @@ void OSD::send_failures()
heartbeat_lock.Lock();
locked = true;
}
- utime_t now = ceph_clock_now(g_ceph_context);
+ utime_t now = ceph_clock_now(cct);
while (!failure_queue.empty()) {
int osd = failure_queue.begin()->first;
int failed_for = (int)(double)(now - failure_queue.begin()->second);
@@ -3733,7 +3730,7 @@ void OSD::send_pg_stats(const utime_t &now)
if (!outstanding_pg_stats) {
outstanding_pg_stats = true;
- last_pg_stats_ack = ceph_clock_now(g_ceph_context);
+ last_pg_stats_ack = ceph_clock_now(cct);
}
monc->send_mon_message(m);
}
@@ -3750,7 +3747,7 @@ void OSD::handle_pg_stats_ack(MPGStatsAck *ack)
return;
}
- last_pg_stats_ack = ceph_clock_now(g_ceph_context);
+ last_pg_stats_ack = ceph_clock_now(cct);
pg_stat_queue_lock.Lock();
@@ -3910,6 +3907,10 @@ COMMAND("bench " \
"(default 1G size 4MB). Results in log.",
"osd", "rw", "cli,rest")
COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw", "cli,rest")
+COMMAND("heap " \
+ "name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
+ "show heap usage info (available only if compiled with tcmalloc)", \
+ "osd", "rw", "cli,rest")
COMMAND("debug_dump_missing " \
"name=filename,type=CephFilepath",
"dump missing objects to a named file", "osd", "r", "cli,rest")
@@ -3963,7 +3964,7 @@ void OSD::do_command(Connection *con, tid_t tid, vector<string>& cmd, bufferlist
goto out;
}
- cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+ cmd_getval(cct, cmdmap, "prefix", prefix);
if (prefix == "get_command_descriptions") {
int cmdnum = 0;
@@ -3985,7 +3986,7 @@ void OSD::do_command(Connection *con, tid_t tid, vector<string>& cmd, bufferlist
goto out;
}
- cmd_getval(g_ceph_context, cmdmap, "format", format);
+ cmd_getval(cct, cmdmap, "format", format);
f.reset(new_formatter(format));
if (prefix == "version") {
@@ -4001,7 +4002,7 @@ void OSD::do_command(Connection *con, tid_t tid, vector<string>& cmd, bufferlist
}
else if (prefix == "injectargs") {
vector<string> argsvec;
- cmd_getval(g_ceph_context, cmdmap, "injected_args", argsvec);
+ cmd_getval(cct, cmdmap, "injected_args", argsvec);
if (argsvec.empty()) {
r = -EINVAL;
@@ -4012,7 +4013,7 @@ void OSD::do_command(Connection *con, tid_t tid, vector<string>& cmd, bufferlist
for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
args += " " + *a;
osd_lock.Unlock();
- g_conf->injectargs(args, &ss);
+ cct->_conf->injectargs(args, &ss);
osd_lock.Lock();
}
@@ -4020,14 +4021,14 @@ void OSD::do_command(Connection *con, tid_t tid, vector<string>& cmd, bufferlist
// 'tell <pgid>' (which comes in without any of that prefix)?
else if (prefix == "pg" ||
- (cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr) &&
+ (cmd_getval(cct, cmdmap, "pgid", pgidstr) &&
(prefix == "query" ||
prefix == "mark_unfound_lost" ||
prefix == "list_missing")
)) {
pg_t pgid;
- if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
+ if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
ss << "no pgid specified";
r = -EINVAL;
} else if (!pgid.parse(pgidstr.c_str())) {
@@ -4041,7 +4042,7 @@ void OSD::do_command(Connection *con, tid_t tid, vector<string>& cmd, bufferlist
} else {
// simulate pg <pgid> cmd= for pg->do-command
if (prefix != "pg")
- cmd_putval(g_ceph_context, cmdmap, "cmd", prefix);
+ cmd_putval(cct, cmdmap, "cmd", prefix);
r = pg->do_command(cmdmap, ss, data, odata);
pg->unlock();
}
@@ -4052,8 +4053,8 @@ void OSD::do_command(Connection *con, tid_t tid, vector<string>& cmd, bufferlist
int64_t count;
int64_t bsize;
// default count 1G, size 4MB
- cmd_getval(g_ceph_context, cmdmap, "count", count, (int64_t)1 << 30);
- cmd_getval(g_ceph_context, cmdmap, "bsize", bsize, (int64_t)4 << 20);
+ cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30);
+ cmd_getval(cct, cmdmap, "bsize", bsize, (int64_t)4 << 20);
bufferlist bl;
bufferptr bp(bsize);
@@ -4063,7 +4064,7 @@ void OSD::do_command(Connection *con, tid_t tid, vector<string>& cmd, bufferlist
ObjectStore::Transaction *cleanupt = new ObjectStore::Transaction;
store->sync_and_flush();
- utime_t start = ceph_clock_now(g_ceph_context);
+ utime_t start = ceph_clock_now(cct);
for (int64_t pos = 0; pos < count; pos += bsize) {
char nm[30];
snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
@@ -4075,7 +4076,7 @@ void OSD::do_command(Connection *con, tid_t tid, vector<string>& cmd, bufferlist
cleanupt->remove(coll_t::META_COLL, soid);
}
store->sync_and_flush();
- utime_t end = ceph_clock_now(g_ceph_context);
+ utime_t end = ceph_clock_now(cct);
// clean up
store->queue_transaction(NULL, cleanupt);
@@ -4105,7 +4106,7 @@ void OSD::do_command(Connection *con, tid_t tid, vector<string>& cmd, bufferlist
ss << "could not issue heap profiler command -- not using tcmalloc!";
} else {
string heapcmd;
- cmd_getval(g_ceph_context, cmdmap, "heapcmd", heapcmd);
+ cmd_getval(cct, cmdmap, "heapcmd", heapcmd);
// XXX 1-element vector, change at callee or make vector here?
vector<string> heapcmd_vec;
get_str_vec(heapcmd, heapcmd_vec);
@@ -4115,7 +4116,7 @@ void OSD::do_command(Connection *con, tid_t tid, vector<string>& cmd, bufferlist
else if (prefix == "debug dump_missing") {
string file_name;
- cmd_getval(g_ceph_context, cmdmap, "filename", file_name);
+ cmd_getval(cct, cmdmap, "filename", file_name);
std::ofstream fout(file_name.c_str());
if (!fout.is_open()) {
ss << "failed to open file '" << file_name << "'";
@@ -4161,27 +4162,27 @@ void OSD::do_command(Connection *con, tid_t tid, vector<string>& cmd, bufferlist
}
else if (prefix == "debug kick_recovery_wq") {
int64_t delay;
- cmd_getval(g_ceph_context, cmdmap, "delay", delay);
+ cmd_getval(cct, cmdmap, "delay", delay);
ostringstream oss;
oss << delay;
- r = g_conf->set_val("osd_recovery_delay_start", oss.str().c_str());
+ r = cct->_conf->set_val("osd_recovery_delay_start", oss.str().c_str());
if (r != 0) {
ss << "kick_recovery_wq: error setting "
<< "osd_recovery_delay_start to '" << delay << "': error "
<< r;
goto out;
}
- g_conf->apply_changes(NULL);
+ cct->_conf->apply_changes(NULL);
ss << "kicking recovery queue. set osd_recovery_delay_start "
- << "to " << g_conf->osd_recovery_delay_start;
- defer_recovery_until = ceph_clock_now(g_ceph_context);
- defer_recovery_until += g_conf->osd_recovery_delay_start;
+ << "to " << cct->_conf->osd_recovery_delay_start;
+ defer_recovery_until = ceph_clock_now(cct);
+ defer_recovery_until += cct->_conf->osd_recovery_delay_start;
recovery_wq.wake();
}
else if (prefix == "cpu_profiler") {
string arg;
- cmd_getval(g_ceph_context, cmdmap, "arg", arg);
+ cmd_getval(cct, cmdmap, "arg", arg);
vector<string> argvec;
get_str_vec(arg, argvec);
cpu_profiler_handle_command(argvec, ds);
@@ -4299,7 +4300,7 @@ bool OSD::_share_map_incoming(entity_name_t name, Connection *con, epoch_t epoch
}
// does peer have old map?
- if (name.is_osd() &&
+ if (con->get_messenger() == cluster_messenger &&
osdmap->is_up(name.num()) &&
(osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
@@ -4372,6 +4373,37 @@ bool OSD::heartbeat_dispatch(Message *m)
return true;
}
+bool OSDService::ObjecterDispatcher::ms_dispatch(Message *m)
+{
+ Mutex::Locker l(osd->objecter_lock);
+ osd->objecter->dispatch(m);
+ return true;
+}
+
+bool OSDService::ObjecterDispatcher::ms_handle_reset(Connection *con)
+{
+ Mutex::Locker l(osd->objecter_lock);
+ osd->objecter->ms_handle_reset(con);
+ return true;
+}
+
+void OSDService::ObjecterDispatcher::ms_handle_connect(Connection *con)
+{
+ Mutex::Locker l(osd->objecter_lock);
+ return osd->objecter->ms_handle_connect(con);
+}
+
+bool OSDService::ObjecterDispatcher::ms_get_authorizer(int dest_type,
+ AuthAuthorizer **authorizer,
+ bool force_new)
+{
+ if (dest_type == CEPH_ENTITY_TYPE_MON)
+ return true;
+ *authorizer = osd->monc->auth->build_authorizer(dest_type);
+ return *authorizer != NULL;
+}
+
+
bool OSD::ms_dispatch(Message *m)
{
if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
@@ -4454,7 +4486,7 @@ bool OSD::ms_verify_authorizer(Connection *con, int peer_type,
uint64_t global_id;
uint64_t auid = CEPH_AUTH_UID_DEFAULT;
- isvalid = authorize_handler->verify_authorizer(g_ceph_context, monc->rotating_secrets,
+ isvalid = authorize_handler->verify_authorizer(cct, monc->rotating_secrets,
authorizer_data, authorizer_reply, name, global_id, caps_info, session_key, &auid);
if (isvalid) {
@@ -4511,7 +4543,7 @@ void OSD::do_waiters()
void OSD::dispatch_op(OpRequestRef op)
{
- switch (op->request->get_type()) {
+ switch (op->get_req()->get_type()) {
case MSG_OSD_PG_CREATE:
handle_pg_create(op);
@@ -4637,7 +4669,7 @@ void OSD::_dispatch(Message *m)
default:
{
- OpRequestRef op = op_tracker.create_request(m);
+ OpRequestRef op = op_tracker.create_request<OpRequest>(m);
op->mark_event("waiting_for_osdmap");
// no map? starting up?
if (!osdmap) {
@@ -4728,17 +4760,17 @@ bool OSD::scrub_should_schedule()
return false;
}
- if (loadavgs[0] >= g_conf->osd_scrub_load_threshold) {
+ if (loadavgs[0] >= cct->_conf->osd_scrub_load_threshold) {
dout(20) << "scrub_should_schedule loadavg " << loadavgs[0]
- << " >= max " << g_conf->osd_scrub_load_threshold
+ << " >= max " << cct->_conf->osd_scrub_load_threshold
<< " = no, load too high" << dendl;
return false;
}
dout(20) << "scrub_should_schedule loadavg " << loadavgs[0]
- << " < max " << g_conf->osd_scrub_load_threshold
+ << " < max " << cct->_conf->osd_scrub_load_threshold
<< " = yes" << dendl;
- return loadavgs[0] < g_conf->osd_scrub_load_threshold;
+ return loadavgs[0] < cct->_conf->osd_scrub_load_threshold;
}
void OSD::sched_scrub()
@@ -4749,7 +4781,7 @@ void OSD::sched_scrub()
dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
- utime_t now = ceph_clock_now(g_ceph_context);
+ utime_t now = ceph_clock_now(cct);
//dout(20) << " " << last_scrub_pg << dendl;
@@ -4761,15 +4793,15 @@ void OSD::sched_scrub()
dout(30) << "sched_scrub examine " << pgid << " at " << t << dendl;
utime_t diff = now - t;
- if ((double)diff < g_conf->osd_scrub_min_interval) {
+ if ((double)diff < cct->_conf->osd_scrub_min_interval) {
dout(10) << "sched_scrub " << pgid << " at " << t
- << ": " << (double)diff << " < min (" << g_conf->osd_scrub_min_interval << " seconds)" << dendl;
+ << ": " << (double)diff << " < min (" << cct->_conf->osd_scrub_min_interval << " seconds)" << dendl;
break;
}
- if ((double)diff < g_conf->osd_scrub_max_interval && !load_is_low) {
+ if ((double)diff < cct->_conf->osd_scrub_max_interval && !load_is_low) {
// save ourselves some effort
dout(10) << "sched_scrub " << pgid << " high load at " << t
- << ": " << (double)diff << " < max (" << g_conf->osd_scrub_max_interval << " seconds)" << dendl;
+ << ": " << (double)diff << " < max (" << cct->_conf->osd_scrub_max_interval << " seconds)" << dendl;
break;
}
@@ -4777,11 +4809,11 @@ void OSD::sched_scrub()
if (pg) {
if (pg->is_active() &&
(load_is_low ||
- (double)diff >= g_conf->osd_scrub_max_interval ||
+ (double)diff >= cct->_conf->osd_scrub_max_interval ||
pg->scrubber.must_scrub)) {
dout(10) << "sched_scrub scrubbing " << pgid << " at " << t
<< (pg->scrubber.must_scrub ? ", explicitly requested" :
- ( (double)diff >= g_conf->osd_scrub_max_interval ? ", diff >= max" : ""))
+ ( (double)diff >= cct->_conf->osd_scrub_max_interval ? ", diff >= max" : ""))
<< dendl;
if (pg->sched_scrub()) {
pg->unlock();
@@ -4800,13 +4832,13 @@ bool OSDService::inc_scrubs_pending()
bool result = false;
sched_scrub_lock.Lock();
- if (scrubs_pending + scrubs_active < g_conf->osd_max_scrubs) {
+ if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
dout(20) << "inc_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending+1)
- << " (max " << g_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
+ << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
result = true;
++scrubs_pending;
} else {
- dout(20) << "inc_scrubs_pending " << scrubs_pending << " + " << scrubs_active << " active >= max " << g_conf->osd_max_scrubs << dendl;
+ dout(20) << "inc_scrubs_pending " << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
}
sched_scrub_lock.Unlock();
@@ -4817,7 +4849,7 @@ void OSDService::dec_scrubs_pending()
{
sched_scrub_lock.Lock();
dout(20) << "dec_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending-1)
- << " (max " << g_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
+ << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
--scrubs_pending;
assert(scrubs_pending >= 0);
sched_scrub_lock.Unlock();
@@ -4830,12 +4862,12 @@ void OSDService::inc_scrubs_active(bool reserved)
if (reserved) {
--(scrubs_pending);
dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
- << " (max " << g_conf->osd_max_scrubs
+ << " (max " << cct->_conf->osd_max_scrubs
<< ", pending " << (scrubs_pending+1) << " -> " << scrubs_pending << ")" << dendl;
assert(scrubs_pending >= 0);
} else {
dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
- << " (max " << g_conf->osd_max_scrubs
+ << " (max " << cct->_conf->osd_max_scrubs
<< ", pending " << scrubs_pending << ")" << dendl;
}
sched_scrub_lock.Unlock();
@@ -4845,7 +4877,7 @@ void OSDService::dec_scrubs_active()
{
sched_scrub_lock.Lock();
dout(20) << "dec_scrubs_active " << scrubs_active << " -> " << (scrubs_active-1)
- << " (max " << g_conf->osd_max_scrubs << ", pending " << scrubs_pending << ")" << dendl;
+ << " (max " << cct->_conf->osd_max_scrubs << ", pending " << scrubs_pending << ")" << dendl;
--scrubs_active;
sched_scrub_lock.Unlock();
}
@@ -4864,10 +4896,10 @@ bool OSDService::prepare_to_stop()
osdmap->get_epoch(),
false
));
- utime_t now = ceph_clock_now(g_ceph_context);
+ utime_t now = ceph_clock_now(cct);
utime_t timeout;
- timeout.set_from_double(now + g_conf->osd_mon_shutdown_timeout);
- while ((ceph_clock_now(g_ceph_context) < timeout) &&
+ timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout);
+ while ((ceph_clock_now(cct) < timeout) &&
(state != STOPPING)) {
is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
}
@@ -4965,6 +4997,13 @@ void OSD::handle_osd_map(MOSDMap *m)
if (session)
session->put();
+ // share with the objecter
+ {
+ Mutex::Locker l(service.objecter_lock);
+ m->get();
+ service.objecter->handle_osd_map(m);
+ }
+
epoch_t first = m->get_first();
epoch_t last = m->get_last();
dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
@@ -5079,7 +5118,7 @@ void OSD::handle_osd_map(MOSDMap *m)
t.remove(coll_t::META_COLL, get_inc_osdmap_pobject_name(e));
superblock.oldest_map = e+1;
num++;
- if (num >= g_conf->osd_target_transaction_size &&
+ if (num >= cct->_conf->osd_target_transaction_size &&
(uint64_t)num > (last - first)) // make sure we at least keep pace with incoming maps
break;
}
@@ -5092,7 +5131,7 @@ void OSD::handle_osd_map(MOSDMap *m)
map_lock.get_write();
- C_Contexts *fin = new C_Contexts(g_ceph_context);
+ C_Contexts *fin = new C_Contexts(cct);
// advance through the new maps
for (epoch_t cur = start; cur <= superblock.newest_map; cur++) {
@@ -5119,7 +5158,7 @@ void OSD::handle_osd_map(MOSDMap *m)
superblock.current_epoch = cur;
advance_map(t, fin);
- had_map_since = ceph_clock_now(g_ceph_context);
+ had_map_since = ceph_clock_now(cct);
}
if (osdmap->is_up(whoami) &&
@@ -5553,15 +5592,15 @@ void OSD::send_incremental_map(epoch_t since, Connection *con)
return;
}
- if (to > since && (int64_t)(to - since) > g_conf->osd_map_share_max_epochs) {
- dout(10) << " " << (to - since) << " > max " << g_conf->osd_map_share_max_epochs
+ if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
+ dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
<< ", only sending most recent" << dendl;
- since = to - g_conf->osd_map_share_max_epochs;
+ since = to - cct->_conf->osd_map_share_max_epochs;
}
while (since < to) {
- if (to - since > (epoch_t)g_conf->osd_map_message_max)
- to = since + g_conf->osd_map_message_max;
+ if (to - since > (epoch_t)cct->_conf->osd_map_message_max)
+ to = since + cct->_conf->osd_map_message_max;
MOSDMap *m = build_incremental_map_msg(since, to);
send_map(m, con);
since = to;
@@ -5628,7 +5667,7 @@ OSDMapRef OSDService::_add_map(OSDMap *o)
{
epoch_t e = o->get_epoch();
- if (g_conf->osd_map_dedup) {
+ if (cct->_conf->osd_map_dedup) {
// Dedup against an existing map at a nearby epoch
OSDMapRef for_dedup = map_cache.lower_bound(e);
if (for_dedup) {
@@ -5676,9 +5715,9 @@ bool OSD::require_mon_peer(Message *m)
bool OSD::require_osd_peer(OpRequestRef op)
{
- if (!op->request->get_connection()->peer_is_osd()) {
- dout(0) << "require_osd_peer received from non-osd " << op->request->get_connection()->get_peer_addr()
- << " " << *op->request << dendl;
+ if (!op->get_req()->get_connection()->peer_is_osd()) {
+ dout(0) << "require_osd_peer received from non-osd " << op->get_req()->get_connection()->get_peer_addr()
+ << " " << *op->get_req() << dendl;
return false;
}
return true;
@@ -5690,7 +5729,7 @@ bool OSD::require_osd_peer(OpRequestRef op)
*/
bool OSD::require_same_or_newer_map(OpRequestRef op, epoch_t epoch)
{
- Message *m = op->request;
+ Message *m = op->get_req();
dout(15) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
assert(osd_lock.is_locked());
@@ -5708,7 +5747,7 @@ bool OSD::require_same_or_newer_map(OpRequestRef op, epoch_t epoch)
}
// ok, our map is same or newer.. do they still exist?
- if (m->get_source().is_osd()) {
+ if (m->get_connection()->get_messenger() == cluster_messenger) {
int from = m->get_source().num();
if (!osdmap->have_inst(from) ||
osdmap->get_cluster_addr(from) != m->get_source_inst().addr) {
@@ -5781,22 +5820,11 @@ void OSD::split_pgs(
dout(10) << "m_seed " << i->ps() << dendl;
dout(10) << "split_bits is " << split_bits << dendl;
- rctx->transaction->create_collection(
- coll_t(*i));
- rctx->transaction->split_collection(
- coll_t(parent->info.pgid),
+ parent->split_colls(
+ *i,
split_bits,
i->m_seed,
- coll_t(*i));
- if (parent->have_temp_coll()) {
- rctx->transaction->create_collection(
- coll_t::make_temp_coll(*i));
- rctx->transaction->split_collection(
- coll_t::make_temp_coll(parent->info.pgid),
- split_bits,
- i->m_seed,
- coll_t::make_temp_coll(*i));
- }
+ rctx->transaction);
parent->split_into(
*i,
child,
@@ -5813,14 +5841,14 @@ void OSD::split_pgs(
*/
void OSD::handle_pg_create(OpRequestRef op)
{
- MOSDPGCreate *m = (MOSDPGCreate*)op->request;
+ MOSDPGCreate *m = (MOSDPGCreate*)op->get_req();
assert(m->get_header().type == MSG_OSD_PG_CREATE);
dout(10) << "handle_pg_create " << *m << dendl;
// drop the next N pg_creates in a row?
if (debug_drop_pg_create_left < 0 &&
- g_conf->osd_debug_drop_pg_create_probability >
+ cct->_conf->osd_debug_drop_pg_create_probability >
((((double)(rand()%100))/100.0))) {
debug_drop_pg_create_left = debug_drop_pg_create_duration;
}
@@ -5833,11 +5861,16 @@ void OSD::handle_pg_create(OpRequestRef op)
}
}
- if (!require_mon_peer(op->request)) {
- // we have to hack around require_mon_peer's interface limits
- op->request = NULL;
+ /* we have to hack around require_mon_peer's interface limits, so
+ * grab an extra reference before going in. If the peer isn't
+ * a Monitor, the reference is put for us (and then cleared
+ * up automatically by our OpTracker infrastructure). Otherwise,
+ * we put the extra ref ourself.
+ */
+ if (!require_mon_peer(op->get_req()->get())) {
return;
}
+ op->get_req()->put();
if (!require_same_or_newer_map(op, m->epoch)) return;
@@ -5944,8 +5977,8 @@ void OSD::handle_pg_create(OpRequestRef op)
PG::RecoveryCtx OSD::create_context()
{
ObjectStore::Transaction *t = new ObjectStore::Transaction;
- C_Contexts *on_applied = new C_Contexts(g_ceph_context);
- C_Contexts *on_safe = new C_Contexts(g_ceph_context);
+ C_Contexts *on_applied = new C_Contexts(cct);
+ C_Contexts *on_safe = new C_Contexts(cct);
map< int, map<pg_t,pg_query_t> > *query_map =
new map<int, map<pg_t, pg_query_t> >;
map<int,vector<pair<pg_notify_t, pg_interval_map_t> > > *notify_list =
@@ -5966,8 +5999,8 @@ void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg)
ctx.transaction, ctx.on_applied, ctx.on_safe);
assert(tr == 0);
ctx.transaction = new ObjectStore::Transaction;
- ctx.on_applied = new C_Contexts(g_ceph_context);
- ctx.on_safe = new C_Contexts(g_ceph_context);
+ ctx.on_applied = new C_Contexts(cct);
+ ctx.on_safe = new C_Contexts(cct);
}
}
@@ -6142,7 +6175,7 @@ void OSD::do_infos(map<int,vector<pair<pg_notify_t, pg_interval_map_t> > >& info
*/
void OSD::handle_pg_notify(OpRequestRef op)
{
- MOSDPGNotify *m = (MOSDPGNotify*)op->request;
+ MOSDPGNotify *m = (MOSDPGNotify*)op->get_req();
assert(m->get_header().type == MSG_OSD_PG_NOTIFY);
dout(7) << "handle_pg_notify from " << m->get_source() << dendl;
@@ -6177,7 +6210,7 @@ void OSD::handle_pg_notify(OpRequestRef op)
void OSD::handle_pg_log(OpRequestRef op)
{
- MOSDPGLog *m = (MOSDPGLog*) op->request;
+ MOSDPGLog *m = (MOSDPGLog*) op->get_req();
assert(m->get_header().type == MSG_OSD_PG_LOG);
dout(7) << "handle_pg_log " << *m << " from " << m->get_source() << dendl;
@@ -6205,7 +6238,7 @@ void OSD::handle_pg_log(OpRequestRef op)
void OSD::handle_pg_info(OpRequestRef op)
{
- MOSDPGInfo *m = static_cast<MOSDPGInfo *>(op->request);
+ MOSDPGInfo *m = static_cast<MOSDPGInfo *>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_INFO);
dout(7) << "handle_pg_info " << *m << " from " << m->get_source() << dendl;
@@ -6238,7 +6271,7 @@ void OSD::handle_pg_info(OpRequestRef op)
void OSD::handle_pg_trim(OpRequestRef op)
{
- MOSDPGTrim *m = (MOSDPGTrim *)op->request;
+ MOSDPGTrim *m = (MOSDPGTrim *)op->get_req();
assert(m->get_header().type == MSG_OSD_PG_TRIM);
dout(7) << "handle_pg_trim " << *m << " from " << m->get_source() << dendl;
@@ -6291,7 +6324,7 @@ void OSD::handle_pg_trim(OpRequestRef op)
void OSD::handle_pg_scan(OpRequestRef op)
{
- MOSDPGScan *m = static_cast<MOSDPGScan*>(op->request);
+ MOSDPGScan *m = static_cast<MOSDPGScan*>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_SCAN);
dout(10) << "handle_pg_scan " << *m << " from " << m->get_source() << dendl;
@@ -6319,7 +6352,7 @@ void OSD::handle_pg_scan(OpRequestRef op)
void OSD::handle_pg_backfill(OpRequestRef op)
{
- MOSDPGBackfill *m = static_cast<MOSDPGBackfill*>(op->request);
+ MOSDPGBackfill *m = static_cast<MOSDPGBackfill*>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_BACKFILL);
dout(10) << "handle_pg_backfill " << *m << " from " << m->get_source() << dendl;
@@ -6347,7 +6380,7 @@ void OSD::handle_pg_backfill(OpRequestRef op)
void OSD::handle_pg_backfill_reserve(OpRequestRef op)
{
- MBackfillReserve *m = static_cast<MBackfillReserve*>(op->request);
+ MBackfillReserve *m = static_cast<MBackfillReserve*>(op->get_req());
assert(m->get_header().type == MSG_OSD_BACKFILL_RESERVE);
if (!require_osd_peer(op))
@@ -6391,7 +6424,7 @@ void OSD::handle_pg_backfill_reserve(OpRequestRef op)
void OSD::handle_pg_recovery_reserve(OpRequestRef op)
{
- MRecoveryReserve *m = static_cast<MRecoveryReserve*>(op->request);
+ MRecoveryReserve *m = static_cast<MRecoveryReserve*>(op->get_req());
assert(m->get_header().type == MSG_OSD_RECOVERY_RESERVE);
if (!require_osd_peer(op))
@@ -6443,7 +6476,7 @@ void OSD::handle_pg_query(OpRequestRef op)
{
assert(osd_lock.is_locked());
- MOSDPGQuery *m = (MOSDPGQuery*)op->request;
+ MOSDPGQuery *m = (MOSDPGQuery*)op->get_req();
assert(m->get_header().type == MSG_OSD_PG_QUERY);
if (!require_osd_peer(op))
@@ -6530,7 +6563,7 @@ void OSD::handle_pg_query(OpRequestRef op)
void OSD::handle_pg_remove(OpRequestRef op)
{
- MOSDPGRemove *m = (MOSDPGRemove *)op->request;
+ MOSDPGRemove *m = (MOSDPGRemove *)op->get_req();
assert(m->get_header().type == MSG_OSD_PG_REMOVE);
assert(osd_lock.is_locked());
@@ -6621,7 +6654,7 @@ void OSD::check_replay_queue()
{
assert(osd_lock.is_locked());
- utime_t now = ceph_clock_now(g_ceph_context);
+ utime_t now = ceph_clock_now(cct);
list< pair<pg_t,utime_t> > pgids;
replay_queue_lock.Lock();
while (!replay_queue.empty() &&
@@ -6665,12 +6698,12 @@ bool OSDService::queue_for_recovery(PG *pg)
bool OSD::_recover_now()
{
- if (recovery_ops_active >= g_conf->osd_recovery_max_active) {
+ if (recovery_ops_active >= cct->_conf->osd_recovery_max_active) {
dout(15) << "_recover_now active " << recovery_ops_active
- << " >= max " << g_conf->osd_recovery_max_active << dendl;
+ << " >= max " << cct->_conf->osd_recovery_max_active << dendl;
return false;
}
- if (ceph_clock_now(g_ceph_context) < defer_recovery_until) {
+ if (ceph_clock_now(cct) < defer_recovery_until) {
dout(15) << "_recover_now defer until " << defer_recovery_until << dendl;
return false;
}
@@ -6682,14 +6715,14 @@ void OSD::do_recovery(PG *pg, ThreadPool::TPHandle &handle)
{
// see how many we should try to start. note that this is a bit racy.
recovery_wq.lock();
- int max = MAX(g_conf->osd_recovery_max_active - recovery_ops_active,
- g_conf->osd_recovery_max_single_start);
+ int max = MIN(cct->_conf->osd_recovery_max_active - recovery_ops_active,
+ cct->_conf->osd_recovery_max_single_start);
if (max > 0) {
- dout(10) << "do_recovery can start " << max << " (" << recovery_ops_active << "/" << g_conf->osd_recovery_max_active
+ dout(10) << "do_recovery can start " << max << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active
<< " rops)" << dendl;
recovery_ops_active += max; // take them now, return them if we don't use them.
} else {
- dout(10) << "do_recovery can start 0 (" << recovery_ops_active << "/" << g_conf->osd_recovery_max_active
+ dout(10) << "do_recovery can start 0 (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active
<< " rops)" << dendl;
}
recovery_wq.unlock();
@@ -6750,7 +6783,7 @@ void OSD::start_recovery_op(PG *pg, const hobject_t& soid)
{
recovery_wq.lock();
dout(10) << "start_recovery_op " << *pg << " " << soid
- << " (" << recovery_ops_active << "/" << g_conf->osd_recovery_max_active << " rops)"
+ << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)"
<< dendl;
assert(recovery_ops_active >= 0);
recovery_ops_active++;
@@ -6769,7 +6802,7 @@ void OSD::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
recovery_wq.lock();
dout(10) << "finish_recovery_op " << *pg << " " << soid
<< " dequeue=" << dequeue
- << " (" << recovery_ops_active << "/" << g_conf->osd_recovery_max_active << " rops)"
+ << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)"
<< dendl;
// adjust count
@@ -6797,27 +6830,25 @@ void OSD::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
void OSDService::reply_op_error(OpRequestRef op, int err)
{
- reply_op_error(op, err, eversion_t());
+ reply_op_error(op, err, eversion_t(), 0);
}
-void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v)
+void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
+ version_t uv)
{
- MOSDOp *m = static_cast<MOSDOp*>(op->request);
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
assert(m->get_header().type == CEPH_MSG_OSD_OP);
int flags;
flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags);
- Messenger *msgr = client_messenger;
- reply->set_version(v);
- if (m->get_source().is_osd())
- msgr = cluster_messenger;
- msgr->send_message(reply, m->get_connection());
+ reply->set_reply_versions(v, uv);
+ m->get_connection()->get_messenger()->send_message(reply, m->get_connection());
}
void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
- MOSDOp *m = static_cast<MOSDOp*>(op->request);
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
assert(m->get_header().type == CEPH_MSG_OSD_OP);
if (m->get_map_epoch() < pg->info.history.same_primary_since) {
@@ -6836,7 +6867,7 @@ void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
void OSD::handle_op(OpRequestRef op)
{
- MOSDOp *m = static_cast<MOSDOp*>(op->request);
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
assert(m->get_header().type == CEPH_MSG_OSD_OP);
if (op_is_discardable(m)) {
dout(10) << " discardable " << *m << dendl;
@@ -6876,9 +6907,9 @@ void OSD::handle_op(OpRequestRef op)
}
}
- if (g_conf->osd_debug_drop_op_probability > 0 &&
+ if (cct->_conf->osd_debug_drop_op_probability > 0 &&
!m->get_source().is_mds()) {
- if ((double)rand() / (double)RAND_MAX < g_conf->osd_debug_drop_op_probability) {
+ if ((double)rand() / (double)RAND_MAX < cct->_conf->osd_debug_drop_op_probability) {
dout(0) << "handle_op DEBUG artificially dropping op " << *m << dendl;
return;
}
@@ -6900,11 +6931,11 @@ void OSD::handle_op(OpRequestRef op)
}
// too big?
- if (g_conf->osd_max_write_size &&
- m->get_data_len() > g_conf->osd_max_write_size << 20) {
+ if (cct->_conf->osd_max_write_size &&
+ m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
// journal can't hold commit!
derr << "handle_op msg data len " << m->get_data_len()
- << " > osd_max_write_size " << (g_conf->osd_max_write_size << 20)
+ << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
<< " on " << *m << dendl;
service.reply_op_error(op, -OSD_WRITETOOBIG);
return;
@@ -6971,7 +7002,7 @@ void OSD::handle_op(OpRequestRef op)
template<typename T, int MSGTYPE>
void OSD::handle_replica_op(OpRequestRef op)
{
- T *m = static_cast<T *>(op->request);
+ T *m = static_cast<T *>(op->get_req());
assert(m->get_header().type == MSGTYPE);
dout(10) << __func__ << *m << " epoch " << m->map_epoch << dendl;
@@ -7025,24 +7056,24 @@ bool OSD::op_is_discardable(MOSDOp *op)
*/
void OSD::enqueue_op(PG *pg, OpRequestRef op)
{
- utime_t latency = ceph_clock_now(g_ceph_context) - op->request->get_recv_stamp();
- dout(15) << "enqueue_op " << op << " prio " << op->request->get_priority()
- << " cost " << op->request->get_cost()
+ utime_t latency = ceph_clock_now(cct) - op->get_req()->get_recv_stamp();
+ dout(15) << "enqueue_op " << op << " prio " << op->get_req()->get_priority()
+ << " cost " << op->get_req()->get_cost()
<< " latency " << latency
- << " " << *(op->request) << dendl;
+ << " " << *(op->get_req()) << dendl;
pg->queue_op(op);
}
void OSD::OpWQ::_enqueue(pair<PGRef, OpRequestRef> item)
{
- unsigned priority = item.second->request->get_priority();
- unsigned cost = item.second->request->get_cost();
+ unsigned priority = item.second->get_req()->get_priority();
+ unsigned cost = item.second->get_req()->get_cost();
if (priority >= CEPH_MSG_PRIO_LOW)
pqueue.enqueue_strict(
- item.second->request->get_source_inst(),
+ item.second->get_req()->get_source_inst(),
priority, item);
else
- pqueue.enqueue(item.second->request->get_source_inst(),
+ pqueue.enqueue(item.second->get_req()->get_source_inst(),
priority, cost, item);
osd->logger->set(l_osd_opq, pqueue.length());
}
@@ -7057,14 +7088,14 @@ void OSD::OpWQ::_enqueue_front(pair<PGRef, OpRequestRef> item)
pg_for_processing[&*(item.first)].pop_back();
}
}
- unsigned priority = item.second->request->get_priority();
- unsigned cost = item.second->request->get_cost();
+ unsigned priority = item.second->get_req()->get_priority();
+ unsigned cost = item.second->get_req()->get_cost();
if (priority >= CEPH_MSG_PRIO_LOW)
pqueue.enqueue_strict_front(
- item.second->request->get_source_inst(),
+ item.second->get_req()->get_source_inst(),
priority, item);
else
- pqueue.enqueue_front(item.second->request->get_source_inst(),
+ pqueue.enqueue_front(item.second->get_req()->get_source_inst(),
priority, cost, item);
osd->logger->set(l_osd_opq, pqueue.length());
}
@@ -7116,11 +7147,11 @@ void OSD::dequeue_op(
PGRef pg, OpRequestRef op,
ThreadPool::TPHandle &handle)
{
- utime_t latency = ceph_clock_now(g_ceph_context) - op->request->get_recv_stamp();
- dout(10) << "dequeue_op " << op << " prio " << op->request->get_priority()
- << " cost " << op->request->get_cost()
+ utime_t latency = ceph_clock_now(cct) - op->get_req()->get_recv_stamp();
+ dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority()
+ << " cost " << op->get_req()->get_cost()
<< " latency " << latency
- << " " << *(op->request)
+ << " " << *(op->get_req())
<< " pg " << *pg << dendl;
if (pg->deleting)
return;
@@ -7221,6 +7252,8 @@ const char** OSD::get_tracked_conf_keys() const
{
static const char* KEYS[] = {
"osd_max_backfills",
+ "osd_op_complaint_time", "osd_op_log_threshold",
+ "osd_op_history_size", "osd_op_history_duration",
NULL
};
return KEYS;
@@ -7230,8 +7263,18 @@ void OSD::handle_conf_change(const struct md_config_t *conf,
const std::set <std::string> &changed)
{
if (changed.count("osd_max_backfills")) {
- service.local_reserver.set_max(g_conf->osd_max_backfills);
- service.remote_reserver.set_max(g_conf->osd_max_backfills);
+ service.local_reserver.set_max(cct->_conf->osd_max_backfills);
+ service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
+ }
+ if (changed.count("osd_op_complaint_time") ||
+ changed.count("osd_op_log_threshold")) {
+ op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
+ cct->_conf->osd_op_log_threshold);
+ }
+ if (changed.count("osd_op_history_size") ||
+ changed.count("osd_op_history_duration")) {
+ op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
+ cct->_conf->osd_op_history_duration);
}
}
@@ -7239,7 +7282,7 @@ void OSD::handle_conf_change(const struct md_config_t *conf,
int OSD::init_op_flags(OpRequestRef op)
{
- MOSDOp *m = static_cast<MOSDOp*>(op->request);
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
vector<OSDOp>::iterator iter;
// client flags have no bearing on whether an op is a read, write, etc.
@@ -7308,3 +7351,34 @@ int OSD::init_op_flags(OpRequestRef op)
return 0;
}
+
+bool OSD::RecoveryWQ::_enqueue(PG *pg) { // queue pg for recovery; returns true only when it was newly queued
+ if (!pg->recovery_item.is_on_list()) {
+ pg->get("RecoveryWQ"); // take a reference tagged "RecoveryWQ" while the PG is queued
+ osd->recovery_queue.push_back(&pg->recovery_item);
+
+ if (osd->cct->_conf->osd_recovery_delay_start > 0) { // a start delay is configured
+ osd->defer_recovery_until = ceph_clock_now(osd->cct); // now ...
+ osd->defer_recovery_until += osd->cct->_conf->osd_recovery_delay_start; // ... plus the configured delay
+ }
+ return true;
+ }
+ return false; // recovery_item is already on a list; nothing was changed
+}
+
+void OSD::PeeringWQ::_dequeue(list<PG*> *out) { // move PGs that are not in in_use from peering_queue into *out
+ set<PG*> got; // PGs handed out by this call
+ for (list<PG*>::iterator i = peering_queue.begin();
+ i != peering_queue.end() &&
+ out->size() < osd->cct->_conf->osd_peering_wq_batch_size; // stop once *out holds a full batch
+ ) {
+ if (in_use.count(*i)) {
+ ++i; // already in use: leave it queued and move on
+ } else {
+ out->push_back(*i);
+ got.insert(*i);
+ peering_queue.erase(i++); // post-increment keeps i valid across the erase
+ }
+ }
+ in_use.insert(got.begin(), got.end()); // record everything handed out as in use
+}
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 4d8c31e3046..9346cee6890 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -27,6 +27,7 @@
#include "common/WorkQueue.h"
#include "common/LogClient.h"
#include "common/AsyncReserver.h"
+#include "common/ceph_context.h"
#include "os/ObjectStore.h"
#include "OSDCap.h"
@@ -114,6 +115,10 @@ enum {
l_osd_waiting_for_map,
+ l_osd_stat_bytes,
+ l_osd_stat_bytes_used,
+ l_osd_stat_bytes_avail,
+
l_osd_last,
};
@@ -161,6 +166,7 @@ class OSDMap;
class MLog;
class MClass;
class MOSDPGMissing;
+class Objecter;
class Watch;
class Notification;
@@ -173,8 +179,6 @@ class HistoricOpsSocketHook;
class TestOpsSocketHook;
struct C_CompleteSplits;
-extern const coll_t meta_coll;
-
typedef std::tr1::shared_ptr<ObjectStore::Sequencer> SequencerRef;
class DeletingState {
@@ -281,6 +285,7 @@ class OSD;
class OSDService {
public:
OSD *osd;
+ CephContext *cct;
SharedPtrRegistry<pg_t, ObjectStore::Sequencer> osr_registry;
SharedPtrRegistry<pg_t, DeletingState> deleting_pgs;
const int whoami;
@@ -302,6 +307,7 @@ public:
ThreadPool::WorkQueue<PG> &scrub_wq;
ThreadPool::WorkQueue<PG> &scrub_finalize_wq;
ThreadPool::WorkQueue<MOSDRepScrub> &rep_scrub_wq;
+ GenContextWQ push_wq;
ClassHandler *&class_handler;
void dequeue_pg(PG *pg, list<OpRequestRef> *dequeued);
@@ -414,9 +420,29 @@ public:
void dec_scrubs_active();
void reply_op_error(OpRequestRef op, int err);
- void reply_op_error(OpRequestRef op, int err, eversion_t v);
+ void reply_op_error(OpRequestRef op, int err, eversion_t v, version_t uv);
void handle_misdirected_op(PG *pg, OpRequestRef op);
+ // -- Objecter, for tiering reads/writes from/to other OSDs --
+ Mutex objecter_lock;
+ SafeTimer objecter_timer;
+ OSDMap objecter_osdmap;
+ Objecter *objecter;
+ Finisher objecter_finisher;
+ struct ObjecterDispatcher : public Dispatcher { // Dispatcher callbacks for the objecter; non-inline bodies are defined outside this hunk
+ OSDService *osd; // service this dispatcher belongs to
+ bool ms_dispatch(Message *m);
+ bool ms_handle_reset(Connection *con);
+ void ms_handle_remote_reset(Connection *con) {} // deliberately a no-op
+ void ms_handle_connect(Connection *con);
+ bool ms_get_authorizer(int dest_type,
+ AuthAuthorizer **authorizer,
+ bool force_new);
+ ObjecterDispatcher(OSDService *o) : Dispatcher(o->cct), osd(o) {} // take cct from the service: a bare `cct` here is not an initialized value while the base is being constructed
+ } objecter_dispatcher;
+ friend class ObjecterDispatcher;
+
+
// -- Watch --
Mutex watch_lock;
SafeTimer watch_timer;
@@ -608,7 +634,22 @@ public:
#endif
OSDService(OSD *osd);
+ ~OSDService();
};
+
+struct C_OSD_SendMessageOnConn: public Context { // Context that sends a stored reply on a stored connection when finished
+ OSDService *osd; // service used to send the message
+ Message *reply; // message to send
+ ConnectionRef conn; // connection the reply goes out on
+ C_OSD_SendMessageOnConn(
+ OSDService *osd,
+ Message *reply,
+ ConnectionRef conn) : osd(osd), reply(reply), conn(conn) {}
+ void finish(int) { // the completion code is ignored
+ osd->send_message_osd_cluster(reply, conn.get());
+ }
+};
+
class OSD : public Dispatcher,
public md_config_obs_t {
/** OSD **/
@@ -627,6 +668,7 @@ protected:
Messenger *cluster_messenger;
Messenger *client_messenger;
+ Messenger *objecter_messenger;
MonClient *monc;
PerfCounters *logger;
PerfCounters *recoverystate_perf;
@@ -704,6 +746,25 @@ public:
return oid;
}
static void recursive_remove_collection(ObjectStore *store, coll_t tmp);
+
+ /**
+ * get_osd_initial_compat_set()
+ *
+ * Get the initial feature set for this OSD. Features
+ * here are automatically upgraded.
+ *
+ * Return value: Initial osd CompatSet
+ */
+ static CompatSet get_osd_initial_compat_set();
+
+ /**
+ * get_osd_compat_set()
+ *
+ * Get all features supported by this OSD
+ *
+ * Return value: CompatSet of all supported features
+ */
+ static CompatSet get_osd_compat_set();
private:
@@ -832,7 +893,8 @@ public:
bool heartbeat_dispatch(Message *m);
struct HeartbeatDispatcher : public Dispatcher {
- private:
+ OSD *osd;
+ HeartbeatDispatcher(OSD *o) : Dispatcher(o->cct), osd(o) {} // use the OSD's cct: a bare `cct` here is not an initialized value while the base is being constructed
bool ms_dispatch(Message *m) {
return osd->heartbeat_dispatch(m);
};
@@ -846,15 +908,8 @@ public:
isvalid = true;
return true;
}
- public:
- OSD *osd;
- HeartbeatDispatcher(OSD *o)
- : Dispatcher(g_ceph_context), osd(o)
- {
- }
} heartbeat_dispatcher;
-
private:
// -- stats --
Mutex stat_lock;
@@ -986,22 +1041,7 @@ private:
bool _empty() {
return peering_queue.empty();
}
- void _dequeue(list<PG*> *out) {
- set<PG*> got;
- for (list<PG*>::iterator i = peering_queue.begin();
- i != peering_queue.end() &&
- out->size() < g_conf->osd_peering_wq_batch_size;
- ) {
- if (in_use.count(*i)) {
- ++i;
- } else {
- out->push_back(*i);
- got.insert(*i);
- peering_queue.erase(i++);
- }
- }
- in_use.insert(got.begin(), got.end());
- }
+ void _dequeue(list<PG*> *out);
void _process(
const list<PG *> &pgs,
ThreadPool::TPHandle &handle) {
@@ -1377,19 +1417,7 @@ protected:
bool _empty() {
return osd->recovery_queue.empty();
}
- bool _enqueue(PG *pg) {
- if (!pg->recovery_item.is_on_list()) {
- pg->get("RecoveryWQ");
- osd->recovery_queue.push_back(&pg->recovery_item);
-
- if (g_conf->osd_recovery_delay_start > 0) {
- osd->defer_recovery_until = ceph_clock_now(g_ceph_context);
- osd->defer_recovery_until += g_conf->osd_recovery_delay_start;
- }
- return true;
- }
- return false;
- }
+ bool _enqueue(PG *pg);
void _dequeue(PG *pg) {
if (pg->recovery_item.remove_myself())
pg->put("RecoveryWQ");
@@ -1676,22 +1704,28 @@ protected:
public:
/* internal and external can point to the same messenger, they will still
* be cleaned up properly*/
- OSD(int id, Messenger *internal, Messenger *external,
- Messenger *hb_client, Messenger *hb_front_server, Messenger *hb_back_server,
+ OSD(CephContext *cct_,
+ int id,
+ Messenger *internal,
+ Messenger *external,
+ Messenger *hb_client,
+ Messenger *hb_front_server,
+ Messenger *hb_back_server,
+ Messenger *osdc_messenger,
MonClient *mc, const std::string &dev, const std::string &jdev);
~OSD();
// static bits
static int find_osd_dev(char *result, int whoami);
- static ObjectStore *create_object_store(const std::string &dev, const std::string &jdev);
+ static ObjectStore *create_object_store(CephContext *cct, const std::string &dev, const std::string &jdev);
static int convertfs(const std::string &dev, const std::string &jdev);
static int do_convertfs(ObjectStore *store);
static int convert_collection(ObjectStore *store, coll_t cid);
- static int mkfs(const std::string &dev, const std::string &jdev,
+ static int mkfs(CephContext *cct, const std::string &dev, const std::string &jdev,
uuid_d fsid, int whoami);
- static int mkjournal(const std::string &dev, const std::string &jdev);
- static int flushjournal(const std::string &dev, const std::string &jdev);
- static int dump_journal(const std::string &dev, const std::string &jdev, ostream& out);
+ static int mkjournal(CephContext *cct, const std::string &dev, const std::string &jdev);
+ static int flushjournal(CephContext *cct, const std::string &dev, const std::string &jdev);
+ static int dump_journal(CephContext *cct, const std::string &dev, const std::string &jdev, ostream& out);
/* remove any non-user xattrs from a map of them */
void filter_xattrs(map<string, bufferptr>& attrs) {
for (map<string, bufferptr>::iterator iter = attrs.begin();
@@ -1704,10 +1738,6 @@ protected:
}
private:
- static int write_meta(const std::string &base, const std::string &file,
- const char *val, size_t vallen);
- static int read_meta(const std::string &base, const std::string &file,
- char *val, size_t vallen);
static int write_meta(const std::string &base,
uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami);
public:
@@ -1719,6 +1749,7 @@ public:
// startup/shutdown
int pre_init();
int init();
+ void final_init();
void suicide(int exitcode);
int shutdown();
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index 3b7b498eb27..8007d613b8c 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -757,6 +757,10 @@ uint64_t OSDMap::get_features(uint64_t *pmask) const
if (p->second.flags & pg_pool_t::FLAG_HASHPSPOOL) {
features |= CEPH_FEATURE_OSDHASHPSPOOL;
}
+ if (!p->second.tiers.empty() ||
+ p->second.is_tier()) {
+ features |= CEPH_FEATURE_OSD_CACHEPOOL;
+ }
}
mask |= CEPH_FEATURE_OSDHASHPSPOOL;
@@ -1841,7 +1845,9 @@ void OSDMap::build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
crush.set_type_name(6, "root");
// root
- int rootid = crush.add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_DEFAULT, 6 /* pool */, 0, NULL, NULL);
+ int rootid;
+ int r = crush.add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_DEFAULT, 6 /* pool */, 0, NULL, NULL, &rootid);
+ assert(r == 0);
crush.set_item_name(rootid, "default");
for (int o=0; o<nosd; o++) {
@@ -1971,7 +1977,9 @@ void OSDMap::build_simple_crush_map_from_conf(CephContext *cct, CrushWrapper& cr
set<string> hosts, racks;
// root
- int rootid = crush.add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_DEFAULT, 6 /* pool */, 0, NULL, NULL);
+ int rootid;
+ int r = crush.add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_DEFAULT, 6 /* pool */, 0, NULL, NULL, &rootid);
+ assert(r == 0);
crush.set_item_name(rootid, "default");
// add osds
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
index 2b0cbb8020c..bd8f09b682e 100644
--- a/src/osd/OSDMap.h
+++ b/src/osd/OSDMap.h
@@ -165,6 +165,12 @@ public:
Incremental(bufferlist::iterator &p) {
decode(p);
}
+
+ pg_pool_t *get_new_pool(int64_t pool, const pg_pool_t *orig) { // return the new_pools entry for pool, creating it from *orig if absent
+ if (new_pools.count(pool) == 0)
+ new_pools[pool] = *orig; // first request for this pool: start from a copy of *orig
+ return &new_pools[pool]; // pointer into the new_pools map
+ }
};
private:
diff --git a/src/osd/OpRequest.cc b/src/osd/OpRequest.cc
index c694362a8a5..2ed7a23086f 100644
--- a/src/osd/OpRequest.cc
+++ b/src/osd/OpRequest.cc
@@ -11,229 +11,21 @@
#include "messages/MOSDSubOp.h"
#include "include/assert.h"
-#define dout_subsys ceph_subsys_optracker
-#undef dout_prefix
-#define dout_prefix _prefix(_dout)
-static ostream& _prefix(std::ostream* _dout)
-{
- return *_dout << "--OSD::tracker-- ";
-}
OpRequest::OpRequest(Message *req, OpTracker *tracker) :
- request(req), xitem(this),
+ TrackedOp(req, tracker),
rmw_flags(0),
- warn_interval_multiplier(1),
- lock("OpRequest::lock"),
- tracker(tracker),
- hit_flag_points(0), latest_flag_point(0),
- seq(0) {
- received_time = request->get_recv_stamp();
- tracker->register_inflight_op(&xitem);
- if (req->get_priority() < g_conf->osd_client_op_priority) {
+ hit_flag_points(0), latest_flag_point(0) {
+ if (req->get_priority() < tracker->cct->_conf->osd_client_op_priority) {
// don't warn as quickly for low priority ops
- warn_interval_multiplier = g_conf->osd_recovery_op_warn_multiple;
- }
-}
-
-void OpHistory::on_shutdown()
-{
- arrived.clear();
- duration.clear();
- shutdown = true;
-}
-
-void OpHistory::insert(utime_t now, OpRequestRef op)
-{
- if (shutdown)
- return;
- duration.insert(make_pair(op->get_duration(), op));
- arrived.insert(make_pair(op->get_arrived(), op));
- cleanup(now);
-}
-
-void OpHistory::cleanup(utime_t now)
-{
- while (arrived.size() &&
- (now - arrived.begin()->first >
- (double)(g_conf->osd_op_history_duration))) {
- duration.erase(make_pair(
- arrived.begin()->second->get_duration(),
- arrived.begin()->second));
- arrived.erase(arrived.begin());
- }
-
- while (duration.size() > g_conf->osd_op_history_size) {
- arrived.erase(make_pair(
- duration.begin()->second->get_arrived(),
- duration.begin()->second));
- duration.erase(duration.begin());
- }
-}
-
-void OpHistory::dump_ops(utime_t now, Formatter *f)
-{
- cleanup(now);
- f->open_object_section("OpHistory");
- f->dump_int("num to keep", g_conf->osd_op_history_size);
- f->dump_int("duration to keep", g_conf->osd_op_history_duration);
- {
- f->open_array_section("Ops");
- for (set<pair<utime_t, OpRequestRef> >::const_iterator i =
- arrived.begin();
- i != arrived.end();
- ++i) {
- f->open_object_section("Op");
- i->second->dump(now, f);
- f->close_section();
- }
- f->close_section();
+ warn_interval_multiplier = tracker->cct->_conf->osd_recovery_op_warn_multiple;
}
- f->close_section();
-}
-
-void OpTracker::dump_historic_ops(Formatter *f)
-{
- Mutex::Locker locker(ops_in_flight_lock);
- utime_t now = ceph_clock_now(g_ceph_context);
- history.dump_ops(now, f);
}
-void OpTracker::dump_ops_in_flight(Formatter *f)
-{
- Mutex::Locker locker(ops_in_flight_lock);
- f->open_object_section("ops_in_flight"); // overall dump
- f->dump_int("num_ops", ops_in_flight.size());
- f->open_array_section("ops"); // list of OpRequests
- utime_t now = ceph_clock_now(g_ceph_context);
- for (xlist<OpRequest*>::iterator p = ops_in_flight.begin(); !p.end(); ++p) {
- f->open_object_section("op");
- (*p)->dump(now, f);
- f->close_section(); // this OpRequest
- }
- f->close_section(); // list of OpRequests
- f->close_section(); // overall dump
-}
-
-void OpTracker::register_inflight_op(xlist<OpRequest*>::item *i)
-{
- Mutex::Locker locker(ops_in_flight_lock);
- ops_in_flight.push_back(i);
- ops_in_flight.back()->seq = seq++;
-}
-
-void OpTracker::unregister_inflight_op(OpRequest *i)
-{
- Mutex::Locker locker(ops_in_flight_lock);
- assert(i->xitem.get_list() == &ops_in_flight);
- utime_t now = ceph_clock_now(g_ceph_context);
- i->xitem.remove_myself();
- i->request->clear_data();
- history.insert(now, OpRequestRef(i));
-}
-
-bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
-{
- Mutex::Locker locker(ops_in_flight_lock);
- if (!ops_in_flight.size())
- return false;
-
- utime_t now = ceph_clock_now(g_ceph_context);
- utime_t too_old = now;
- too_old -= g_conf->osd_op_complaint_time;
-
- utime_t oldest_secs = now - ops_in_flight.front()->received_time;
-
- dout(10) << "ops_in_flight.size: " << ops_in_flight.size()
- << "; oldest is " << oldest_secs
- << " seconds old" << dendl;
-
- if (oldest_secs < g_conf->osd_op_complaint_time)
- return false;
-
- xlist<OpRequest*>::iterator i = ops_in_flight.begin();
- warning_vector.reserve(g_conf->osd_op_log_threshold + 1);
-
- int slow = 0; // total slow
- int warned = 0; // total logged
- while (!i.end() && (*i)->received_time < too_old) {
- slow++;
-
- // exponential backoff of warning intervals
- if (((*i)->received_time +
- (g_conf->osd_op_complaint_time *
- (*i)->warn_interval_multiplier)) < now) {
- // will warn
- if (warning_vector.empty())
- warning_vector.push_back("");
- warned++;
- if (warned > g_conf->osd_op_log_threshold)
- break;
-
- utime_t age = now - (*i)->received_time;
- stringstream ss;
- ss << "slow request " << age << " seconds old, received at " << (*i)->received_time
- << ": " << *((*i)->request) << " currently "
- << ((*i)->current.size() ? (*i)->current : (*i)->state_string());
- warning_vector.push_back(ss.str());
-
- // only those that have been shown will backoff
- (*i)->warn_interval_multiplier *= 2;
- }
- ++i;
- }
-
- // only summarize if we warn about any. if everything has backed
- // off, we will stay silent.
- if (warned > 0) {
- stringstream ss;
- ss << slow << " slow requests, " << warned << " included below; oldest blocked for > "
- << oldest_secs << " secs";
- warning_vector[0] = ss.str();
- }
-
- return warning_vector.size();
-}
-
-void OpTracker::get_age_ms_histogram(pow2_hist_t *h)
-{
- Mutex::Locker locker(ops_in_flight_lock);
-
- h->clear();
-
- utime_t now = ceph_clock_now(NULL);
- unsigned bin = 30;
- uint32_t lb = 1 << (bin-1); // lower bound for this bin
- int count = 0;
- for (xlist<OpRequest*>::iterator i = ops_in_flight.begin(); !i.end(); ++i) {
- utime_t age = now - (*i)->received_time;
- uint32_t ms = (long)(age * 1000.0);
- if (ms >= lb) {
- count++;
- continue;
- }
- if (count)
- h->set(bin, count);
- while (lb > ms) {
- bin--;
- lb >>= 1;
- }
- count = 1;
- }
- if (count)
- h->set(bin, count);
-}
-
-void OpRequest::dump(utime_t now, Formatter *f) const
+void OpRequest::_dump(utime_t now, Formatter *f) const
{
Message *m = request;
- stringstream name;
- m->print(name);
- f->dump_string("description", name.str().c_str()); // this OpRequest
- f->dump_unsigned("rmw_flags", rmw_flags);
- f->dump_stream("received_at") << received_time;
- f->dump_float("age", now - received_time);
- f->dump_float("duration", get_duration());
f->dump_string("flag_point", state_string());
if (m->get_orig_source().is_client()) {
f->open_object_section("client_info");
@@ -257,50 +49,11 @@ void OpRequest::dump(utime_t now, Formatter *f) const
}
}
-void OpTracker::mark_event(OpRequest *op, const string &dest)
-{
- utime_t now = ceph_clock_now(g_ceph_context);
- return _mark_event(op, dest, now);
-}
-
-void OpTracker::_mark_event(OpRequest *op, const string &evt,
- utime_t time)
-{
- Mutex::Locker locker(ops_in_flight_lock);
- dout(5) << "reqid: " << op->get_reqid() << ", seq: " << op->seq
- << ", time: " << time << ", event: " << evt
- << ", request: " << *op->request << dendl;
-}
-
-void OpTracker::RemoveOnDelete::operator()(OpRequest *op) {
- op->mark_event("done");
- tracker->unregister_inflight_op(op);
- // Do not delete op, unregister_inflight_op took control
-}
-
-OpRequestRef OpTracker::create_request(Message *ref)
-{
- OpRequestRef retval(new OpRequest(ref, this),
- RemoveOnDelete(this));
-
- if (ref->get_type() == CEPH_MSG_OSD_OP) {
- retval->reqid = static_cast<MOSDOp*>(ref)->get_reqid();
- } else if (ref->get_type() == MSG_OSD_SUBOP) {
- retval->reqid = static_cast<MOSDSubOp*>(ref)->reqid;
- }
- _mark_event(retval.get(), "header_read", ref->get_recv_stamp());
- _mark_event(retval.get(), "throttled", ref->get_throttle_stamp());
- _mark_event(retval.get(), "all_read", ref->get_recv_complete_stamp());
- _mark_event(retval.get(), "dispatched", ref->get_dispatch_stamp());
- return retval;
-}
-
-void OpRequest::mark_event(const string &event)
+void OpRequest::init_from_message()
{
- utime_t now = ceph_clock_now(g_ceph_context);
- {
- Mutex::Locker l(lock);
- events.push_back(make_pair(now, event));
+ if (request->get_type() == CEPH_MSG_OSD_OP) {
+ reqid = static_cast<MOSDOp*>(request)->get_reqid();
+ } else if (request->get_type() == MSG_OSD_SUBOP) {
+ reqid = static_cast<MOSDSubOp*>(request)->reqid;
}
- tracker->mark_event(this, event);
}
diff --git a/src/osd/OpRequest.h b/src/osd/OpRequest.h
index fc8a8ab82c9..87571f58787 100644
--- a/src/osd/OpRequest.h
+++ b/src/osd/OpRequest.h
@@ -25,80 +25,12 @@
#include "common/TrackedOp.h"
#include "osd/osd_types.h"
-struct OpRequest;
-typedef std::tr1::shared_ptr<OpRequest> OpRequestRef;
-class OpHistory {
- set<pair<utime_t, OpRequestRef> > arrived;
- set<pair<double, OpRequestRef> > duration;
- void cleanup(utime_t now);
- bool shutdown;
-
-public:
- OpHistory() : shutdown(false) {}
- ~OpHistory() {
- assert(arrived.empty());
- assert(duration.empty());
- }
- void insert(utime_t now, OpRequestRef op);
- void dump_ops(utime_t now, Formatter *f);
- void on_shutdown();
-};
-
-class OpTracker {
- class RemoveOnDelete {
- OpTracker *tracker;
- public:
- RemoveOnDelete(OpTracker *tracker) : tracker(tracker) {}
- void operator()(OpRequest *op);
- };
- friend class RemoveOnDelete;
- uint64_t seq;
- Mutex ops_in_flight_lock;
- xlist<OpRequest *> ops_in_flight;
- OpHistory history;
-
-public:
- OpTracker() : seq(0), ops_in_flight_lock("OpTracker mutex") {}
- void dump_ops_in_flight(Formatter *f);
- void dump_historic_ops(Formatter *f);
- void register_inflight_op(xlist<OpRequest*>::item *i);
- void unregister_inflight_op(OpRequest *i);
-
- void get_age_ms_histogram(pow2_hist_t *h);
-
- /**
- * Look for Ops which are too old, and insert warning
- * strings for each Op that is too old.
- *
- * @param warning_strings A vector<string> reference which is filled
- * with a warning string for each old Op.
- * @return True if there are any Ops to warn on, false otherwise.
- */
- bool check_ops_in_flight(std::vector<string> &warning_strings);
- void mark_event(OpRequest *op, const string &evt);
- void _mark_event(OpRequest *op, const string &evt, utime_t now);
- OpRequestRef create_request(Message *req);
- void on_shutdown() {
- Mutex::Locker l(ops_in_flight_lock);
- history.on_shutdown();
- }
- ~OpTracker() {
- assert(ops_in_flight.empty());
- }
-};
-
/**
* The OpRequest takes in a Message* and takes over a single reference
* to it, which it puts() when destroyed.
- * OpRequest is itself ref-counted. The expectation is that you get a Message
- * you want to track, create an OpRequest with it, and then pass around that OpRequest
- * the way you used to pass around the Message.
*/
struct OpRequest : public TrackedOp {
friend class OpTracker;
- friend class OpHistory;
- Message *request;
- xlist<OpRequest*>::item xitem;
// rmw flags
int rmw_flags;
@@ -127,28 +59,12 @@ struct OpRequest : public TrackedOp {
void set_class_write() { rmw_flags |= CEPH_OSD_RMW_FLAG_CLASS_WRITE; }
void set_pg_op() { rmw_flags |= CEPH_OSD_RMW_FLAG_PGOP; }
- utime_t received_time;
- uint8_t warn_interval_multiplier;
- utime_t get_arrived() const {
- return received_time;
- }
- double get_duration() const {
- return events.size() ?
- (events.rbegin()->first - received_time) :
- 0.0;
- }
-
- void dump(utime_t now, Formatter *f) const;
+ void _dump(utime_t now, Formatter *f) const;
private:
- list<pair<utime_t, string> > events;
- string current;
- Mutex lock;
- OpTracker *tracker;
osd_reqid_t reqid;
uint8_t hit_flag_points;
uint8_t latest_flag_point;
- uint64_t seq;
static const uint8_t flag_queued_for_pg=1 << 0;
static const uint8_t flag_reached_pg = 1 << 1;
static const uint8_t flag_delayed = 1 << 2;
@@ -157,12 +73,8 @@ private:
static const uint8_t flag_commit_sent = 1 << 5;
OpRequest(Message *req, OpTracker *tracker);
-public:
- ~OpRequest() {
- assert(request);
- request->put();
- }
+public:
bool been_queued_for_pg() { return hit_flag_points & flag_queued_for_pg; }
bool been_reached_pg() { return hit_flag_points & flag_reached_pg; }
bool been_delayed() { return hit_flag_points & flag_delayed; }
@@ -226,10 +138,15 @@ public:
latest_flag_point = flag_commit_sent;
}
- void mark_event(const string &event);
osd_reqid_t get_reqid() const {
return reqid;
}
+
+ void init_from_message();
+
+ typedef std::tr1::shared_ptr<OpRequest> Ref;
};
+typedef OpRequest::Ref OpRequestRef;
+
#endif /* OPREQUEST_H_ */
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index cd5621cddf2..8f7d3ccb684 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -142,6 +142,7 @@ PG::PG(OSDService *o, OSDMapRef curmap,
const PGPool &_pool, pg_t p, const hobject_t& loid,
const hobject_t& ioid) :
osd(o),
+ cct(o->cct),
osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()),
snap_mapper(
&osdriver,
@@ -158,7 +159,7 @@ PG::PG(OSDService *o, OSDMapRef curmap,
deleting(false), dirty_info(false), dirty_big_info(false),
info(p),
info_struct_v(0),
- coll(p), pg_log(g_ceph_context), log_oid(loid), biginfo_oid(ioid),
+ coll(p), pg_log(cct), log_oid(loid), biginfo_oid(ioid),
recovery_item(this), scrub_item(this), scrub_finalize_item(this), snap_trim_item(this), stat_queue_item(this),
recovery_ops_active(0),
waiting_on_backfill(0),
@@ -1094,7 +1095,7 @@ void PG::activate(ObjectStore::Transaction& t,
if (is_primary() &&
pool.info.crash_replay_interval > 0 &&
may_need_replay(get_osdmap())) {
- replay_until = ceph_clock_now(g_ceph_context);
+ replay_until = ceph_clock_now(cct);
replay_until += pool.info.crash_replay_interval;
dout(10) << "activate starting replay interval for " << pool.info.crash_replay_interval
<< " until " << replay_until << dendl;
@@ -1223,7 +1224,7 @@ void PG::activate(ObjectStore::Transaction& t,
m = new MOSDPGLog(get_osdmap()->get_epoch(), pi);
// send some recent log, so that op dup detection works well.
- m->log.copy_up_to(pg_log.get_log(), g_conf->osd_min_pg_log_entries);
+ m->log.copy_up_to(pg_log.get_log(), cct->_conf->osd_min_pg_log_entries);
m->info.log_tail = m->log.tail;
pi.log_tail = m->log.tail; // sigh...
@@ -1331,10 +1332,10 @@ void PG::do_pending_flush()
bool PG::op_has_sufficient_caps(OpRequestRef op)
{
// only check MOSDOp
- if (op->request->get_type() != CEPH_MSG_OSD_OP)
+ if (op->get_req()->get_type() != CEPH_MSG_OSD_OP)
return true;
- MOSDOp *req = static_cast<MOSDOp*>(op->request);
+ MOSDOp *req = static_cast<MOSDOp*>(op->get_req());
OSD::Session *session = (OSD::Session *)req->get_connection()->get_priv();
if (!session) {
@@ -1397,76 +1398,6 @@ void PG::queue_op(OpRequestRef op)
osd->op_wq.queue(make_pair(PGRef(this), op));
}
-void PG::do_request(
- OpRequestRef op,
- ThreadPool::TPHandle &handle)
-{
- // do any pending flush
- do_pending_flush();
-
- if (!op_has_sufficient_caps(op)) {
- osd->reply_op_error(op, -EPERM);
- return;
- }
- assert(!op_must_wait_for_map(get_osdmap(), op));
- if (can_discard_request(op)) {
- return;
- }
- if (!flushed) {
- dout(20) << " !flushed, waiting for active on " << op << dendl;
- waiting_for_active.push_back(op);
- return;
- }
-
- switch (op->request->get_type()) {
- case CEPH_MSG_OSD_OP:
- if (is_replay() || !is_active()) {
- dout(20) << " replay, waiting for active on " << op << dendl;
- waiting_for_active.push_back(op);
- return;
- }
- do_op(op); // do it now
- break;
-
- case MSG_OSD_SUBOP:
- do_sub_op(op);
- break;
-
- case MSG_OSD_SUBOPREPLY:
- do_sub_op_reply(op);
- break;
-
- case MSG_OSD_PG_SCAN:
- do_scan(op, handle);
- break;
-
- case MSG_OSD_PG_BACKFILL:
- do_backfill(op);
- break;
-
- case MSG_OSD_PG_PUSH:
- if (!is_active()) {
- waiting_for_active.push_back(op);
- op->mark_delayed("waiting for active");
- return;
- }
- do_push(op);
- break;
-
- case MSG_OSD_PG_PULL:
- do_pull(op);
- break;
-
- case MSG_OSD_PG_PUSH_REPLY:
- do_push_reply(op);
- break;
-
- default:
- assert(0 == "bad message type in do_request");
- }
-}
-
-
void PG::replay_queued_ops()
{
assert(is_replay() && is_active());
@@ -1486,7 +1417,7 @@ void PG::replay_queued_ops()
c = p->first;
}
dout(10) << "activate replay " << p->first << " "
- << *p->second->request << dendl;
+ << *p->second->get_req() << dendl;
replay.push_back(p->second);
}
replay_queue.clear();
@@ -1751,6 +1682,8 @@ void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
info.last_update = pg_log.get_head();
child->info.last_update = child->pg_log.get_head();
+ child->info.last_user_version = info.last_user_version;
+
info.log_tail = pg_log.get_tail();
child->info.log_tail = child->pg_log.get_tail();
@@ -1916,7 +1849,7 @@ void PG::publish_stats_to_osd()
else
state_clear(PG_STATE_INCONSISTENT);
- utime_t now = ceph_clock_now(g_ceph_context);
+ utime_t now = ceph_clock_now(cct);
info.stats.last_fresh = now;
if (info.stats.state != state) {
info.stats.state = state;
@@ -2063,8 +1996,7 @@ void PG::upgrade(ObjectStore *store, const interval_set<snapid_t> &snapcolls)
hobject_t cur;
vector<hobject_t> objects;
while (1) {
- int r = store->collection_list_partial(
- cid,
+ int r = get_pgbackend()->objects_list_partial(
cur,
store->get_ideal_list_min(),
store->get_ideal_list_max(),
@@ -2112,8 +2044,7 @@ void PG::upgrade(ObjectStore *store, const interval_set<snapid_t> &snapcolls)
while (1) {
dout(1) << "Updating snap_mapper from main collection, "
<< done << " objects done" << dendl;
- int r = store->collection_list_partial(
- cid,
+ int r = get_pgbackend()->objects_list_partial(
cur,
store->get_ideal_list_min(),
store->get_ideal_list_max(),
@@ -2136,19 +2067,16 @@ void PG::upgrade(ObjectStore *store, const interval_set<snapid_t> &snapcolls)
++j) {
if (j->snap < CEPH_MAXSNAP) {
OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
- bufferptr bp;
- r = store->getattr(
- cid,
+ bufferlist bl;
+ r = get_pgbackend()->objects_get_attr(
*j,
OI_ATTR,
- bp);
+ &bl);
if (r < 0) {
derr << __func__ << ": getattr returned "
<< cpp_strerror(r) << dendl;
assert(0);
}
- bufferlist bl;
- bl.push_back(bp);
object_info_t oi(bl);
set<snapid_t> oi_snaps(oi.snaps.begin(), oi.snaps.end());
set<snapid_t> cur_snaps;
@@ -2250,7 +2178,8 @@ epoch_t PG::peek_map_epoch(ObjectStore *store, coll_t coll, hobject_t &infos_oid
snapid_t snap;
bool ok = coll.is_pg(pgid, snap);
assert(ok);
- store->collection_getattr(coll, "info", *bl);
+ int r = store->collection_getattr(coll, "info", *bl);
+ assert(r > 0);
bufferlist::iterator bp = bl->begin();
__u8 struct_v = 0;
::decode(struct_v, bp);
@@ -2305,6 +2234,11 @@ void PG::add_log_entry(pg_log_entry_t& e, bufferlist& log_bl)
assert(e.version > info.last_update);
info.last_update = e.version;
+ // raise user_version, if it increased (it may have not get bumped
+ // by all logged updates)
+ if (e.user_version > info.last_user_version)
+ info.last_user_version = e.user_version;
+
// log mutation
pg_log.add(e);
dout(10) << "add_log_entry " << e << dendl;
@@ -2472,9 +2406,8 @@ void PG::log_weirdness()
<< " log bound mismatch, empty but (" << pg_log.get_tail() << ","
<< pg_log.get_head() << "]\n";
} else {
- if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()) || // sloppy check
- (pg_log.get_log().log.rbegin()->version != pg_log.get_head() &&
- !(pg_log.get_head() == pg_log.get_tail())))
+ // sloppy check
+ if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
osd->clog.error() << info.pgid
<< " log bound mismatch, info (" << pg_log.get_tail() << ","
<< pg_log.get_head() << "]"
@@ -2618,8 +2551,8 @@ bool PG::sched_scrub()
return false;
}
- bool time_for_deep = (ceph_clock_now(g_ceph_context) >
- info.history.last_deep_scrub_stamp + g_conf->osd_deep_scrub_interval);
+ bool time_for_deep = (ceph_clock_now(cct) >
+ info.history.last_deep_scrub_stamp + cct->_conf->osd_deep_scrub_interval);
//NODEEP_SCRUB so ignore time initiated deep-scrub
if (osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB))
@@ -2685,7 +2618,7 @@ void PG::unreg_next_scrub()
void PG::sub_op_scrub_map(OpRequestRef op)
{
- MOSDSubOp *m = static_cast<MOSDSubOp *>(op->request);
+ MOSDSubOp *m = static_cast<MOSDSubOp *>(op->get_req());
assert(m->get_header().type == MSG_OSD_SUBOP);
dout(7) << "sub_op_scrub_map" << dendl;
@@ -2771,7 +2704,7 @@ void PG::_scan_list(
int r;
__u64 pos = 0;
while ( (r = osd->store->read(coll, poid, pos,
- g_conf->osd_deep_scrub_stride, bl,
+ cct->_conf->osd_deep_scrub_stride, bl,
true)) > 0) {
handle.reset_tp_timeout();
h << bl;
@@ -2805,8 +2738,8 @@ void PG::_scan_list(
assert(iter);
uint64_t keys_scanned = 0;
for (iter->seek_to_first(); iter->valid() ; iter->next()) {
- if (g_conf->osd_scan_list_ping_tp_interval &&
- (keys_scanned % g_conf->osd_scan_list_ping_tp_interval == 0)) {
+ if (cct->_conf->osd_scan_list_ping_tp_interval &&
+ (keys_scanned % cct->_conf->osd_scan_list_ping_tp_interval == 0)) {
handle.reset_tp_timeout();
}
++keys_scanned;
@@ -2871,7 +2804,7 @@ void PG::_request_scrub_map(int replica, eversion_t version,
void PG::sub_op_scrub_reserve(OpRequestRef op)
{
- MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request);
+ MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req());
assert(m->get_header().type == MSG_OSD_SUBOP);
dout(7) << "sub_op_scrub_reserve" << dendl;
@@ -2891,7 +2824,7 @@ void PG::sub_op_scrub_reserve(OpRequestRef op)
void PG::sub_op_scrub_reserve_reply(OpRequestRef op)
{
- MOSDSubOpReply *reply = static_cast<MOSDSubOpReply*>(op->request);
+ MOSDSubOpReply *reply = static_cast<MOSDSubOpReply*>(op->get_req());
assert(reply->get_header().type == MSG_OSD_SUBOPREPLY);
dout(7) << "sub_op_scrub_reserve_reply" << dendl;
@@ -2924,7 +2857,7 @@ void PG::sub_op_scrub_reserve_reply(OpRequestRef op)
void PG::sub_op_scrub_unreserve(OpRequestRef op)
{
- assert(op->request->get_header().type == MSG_OSD_SUBOP);
+ assert(op->get_req()->get_header().type == MSG_OSD_SUBOP);
dout(7) << "sub_op_scrub_unreserve" << dendl;
op->mark_started();
@@ -2936,7 +2869,7 @@ void PG::sub_op_scrub_stop(OpRequestRef op)
{
op->mark_started();
- MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request);
+ MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req());
assert(m->get_header().type == MSG_OSD_SUBOP);
dout(7) << "sub_op_scrub_stop" << dendl;
@@ -2962,7 +2895,7 @@ void PG::schedule_backfill_full_retry()
{
Mutex::Locker lock(osd->backfill_request_lock);
osd->backfill_request_timer.add_event_after(
- g_conf->osd_backfill_retry_interval,
+ cct->_conf->osd_backfill_retry_interval,
new QueuePeeringEvt<RequestBackfill>(
this, get_osdmap()->get_epoch(),
RequestBackfill()));
@@ -3099,9 +3032,9 @@ int PG::build_scrub_map_chunk(
// objects
vector<hobject_t> ls;
- int ret = osd->store->collection_list_range(coll, start, end, 0, &ls);
+ int ret = get_pgbackend()->objects_list_range(start, end, 0, &ls);
if (ret < 0) {
- dout(5) << "collection_list_range error: " << ret << dendl;
+ dout(5) << "objects_list_range error: " << ret << dendl;
return ret;
}
@@ -3621,11 +3554,13 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
hobject_t start = scrubber.start;
while (!boundary_found) {
vector<hobject_t> objects;
- ret = osd->store->collection_list_partial(coll, start,
- g_conf->osd_scrub_chunk_min,
- g_conf->osd_scrub_chunk_max,
- 0,
- &objects, &scrubber.end);
+ ret = get_pgbackend()->objects_list_partial(
+ start,
+ cct->_conf->osd_scrub_chunk_min,
+ cct->_conf->osd_scrub_chunk_max,
+ 0,
+ &objects,
+ &scrubber.end);
assert(ret >= 0);
// in case we don't find a boundary: start again at the end
@@ -4206,7 +4141,7 @@ void PG::scrub_finish()
// finish up
unreg_next_scrub();
- utime_t now = ceph_clock_now(g_ceph_context);
+ utime_t now = ceph_clock_now(cct);
info.history.last_scrub = info.last_update;
info.history.last_scrub_stamp = now;
if (scrubber.deep) {
@@ -4680,7 +4615,7 @@ void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo)
if (last_complete_ondisk.epoch >= info.history.last_epoch_started) {
// DEBUG: verify that the snaps are empty in snap_mapper
- if (g_conf->osd_debug_verify_snaps_on_info) {
+ if (cct->_conf->osd_debug_verify_snaps_on_info) {
interval_set<snapid_t> p;
p.union_of(oinfo.purged_snaps, info.purged_snaps);
p.subtract(info.purged_snaps);
@@ -4797,7 +4732,7 @@ ostream& operator<<(ostream& out, const PG& pg)
bool PG::can_discard_op(OpRequestRef op)
{
- MOSDOp *m = static_cast<MOSDOp*>(op->request);
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
if (OSD::op_is_discardable(m)) {
dout(20) << " discard " << *m << dendl;
return true;
@@ -4825,7 +4760,7 @@ bool PG::can_discard_op(OpRequestRef op)
template<typename T, int MSGTYPE>
bool PG::can_discard_replica_op(OpRequestRef op)
{
- T *m = static_cast<T *>(op->request);
+ T *m = static_cast<T *>(op->get_req());
assert(m->get_header().type == MSGTYPE);
// same pg?
@@ -4841,7 +4776,7 @@ bool PG::can_discard_replica_op(OpRequestRef op)
bool PG::can_discard_scan(OpRequestRef op)
{
- MOSDPGScan *m = static_cast<MOSDPGScan *>(op->request);
+ MOSDPGScan *m = static_cast<MOSDPGScan *>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_SCAN);
if (old_peering_msg(m->map_epoch, m->query_epoch)) {
@@ -4853,7 +4788,7 @@ bool PG::can_discard_scan(OpRequestRef op)
bool PG::can_discard_backfill(OpRequestRef op)
{
- MOSDPGBackfill *m = static_cast<MOSDPGBackfill *>(op->request);
+ MOSDPGBackfill *m = static_cast<MOSDPGBackfill *>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_BACKFILL);
if (old_peering_msg(m->map_epoch, m->query_epoch)) {
@@ -4867,7 +4802,7 @@ bool PG::can_discard_backfill(OpRequestRef op)
bool PG::can_discard_request(OpRequestRef op)
{
- switch (op->request->get_type()) {
+ switch (op->get_req()->get_type()) {
case CEPH_MSG_OSD_OP:
return can_discard_op(op);
case MSG_OSD_SUBOP:
@@ -4892,55 +4827,55 @@ bool PG::can_discard_request(OpRequestRef op)
bool PG::split_request(OpRequestRef op, unsigned match, unsigned bits)
{
unsigned mask = ~((~0)<<bits);
- switch (op->request->get_type()) {
+ switch (op->get_req()->get_type()) {
case CEPH_MSG_OSD_OP:
- return (static_cast<MOSDOp*>(op->request)->get_pg().m_seed & mask) == match;
+ return (static_cast<MOSDOp*>(op->get_req())->get_pg().m_seed & mask) == match;
}
return false;
}
bool PG::op_must_wait_for_map(OSDMapRef curmap, OpRequestRef op)
{
- switch (op->request->get_type()) {
+ switch (op->get_req()->get_type()) {
case CEPH_MSG_OSD_OP:
return !have_same_or_newer_map(
curmap,
- static_cast<MOSDOp*>(op->request)->get_map_epoch());
+ static_cast<MOSDOp*>(op->get_req())->get_map_epoch());
case MSG_OSD_SUBOP:
return !have_same_or_newer_map(
curmap,
- static_cast<MOSDSubOp*>(op->request)->map_epoch);
+ static_cast<MOSDSubOp*>(op->get_req())->map_epoch);
case MSG_OSD_SUBOPREPLY:
return !have_same_or_newer_map(
curmap,
- static_cast<MOSDSubOpReply*>(op->request)->map_epoch);
+ static_cast<MOSDSubOpReply*>(op->get_req())->map_epoch);
case MSG_OSD_PG_SCAN:
return !have_same_or_newer_map(
curmap,
- static_cast<MOSDPGScan*>(op->request)->map_epoch);
+ static_cast<MOSDPGScan*>(op->get_req())->map_epoch);
case MSG_OSD_PG_BACKFILL:
return !have_same_or_newer_map(
curmap,
- static_cast<MOSDPGBackfill*>(op->request)->map_epoch);
+ static_cast<MOSDPGBackfill*>(op->get_req())->map_epoch);
case MSG_OSD_PG_PUSH:
return !have_same_or_newer_map(
curmap,
- static_cast<MOSDPGPush*>(op->request)->map_epoch);
+ static_cast<MOSDPGPush*>(op->get_req())->map_epoch);
case MSG_OSD_PG_PULL:
return !have_same_or_newer_map(
curmap,
- static_cast<MOSDPGPull*>(op->request)->map_epoch);
+ static_cast<MOSDPGPull*>(op->get_req())->map_epoch);
case MSG_OSD_PG_PUSH_REPLY:
return !have_same_or_newer_map(
curmap,
- static_cast<MOSDPGPushReply*>(op->request)->map_epoch);
+ static_cast<MOSDPGPushReply*>(op->get_req())->map_epoch);
}
assert(0);
return false;
@@ -5055,7 +4990,7 @@ void PG::handle_activate_map(RecoveryCtx *rctx)
ActMap evt;
recovery_state.handle_event(evt, rctx);
if (osdmap_ref->get_epoch() - last_persisted_osdmap_ref->get_epoch() >
- g_conf->osd_pg_epoch_persisted_max_stale) {
+ cct->_conf->osd_pg_epoch_persisted_max_stale) {
dout(20) << __func__ << ": Dirtying info: last_persisted is "
<< last_persisted_osdmap_ref->get_epoch()
<< " while current is " << osdmap_ref->get_epoch() << dendl;
@@ -5109,9 +5044,9 @@ std::ostream& operator<<(std::ostream& oss,
/*------Crashed-------*/
PG::RecoveryState::Crashed::Crashed(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Crashed")
{
- state_name = "Crashed";
context< RecoveryMachine >().log_enter(state_name);
assert(0 == "we got a bad state machine event");
}
@@ -5119,9 +5054,9 @@ PG::RecoveryState::Crashed::Crashed(my_context ctx)
/*------Initial-------*/
PG::RecoveryState::Initial::Initial(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Initial")
{
- state_name = "Initial";
context< RecoveryMachine >().log_enter(state_name);
}
@@ -5164,15 +5099,15 @@ void PG::RecoveryState::Initial::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_initial_latency, dur);
}
/*------Started-------*/
PG::RecoveryState::Started::Started(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started")
{
- state_name = "Started";
context< RecoveryMachine >().log_enter(state_name);
}
@@ -5213,15 +5148,15 @@ void PG::RecoveryState::Started::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_started_latency, dur);
}
/*--------Reset---------*/
PG::RecoveryState::Reset::Reset(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Reset")
{
- state_name = "Reset";
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
pg->flushed = false;
@@ -5288,15 +5223,15 @@ void PG::RecoveryState::Reset::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_reset_latency, dur);
}
/*-------Start---------*/
PG::RecoveryState::Start::Start(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Start")
{
- state_name = "Start";
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -5313,15 +5248,15 @@ void PG::RecoveryState::Start::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_start_latency, dur);
}
/*---------Primary--------*/
PG::RecoveryState::Primary::Primary(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary")
{
- state_name = "Started/Primary";
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
assert(pg->want_acting.empty());
@@ -5362,15 +5297,16 @@ void PG::RecoveryState::Primary::exit()
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
pg->want_acting.clear();
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_primary_latency, dur);
}
/*---------Peering--------*/
PG::RecoveryState::Peering::Peering(my_context ctx)
- : my_base(ctx), flushed(false)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Peering"),
+ flushed(false)
{
- state_name = "Started/Primary/Peering";
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -5450,16 +5386,16 @@ void PG::RecoveryState::Peering::exit()
pg->state_clear(PG_STATE_PEERING);
pg->clear_probe_targets();
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_peering_latency, dur);
}
/*------Backfilling-------*/
PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/Backfilling")
{
- state_name = "Started/Primary/Active/Backfilling";
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
pg->backfill_reserved = true;
@@ -5488,16 +5424,16 @@ void PG::RecoveryState::Backfilling::exit()
pg->backfill_reserved = false;
pg->backfill_reserving = false;
pg->state_clear(PG_STATE_BACKFILL);
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
}
/*--WaitRemoteBackfillReserved--*/
PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/WaitRemoteBackfillReserved")
{
- state_name = "Started/Primary/Active/WaitRemoteBackfillReserved";
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
pg->state_set(PG_STATE_BACKFILL_WAIT);
@@ -5523,7 +5459,7 @@ void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur);
}
@@ -5550,9 +5486,9 @@ PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationReje
/*--WaitLocalBackfillReserved--*/
PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/WaitLocalBackfillReserved")
{
- state_name = "Started/Primary/Active/WaitLocalBackfillReserved";
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
pg->state_set(PG_STATE_BACKFILL_WAIT);
@@ -5568,15 +5504,15 @@ void PG::RecoveryState::WaitLocalBackfillReserved::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_waitlocalbackfillreserved_latency, dur);
}
/*----NotBackfilling------*/
PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/NotBackfilling")
{
- state_name = "Started/Primary/Active/NotBackfilling";
context< RecoveryMachine >().log_enter(state_name);
}
@@ -5584,15 +5520,15 @@ void PG::RecoveryState::NotBackfilling::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur);
}
/*---RepNotRecovering----*/
PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/ReplicaActive/RepNotRecovering")
{
- state_name = "Started/ReplicaActive/RepNotRecovering";
context< RecoveryMachine >().log_enter(state_name);
}
@@ -5600,15 +5536,15 @@ void PG::RecoveryState::RepNotRecovering::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_repnotrecovering_latency, dur);
}
/*---RepWaitRecoveryReserved--*/
PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/ReplicaActive/RepWaitRecoveryReserved")
{
- state_name = "Started/ReplicaActive/RepWaitRecoveryReserved";
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -5637,15 +5573,15 @@ void PG::RecoveryState::RepWaitRecoveryReserved::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_repwaitrecoveryreserved_latency, dur);
}
/*-RepWaitBackfillReserved*/
PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/ReplicaActive/RepWaitBackfillReserved")
{
- state_name = "Started/ReplicaActive/RepWaitBackfillReserved";
context< RecoveryMachine >().log_enter(state_name);
}
@@ -5656,7 +5592,7 @@ PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
double ratio, max_ratio;
if (pg->osd->too_full_for_backfill(&ratio, &max_ratio) &&
- !g_conf->osd_debug_skip_full_check_in_backfill_reservation) {
+ !pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation) {
dout(10) << "backfill reservation rejected: full ratio is "
<< ratio << ", which is greater than max allowed ratio "
<< max_ratio << dendl;
@@ -5675,7 +5611,7 @@ void PG::RecoveryState::RepWaitBackfillReserved::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_repwaitbackfillreserved_latency, dur);
}
@@ -5703,9 +5639,9 @@ PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteReservationRejecte
/*---RepRecovering-------*/
PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/ReplicaActive/RepRecovering")
{
- state_name = "Started/ReplicaActive/RepRecovering";
context< RecoveryMachine >().log_enter(state_name);
}
@@ -5722,15 +5658,15 @@ void PG::RecoveryState::RepRecovering::exit()
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_RepRecovering_latency, dur);
}
/*------Activating--------*/
PG::RecoveryState::Activating::Activating(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/Activating")
{
- state_name = "Started/Primary/Active/Activating";
context< RecoveryMachine >().log_enter(state_name);
}
@@ -5738,14 +5674,14 @@ void PG::RecoveryState::Activating::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_activating_latency, dur);
}
PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/WaitLocalRecoveryReserved")
{
- state_name = "Started/Primary/Active/WaitLocalRecoveryReserved";
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
pg->state_set(PG_STATE_RECOVERY_WAIT);
@@ -5760,15 +5696,15 @@ void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_waitlocalrecoveryreserved_latency, dur);
}
PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
: my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
acting_osd_it(context< Active >().sorted_acting_set.begin())
{
- state_name = "Started/Primary/Active/WaitRemoteRecoveryReserved";
context< RecoveryMachine >().log_enter(state_name);
post_event(RemoteRecoveryReserved());
}
@@ -5807,14 +5743,14 @@ void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_waitremoterecoveryreserved_latency, dur);
}
PG::RecoveryState::Recovering::Recovering(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/Recovering")
{
- state_name = "Started/Primary/Active/Recovering";
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -5870,16 +5806,16 @@ void PG::RecoveryState::Recovering::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_recovering_latency, dur);
}
PG::RecoveryState::Recovered::Recovered(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/Recovered")
{
int newest_update_osd;
- state_name = "Started/Primary/Active/Recovered";
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -5904,14 +5840,14 @@ void PG::RecoveryState::Recovered::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_recovered_latency, dur);
}
PG::RecoveryState::Clean::Clean(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/Clean")
{
- state_name = "Started/Primary/Active/Clean";
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -5932,18 +5868,18 @@ void PG::RecoveryState::Clean::exit()
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
pg->state_clear(PG_STATE_CLEAN);
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur);
}
/*---------Active---------*/
PG::RecoveryState::Active::Active(my_context ctx)
: my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active"),
sorted_acting_set(context< RecoveryMachine >().pg->acting.begin(),
context< RecoveryMachine >().pg->acting.end()),
all_replicas_activated(false)
{
- state_name = "Started/Primary/Active";
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -5997,7 +5933,7 @@ boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
}
// if we haven't reported our PG stats in a long time, do so now.
- if (pg->info.stats.reported_epoch + g_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
+ if (pg->info.stats.reported_epoch + pg->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
dout(20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - pg->info.stats.reported_epoch)
<< " epochs" << dendl;
pg->publish_stats_to_osd();
@@ -6018,13 +5954,13 @@ boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
}
- if (g_conf->osd_check_for_log_corruption)
+ if (pg->cct->_conf->osd_check_for_log_corruption)
pg->check_log_for_corruption(pg->osd->store);
int unfound = pg->pg_log.get_missing().num_missing() - pg->missing_loc.size();
if (unfound > 0 &&
pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) {
- if (g_conf->osd_auto_mark_unfound_lost) {
+ if (pg->cct->_conf->osd_auto_mark_unfound_lost) {
pg->osd->clog.error() << pg->info.pgid << " has " << unfound
<< " objects unfound and apparently lost, would automatically marking lost but NOT IMPLEMENTED\n";
//pg->mark_all_unfound_lost(*context< RecoveryMachine >().get_cur_transaction());
@@ -6183,16 +6119,15 @@ void PG::RecoveryState::Active::exit()
pg->state_clear(PG_STATE_BACKFILL_WAIT);
pg->state_clear(PG_STATE_RECOVERY_WAIT);
pg->state_clear(PG_STATE_REPLAY);
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_active_latency, dur);
}
/*------ReplicaActive-----*/
PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/ReplicaActive")
{
- state_name = "Started/ReplicaActive";
-
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -6276,14 +6211,15 @@ void PG::RecoveryState::ReplicaActive::exit()
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
}
/*-------Stray---*/
PG::RecoveryState::Stray::Stray(my_context ctx)
- : my_base(ctx) {
- state_name = "Started/Stray";
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/Stray")
+{
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -6378,15 +6314,15 @@ void PG::RecoveryState::Stray::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur);
}
/*--------GetInfo---------*/
PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Peering/GetInfo")
{
- state_name = "Started/Primary/Peering/GetInfo";
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -6553,15 +6489,16 @@ void PG::RecoveryState::GetInfo::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur);
}
/*------GetLog------------*/
-PG::RecoveryState::GetLog::GetLog(my_context ctx) :
- my_base(ctx), newest_update_osd(-1), msg(0)
+PG::RecoveryState::GetLog::GetLog(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Peering/GetLog"),
+ newest_update_osd(-1), msg(0)
{
- state_name = "Started/Primary/Peering/GetLog";
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -6668,15 +6605,15 @@ void PG::RecoveryState::GetLog::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur);
}
/*------WaitActingChange--------*/
PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Peering/WaitActingChange")
{
- state_name = "Started/Primary/Peering/WaitActingChange";
context< RecoveryMachine >().log_enter(state_name);
}
@@ -6728,15 +6665,15 @@ void PG::RecoveryState::WaitActingChange::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur);
}
/*------Incomplete--------*/
PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Peering/Incomplete")
{
- state_name = "Started/Primary/Peering/Incomplete";
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -6765,15 +6702,15 @@ void PG::RecoveryState::Incomplete::exit()
PG *pg = context< RecoveryMachine >().pg;
pg->state_clear(PG_STATE_INCOMPLETE);
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur);
}
/*------GetMissing--------*/
PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Peering/GetMissing")
{
- state_name = "Started/Primary/Peering/GetMissing";
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -6890,15 +6827,15 @@ void PG::RecoveryState::GetMissing::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur);
}
/*---WaitFlushedPeering---*/
PG::RecoveryState::WaitFlushedPeering::WaitFlushedPeering(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Peering/WaitFlushedPeering")
{
- state_name = "Started/Primary/Peering/WaitFlushedPeering";
PG *pg = context< RecoveryMachine >().pg;
context< RecoveryMachine >().log_enter(state_name);
if (context< RecoveryMachine >().pg->flushed)
@@ -6926,9 +6863,9 @@ PG::RecoveryState::WaitFlushedPeering::react(const QueryState &q)
/*------WaitUpThru--------*/
PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx)
- : my_base(ctx)
+ : my_base(ctx),
+ NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Peering/WaitUpThru")
{
- state_name = "Started/Primary/Peering/WaitUpThru";
context< RecoveryMachine >().log_enter(state_name);
}
@@ -6971,7 +6908,7 @@ void PG::RecoveryState::WaitUpThru::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur);
}
@@ -6987,9 +6924,9 @@ void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name)
void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_t enter_time)
{
- utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+ utime_t dur = ceph_clock_now(pg->cct) - enter_time;
dout(5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
- pg->osd->pg_recovery_stats.log_exit(state_name, ceph_clock_now(g_ceph_context) - enter_time,
+ pg->osd->pg_recovery_stats.log_exit(state_name, ceph_clock_now(pg->cct) - enter_time,
event_count, event_time);
event_count = 0;
event_time = utime_t();
@@ -7186,6 +7123,22 @@ bool PG::PriorSet::affected_by_map(const OSDMapRef osdmap, const PG *debug_pg) c
return false;
}
+void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) { // install new_ctx as the active recovery context
+ assert(!rctx); // any previous handle must already have been closed (end_handle() resets rctx to 0)
+ rctx = new_ctx;
+ if (rctx) // new_ctx may be null; only a real context gets a start stamp
+ rctx->start_time = ceph_clock_now(pg->cct);
+}
+
+void PG::RecoveryState::end_handle() { // close the handle opened by start_handle()
+ if (rctx) {
+ utime_t dur = ceph_clock_now(pg->cct) - rctx->start_time; // elapsed since start_handle() stamped start_time
+ machine.event_time += dur;
+ }
+ machine.event_count++; // counted even when no context was installed
+ rctx = 0;
+}
+
void intrusive_ptr_add_ref(PG *pg) { pg->get("intptr"); }
void intrusive_ptr_release(PG *pg) { pg->put("intptr"); }
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 720ce67bca3..9b42ff4272b 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -46,7 +46,9 @@
#include "common/cmdparse.h"
#include "common/tracked_int_ptr.hpp"
#include "common/WorkQueue.h"
+#include "common/ceph_context.h"
#include "include/str_list.h"
+#include "PGBackend.h"
#include <list>
#include <memory>
@@ -189,8 +191,11 @@ public:
/*** PG ****/
protected:
OSDService *osd;
+ CephContext *cct;
OSDriver osdriver;
SnapMapper snap_mapper;
+
+ virtual PGBackend *get_pgbackend() = 0;
public:
void update_snap_mapper_bits(uint32_t bits) {
snap_mapper.update_bits(bits);
@@ -383,7 +388,9 @@ public:
const char *state_name;
utime_t enter_time;
const char *get_state_name() { return state_name; }
- NamedState() : state_name(0), enter_time(ceph_clock_now(g_ceph_context)) {}
+ NamedState(CephContext *cct_, const char *state_name_) // state_name_ is stored as-is (pointer, not copied)
+ : state_name(state_name_),
+ enter_time(ceph_clock_now(cct_)) {} // entry timestamp is read from cct_'s clock at construction
virtual ~NamedState() {}
};
@@ -435,14 +442,14 @@ protected:
*/
struct BackfillInterval {
// info about a backfill interval on a peer
+ eversion_t version; /// version at which the scan occurred
map<hobject_t,eversion_t> objects;
hobject_t begin;
hobject_t end;
/// clear content
void clear() {
- objects.clear();
- begin = end = hobject_t();
+ *this = BackfillInterval();
}
void reset(hobject_t start) {
@@ -519,7 +526,8 @@ protected:
list<OpRequestRef> waiting_for_active;
list<OpRequestRef> waiting_for_all_missing;
map<hobject_t, list<OpRequestRef> > waiting_for_missing_object,
- waiting_for_degraded_object;
+ waiting_for_degraded_object,
+ waiting_for_blocked_object;
// Callbacks should assume pg (and nothing else) is locked
map<hobject_t, list<Context*> > callbacks_for_degraded_object;
map<eversion_t,list<OpRequestRef> > waiting_for_ack, waiting_for_ondisk;
@@ -865,8 +873,12 @@ public:
virtual void _scrub(ScrubMap &map) { }
virtual void _scrub_clear_state() { }
virtual void _scrub_finish() { }
- virtual coll_t get_temp_coll() = 0;
- virtual bool have_temp_coll() = 0;
+ virtual void get_colls(list<coll_t> *out) = 0;
+ virtual void split_colls(
+ pg_t child,
+ int split_bits,
+ int seed,
+ ObjectStore::Transaction *t) = 0;
virtual bool _report_snap_collection_errors(
const hobject_t &hoid,
const map<string, bufferptr> &attrs,
@@ -1060,21 +1072,8 @@ public:
/* Encapsulates PG recovery process */
class RecoveryState {
- void start_handle(RecoveryCtx *new_ctx) {
- assert(!rctx);
- rctx = new_ctx;
- if (rctx)
- rctx->start_time = ceph_clock_now(g_ceph_context);
- }
-
- void end_handle() {
- if (rctx) {
- utime_t dur = ceph_clock_now(g_ceph_context) - rctx->start_time;
- machine.event_time += dur;
- }
- machine.event_count++;
- rctx = 0;
- }
+ void start_handle(RecoveryCtx *new_ctx);
+ void end_handle();
/* States */
struct Initial;
@@ -1797,10 +1796,10 @@ public:
// abstract bits
- void do_request(
+ virtual void do_request(
OpRequestRef op,
ThreadPool::TPHandle &handle
- );
+ ) = 0;
virtual void do_op(OpRequestRef op) = 0;
virtual void do_sub_op(OpRequestRef op) = 0;
@@ -1810,9 +1809,6 @@ public:
ThreadPool::TPHandle &handle
) = 0;
virtual void do_backfill(OpRequestRef op) = 0;
- virtual void do_push(OpRequestRef op) = 0;
- virtual void do_pull(OpRequestRef op) = 0;
- virtual void do_push_reply(OpRequestRef op) = 0;
virtual void snap_trimmer() = 0;
virtual int do_command(cmdmap_t cmdmap, ostream& ss,
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
new file mode 100644
index 00000000000..408c589a08a
--- /dev/null
+++ b/src/osd/PGBackend.h
@@ -0,0 +1,230 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef PGBACKEND_H
+#define PGBACKEND_H
+
+#include "osd_types.h"
+#include "include/Context.h"
+#include <string>
+
+ /**
+ * PGBackend
+ *
+ * PGBackend defines an interface for logic handling IO and
+ * replication on RADOS objects. The PGBackend implementation
+ * is responsible for:
+ *
+ * 1) Handling client operations
+ * 2) Handling object recovery
+ * 3) Handling object access
+ */
+ class PGBackend {
+ public:
+ /**
+ * Provides interfaces for PGBackend callbacks
+ *
+ * The intention is that the parent calls into the PGBackend
+ * implementation holding a lock and that the callbacks are
+ * called under the same locks.
+ */
+ class Listener {
+ public:
+ /// Recovery
+
+ virtual void on_local_recover_start(
+ const hobject_t &oid,
+ ObjectStore::Transaction *t) = 0;
+ /**
+ * Called with the transaction recovering oid
+ */
+ virtual void on_local_recover(
+ const hobject_t &oid,
+ const object_stat_sum_t &stat_diff,
+ const ObjectRecoveryInfo &recovery_info,
+ ObjectContextRef obc,
+ ObjectStore::Transaction *t
+ ) = 0;
+
+ /**
+ * Called when transaction recovering oid is durable and
+ * applied on all replicas
+ */
+ virtual void on_global_recover(const hobject_t &oid) = 0;
+
+ /**
+ * Called when peer is recovered
+ */
+ virtual void on_peer_recover(
+ int peer,
+ const hobject_t &oid,
+ const ObjectRecoveryInfo &recovery_info,
+ const object_stat_sum_t &stat
+ ) = 0;
+
+ virtual void begin_peer_recover(
+ int peer,
+ const hobject_t oid) = 0; // NOTE(review): oid is taken by value here, by reference in the sibling callbacks — confirm intended
+
+ virtual void failed_push(int from, const hobject_t &soid) = 0;
+
+
+ virtual void cancel_pull(const hobject_t &soid) = 0;
+
+ /**
+ * Bless a context
+ *
+ * Wraps a context in whatever outer layers the parent usually
+ * uses to call into the PGBackend
+ */
+ virtual Context *bless_context(Context *c) = 0;
+ virtual GenContext<ThreadPool::TPHandle&> *bless_gencontext(
+ GenContext<ThreadPool::TPHandle&> *c) = 0;
+
+ virtual void send_message(int to_osd, Message *m) = 0;
+ virtual void queue_transaction(ObjectStore::Transaction *t) = 0;
+ virtual epoch_t get_epoch() = 0;
+ virtual const vector<int> &get_acting() = 0;
+ virtual std::string gen_dbg_prefix() const = 0; // used by PGBackend::gen_prefix() below
+
+ virtual const map<hobject_t, set<int> > &get_missing_loc() = 0;
+ virtual const map<int, pg_missing_t> &get_peer_missing() = 0;
+ virtual const map<int, pg_info_t> &get_peer_info() = 0;
+ virtual const pg_missing_t &get_local_missing() = 0;
+ virtual const PGLog &get_log() = 0;
+ virtual bool pgb_is_primary() const = 0; // forwarded by PGBackend::is_primary()
+ virtual OSDMapRef pgb_get_osdmap() const = 0; // forwarded by PGBackend::get_osdmap()
+ virtual const pg_info_t &get_info() const = 0;
+
+ virtual ObjectContextRef get_obc(
+ const hobject_t &hoid,
+ map<string, bufferptr> &attrs) = 0;
+
+ virtual ~Listener() {}
+ };
+ Listener *parent; ///< callback sink supplied to the constructor
+ Listener *get_parent() const { return parent; }
+ PGBackend(Listener *l) : parent(l) {}
+ bool is_primary() const { return get_parent()->pgb_is_primary(); }
+ OSDMapRef get_osdmap() const { return get_parent()->pgb_get_osdmap(); }
+ const pg_info_t &get_info() { return get_parent()->get_info(); } // NOTE(review): not const, unlike the two accessors above
+
+ std::string gen_prefix() const {
+ return parent->gen_dbg_prefix();
+ }
+
+ /**
+ * RecoveryHandle
+ *
+ * We may want to recover multiple objects in the same set of
+ * messages. RecoveryHandle is an interface for the opaque
+ * object used by the implementation to store the details of
+ * the pending recovery operations.
+ */
+ struct RecoveryHandle {
+ virtual ~RecoveryHandle() {}
+ };
+
+ /// Get a fresh recovery operation
+ virtual RecoveryHandle *open_recovery_op() = 0;
+
+ /// run_recovery_op: finish the operation represented by h
+ virtual void run_recovery_op(
+ RecoveryHandle *h, ///< [in] op to finish
+ int priority ///< [in] msg priority
+ ) = 0;
+
+ /**
+ * recover_object
+ *
+ * Triggers a recovery operation on the specified hobject_t
+ * onreadable must be called before onwriteable
+ *
+ * On each replica (primary included), get_parent()->on_not_missing()
+ * must be called when the transaction finalizing the recovery
+ * is queued. Similarly, get_parent()->on_readable() must be called
+ * when the transaction is applied in the backing store.
+ *
+ * get_parent()->on_not_degraded() should be called on the primary
+ * when writes can resume on the object.
+ *
+ * obc may be NULL if the primary lacks the object.
+ *
+ * head may be NULL only if the head/snapdir is missing
+ * NOTE(review): the two @param entries below are not in this signature, and the callbacks named above are not declared in Listener here — confirm and update.
+ * @param missing [in] set of info, missing pairs for queried nodes
+ * @param overlaps [in] mapping of object to file offset overlaps
+ */
+ virtual void recover_object(
+ const hobject_t &hoid, ///< [in] object to recover
+ ObjectContextRef head, ///< [in] context of the head/snapdir object
+ ObjectContextRef obc, ///< [in] context of the object
+ RecoveryHandle *h ///< [in,out] handle to attach recovery op to
+ ) = 0;
+
+ /// gives PGBackend a crack at an incoming message
+ virtual bool handle_message(
+ OpRequestRef op ///< [in] message received
+ ) = 0; ///< @return true if the message was handled
+
+ virtual void check_recovery_sources(const OSDMapRef osdmap) = 0;
+
+ /**
+ * implementation should clear itself, contexts blessed prior to on_change
+ * won't be called after on_change()
+ */
+ virtual void on_change(ObjectStore::Transaction *t) = 0;
+ virtual void clear_state() = 0;
+
+ virtual void on_flushed() = 0;
+
+
+ virtual void split_colls(
+ pg_t child,
+ int split_bits,
+ int seed,
+ ObjectStore::Transaction *t) = 0;
+
+ virtual void temp_colls(list<coll_t> *out) = 0;
+
+ virtual void dump_recovery_info(Formatter *f) const = 0;
+
+ virtual coll_t get_temp_coll(ObjectStore::Transaction *t) = 0;
+ virtual void add_temp_obj(const hobject_t &oid) = 0;
+ virtual void clear_temp_obj(const hobject_t &oid) = 0;
+
+ virtual ~PGBackend() {}
+
+ /// List objects in collection
+ virtual int objects_list_partial(
+ const hobject_t &begin,
+ int min,
+ int max,
+ snapid_t seq,
+ vector<hobject_t> *ls,
+ hobject_t *next) = 0;
+
+ virtual int objects_list_range(
+ const hobject_t &start,
+ const hobject_t &end,
+ snapid_t seq,
+ vector<hobject_t> *ls) = 0;
+
+ virtual int objects_get_attr(
+ const hobject_t &hoid,
+ const string &attr,
+ bufferlist *out) = 0;
+ };
+
+#endif
diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc
index dac1f33fd91..1949c96fd57 100644
--- a/src/osd/PGLog.cc
+++ b/src/osd/PGLog.cc
@@ -52,19 +52,15 @@ void PGLog::IndexedLog::split_into(
if (log.empty())
tail = head;
- else
- head = log.rbegin()->version;
if (olog->empty())
olog->tail = olog->head;
- else
- olog->head = olog->log.rbegin()->version;
olog->index();
index();
}
-void PGLog::IndexedLog::trim(eversion_t s)
+void PGLog::IndexedLog::trim(eversion_t s, set<eversion_t> *trimmed)
{
if (complete_to != log.end() &&
complete_to->version <= s) {
@@ -77,6 +73,8 @@ void PGLog::IndexedLog::trim(eversion_t s)
if (e.version > s)
break;
generic_dout(20) << "trim " << e << dendl;
+ if (trimmed)
+ trimmed->insert(e.version);
unindex(e); // remove from index,
log.pop_front(); // from log
}
@@ -142,14 +140,8 @@ void PGLog::trim(eversion_t trim_to, pg_info_t &info)
assert(trim_to <= info.last_complete);
dout(10) << "trim " << log << " to " << trim_to << dendl;
- log.trim(trim_to);
+ log.trim(trim_to, &trimmed);
info.log_tail = log.tail;
-
- if (log.log.empty()) {
- mark_dirty_to(eversion_t::max());
- } else {
- mark_dirty_to(log.log.front().version);
- }
}
}
@@ -514,6 +506,7 @@ void PGLog::merge_log(ObjectStore::Transaction& t,
log.index();
info.last_update = log.head = olog.head;
+ info.last_user_version = oinfo.last_user_version;
info.purged_snaps = oinfo.purged_snaps;
// process divergent items
@@ -541,13 +534,18 @@ void PGLog::write_log(
<< "dirty_to: " << dirty_to
<< ", dirty_from: " << dirty_from
<< ", dirty_divergent_priors: " << dirty_divergent_priors
+ << ", writeout_from: " << writeout_from
+ << ", trimmed: " << trimmed
<< dendl;
- _write_log(t, log, log_oid, divergent_priors,
- dirty_to,
- dirty_from,
- dirty_divergent_priors,
- !touched_log,
- &log_keys_debug);
+ _write_log(
+ t, log, log_oid, divergent_priors,
+ dirty_to,
+ dirty_from,
+ writeout_from,
+ trimmed,
+ dirty_divergent_priors,
+ !touched_log,
+ (pg_log_debug ? &log_keys_debug : 0));
undirty();
} else {
dout(10) << "log is not dirty" << dendl;
@@ -557,8 +555,11 @@ void PGLog::write_log(
void PGLog::write_log(ObjectStore::Transaction& t, pg_log_t &log,
const hobject_t &log_oid, map<eversion_t, hobject_t> &divergent_priors)
{
- _write_log(t, log, log_oid, divergent_priors, eversion_t::max(), eversion_t(),
- true, true, 0);
+ _write_log(
+ t, log, log_oid,
+ divergent_priors, eversion_t::max(), eversion_t(), eversion_t(),
+ set<eversion_t>(),
+ true, true, 0);
}
void PGLog::_write_log(
@@ -566,11 +567,24 @@ void PGLog::_write_log(
const hobject_t &log_oid, map<eversion_t, hobject_t> &divergent_priors,
eversion_t dirty_to,
eversion_t dirty_from,
+ eversion_t writeout_from,
+ const set<eversion_t> &trimmed,
bool dirty_divergent_priors,
bool touch_log,
set<string> *log_keys_debug
)
{
+ set<string> to_remove;
+ for (set<eversion_t>::const_iterator i = trimmed.begin();
+ i != trimmed.end();
+ ++i) {
+ to_remove.insert(i->get_key_name());
+ if (log_keys_debug) {
+ assert(log_keys_debug->count(i->get_key_name()));
+ log_keys_debug->erase(i->get_key_name());
+ }
+ }
+
//dout(10) << "write_log, clearing up to " << dirty_to << dendl;
if (touch_log)
t.touch(coll_t(), log_oid);
@@ -598,7 +612,8 @@ void PGLog::_write_log(
}
for (list<pg_log_entry_t>::reverse_iterator p = log.log.rbegin();
- p != log.log.rend() && p->version >= dirty_from &&
+ p != log.log.rend() &&
+ (p->version >= dirty_from || p->version >= writeout_from) &&
p->version >= dirty_to;
++p) {
bufferlist bl(sizeof(*p) * 2);
@@ -620,6 +635,7 @@ void PGLog::_write_log(
::encode(divergent_priors, keys["divergent_priors"]);
}
+ t.omap_rmkeys(coll_t::META_COLL, log_oid, to_remove);
t.omap_setkeys(coll_t::META_COLL, log_oid, keys);
}
@@ -762,10 +778,6 @@ void PGLog::read_log_old(ObjectStore *store, coll_t coll, hobject_t log_oid,
log.tail = info.log_tail;
- // In case of sobject_t based encoding, may need to list objects in the store
- // to find hashes
- vector<hobject_t> ls;
-
if (ondisklog_head > 0) {
// read
bufferlist bl;
@@ -783,7 +795,6 @@ void PGLog::read_log_old(ObjectStore *store, coll_t coll, hobject_t log_oid,
assert(log.empty());
eversion_t last;
bool reorder = false;
- bool listed_collection = false;
while (!p.end()) {
uint64_t pos = ondisklog_tail + p.get_off();
@@ -826,29 +837,7 @@ void PGLog::read_log_old(ObjectStore *store, coll_t coll, hobject_t log_oid,
<< e.version << " after " << last << "\n";
}
- if (e.invalid_hash) {
- // We need to find the object in the store to get the hash
- if (!listed_collection) {
- store->collection_list(coll, ls);
- listed_collection = true;
- }
- bool found = false;
- for (vector<hobject_t>::iterator i = ls.begin();
- i != ls.end();
- ++i) {
- if (i->oid == e.soid.oid && i->snap == e.soid.snap) {
- e.soid = *i;
- found = true;
- break;
- }
- }
- if (!found) {
- // Didn't find the correct hash
- std::ostringstream oss;
- oss << "Could not find hash for hoid " << e.soid << std::endl;
- throw read_log_error(oss.str().c_str());
- }
- }
+ assert(!e.invalid_hash);
if (e.invalid_pool) {
e.soid.pool = info.pgid.pool();
diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h
index 552f9b0cee9..792191bea2a 100644
--- a/src/osd/PGLog.h
+++ b/src/osd/PGLog.h
@@ -84,11 +84,11 @@ struct PGLog {
bool logged_req(const osd_reqid_t &r) const {
return caller_ops.count(r);
}
- eversion_t get_request_version(const osd_reqid_t &r) const {
+ const pg_log_entry_t *get_request(const osd_reqid_t &r) const {
hash_map<osd_reqid_t,pg_log_entry_t*>::const_iterator p = caller_ops.find(r);
if (p == caller_ops.end())
- return eversion_t();
- return p->second->version;
+ return NULL;
+ return p->second;
}
void index() {
@@ -142,7 +142,7 @@ struct PGLog {
caller_ops[e.reqid] = &(log.back());
}
- void trim(eversion_t s);
+ void trim(eversion_t s, set<eversion_t> *trimmed);
ostream& print(ostream& out) const;
};
@@ -150,6 +150,7 @@ struct PGLog {
protected:
//////////////////// data members ////////////////////
+ bool pg_log_debug;
map<eversion_t, hobject_t> divergent_priors;
pg_missing_t missing;
@@ -157,8 +158,10 @@ protected:
/// Log is clean on [dirty_to, dirty_from)
bool touched_log;
- eversion_t dirty_to;
- eversion_t dirty_from;
+ eversion_t dirty_to; ///< must clear/writeout all keys up to dirty_to
+ eversion_t dirty_from; ///< must clear/writeout all keys past dirty_from
+ eversion_t writeout_from; ///< must writeout keys past writeout_from
+ set<eversion_t> trimmed; ///< must clear keys in trimmed
bool dirty_divergent_priors;
CephContext *cct;
@@ -166,7 +169,9 @@ protected:
return !touched_log ||
(dirty_to != eversion_t()) ||
(dirty_from != eversion_t::max()) ||
- dirty_divergent_priors;
+ dirty_divergent_priors ||
+ (writeout_from != eversion_t::max()) ||
+ !(trimmed.empty());
}
void mark_dirty_to(eversion_t to) {
if (to > dirty_to)
@@ -176,6 +181,10 @@ protected:
if (from < dirty_from)
dirty_from = from;
}
+ void mark_writeout_from(eversion_t from) { // lowers writeout_from to `from`; never raises it
+ if (from < writeout_from)
+ writeout_from = from;
+ }
void add_divergent_prior(eversion_t version, hobject_t obj) {
divergent_priors.insert(make_pair(version, obj));
dirty_divergent_priors = true;
@@ -205,11 +214,9 @@ protected:
log_keys_debug->erase(i++));
}
void check() {
- assert(log.log.size() == log_keys_debug.size());
- if (cct &&
- !(cct->_conf->osd_debug_pg_log_writeout)) {
+ if (!pg_log_debug)
return;
- }
+ assert(log.log.size() == log_keys_debug.size());
for (list<pg_log_entry_t>::iterator i = log.log.begin();
i != log.log.end();
++i) {
@@ -222,11 +229,15 @@ protected:
dirty_from = eversion_t::max();
dirty_divergent_priors = false;
touched_log = true;
+ trimmed.clear();
+ writeout_from = eversion_t::max();
check();
}
public:
PGLog(CephContext *cct = 0) :
+ pg_log_debug(!(cct && !(cct->_conf->osd_debug_pg_log_writeout))),
touched_log(false), dirty_from(eversion_t::max()),
+ writeout_from(eversion_t::max()),
dirty_divergent_priors(false), cct(cct) {}
@@ -281,7 +292,7 @@ public:
void unindex() { log.unindex(); }
void add(pg_log_entry_t& e) {
- mark_dirty_from(e.version);
+ mark_writeout_from(e.version);
log.add(e);
}
@@ -374,6 +385,8 @@ public:
const hobject_t &log_oid, map<eversion_t, hobject_t> &divergent_priors,
eversion_t dirty_to,
eversion_t dirty_from,
+ eversion_t writeout_from,
+ const set<eversion_t> &trimmed,
bool dirty_divergent_priors,
bool touch_log,
set<string> *log_keys_debug
@@ -381,8 +394,10 @@ public:
bool read_log(ObjectStore *store, coll_t coll, hobject_t log_oid,
const pg_info_t &info, ostringstream &oss) {
- return read_log(store, coll, log_oid, info, divergent_priors,
- log, missing, oss, &log_keys_debug);
+ return read_log(
+ store, coll, log_oid, info, divergent_priors,
+ log, missing, oss,
+ (pg_log_debug ? &log_keys_debug : 0));
}
/// return true if the log should be rewritten
diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc
new file mode 100644
index 00000000000..9529e15ae77
--- /dev/null
+++ b/src/osd/ReplicatedBackend.cc
@@ -0,0 +1,268 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "ReplicatedBackend.h"
+#include "messages/MOSDSubOp.h"
+#include "messages/MOSDSubOpReply.h"
+#include "messages/MOSDPGPush.h"
+#include "messages/MOSDPGPull.h"
+#include "messages/MOSDPGPushReply.h"
+
+#define dout_subsys ceph_subsys_osd
+#define DOUT_PREFIX_ARGS this
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+static ostream& _prefix(std::ostream *_dout, ReplicatedBackend *pgb) { // invoked through the dout_prefix macro defined just above
+ return *_dout << pgb->get_parent()->gen_dbg_prefix(); // the prefix text itself comes from the Listener
+}
+
+ReplicatedBackend::ReplicatedBackend(
+ PGBackend::Listener *pg, coll_t coll, OSDService *osd) :
+ PGBackend(pg), temp_created(false), // temp collection is only created later, by get_temp_coll(t)
+ temp_coll(coll_t::make_temp_coll(pg->get_info().pgid)), // temp collection id is derived from the listener's pgid
+ coll(coll), osd(osd), cct(osd->cct) {} // cct is taken from the OSDService; osd must be non-null
+
+void ReplicatedBackend::run_recovery_op(
+ PGBackend::RecoveryHandle *_h,
+ int priority)
+{
+ RPGHandle *h = static_cast<RPGHandle *>(_h); // NOTE(review): unchecked downcast; assumes _h came from open_recovery_op() — confirm callers
+ send_pushes(priority, h->pushes);
+ send_pulls(priority, h->pulls);
+ delete h; // the handle is consumed here; callers must not touch it afterwards
+}
+
+void ReplicatedBackend::recover_object(
+  const hobject_t &hoid,
+  ObjectContextRef head,
+  ObjectContextRef obc,
+  RecoveryHandle *_h
+  )
+{
+  dout(10) << __func__ << ": " << hoid << dendl;
+  RPGHandle *rpg_handle = static_cast<RPGHandle *>(_h);
+
+  // Missing locally: prepare a pull via the handle and stop.
+  const bool need_pull =
+    get_parent()->get_local_missing().is_missing(hoid);
+  if (need_pull) {
+    // A locally missing object must not come with an object context.
+    assert(!obc);
+    prepare_pull(hoid, head, rpg_handle);
+    return;
+  }
+
+  // Present locally: start pushes for it; the assert below requires
+  // that at least one was started.
+  assert(obc);
+  int started = start_pushes(hoid, obc, rpg_handle);
+  assert(started > 0);
+}
+
+void ReplicatedBackend::check_recovery_sources(const OSDMapRef osdmap)
+{
+  // Cancel every pull whose source OSD is marked down in osdmap.
+  map<int, set<hobject_t> >::iterator peer = pull_from_peer.begin();
+  while (peer != pull_from_peer.end()) {
+    if (!osdmap->is_down(peer->first)) {
+      ++peer;
+      continue;
+    }
+    dout(10) << "check_recovery_sources resetting pulls from osd." << peer->first
+	     << ", osdmap has it marked down" << dendl;
+    set<hobject_t> &objs = peer->second;
+    for (set<hobject_t>::iterator o = objs.begin(); o != objs.end(); ++o) {
+      assert(pulling.count(*o) == 1);
+      get_parent()->cancel_pull(*o);
+      pulling.erase(*o);
+    }
+    // Post-increment so the iterator moves on before its node is erased.
+    pull_from_peer.erase(peer++);
+  }
+}
+
+bool ReplicatedBackend::handle_message(
+ OpRequestRef op
+ )
+{ // returns true only when op was dispatched to one of the push/pull handlers below
+ dout(10) << __func__ << ": " << op << dendl;
+ switch (op->get_req()->get_type()) {
+ case MSG_OSD_PG_PUSH:
+ // TODOXXX: needs to be active possibly
+ do_push(op);
+ return true;
+
+ case MSG_OSD_PG_PULL:
+ do_pull(op);
+ return true;
+
+ case MSG_OSD_PG_PUSH_REPLY:
+ do_push_reply(op);
+ return true;
+
+ case MSG_OSD_SUBOP: { // only the first op of the sub-op message is inspected
+ MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req());
+ if (m->ops.size() >= 1) {
+ OSDOp *first = &m->ops[0];
+ switch (first->op.op) {
+ case CEPH_OSD_OP_PULL:
+ sub_op_pull(op);
+ return true;
+ case CEPH_OSD_OP_PUSH:
+ // TODOXXX: needs to be active possibly
+ sub_op_push(op);
+ return true;
+ default:
+ break; // any other sub-op is not handled here; falls out to return false
+ }
+ }
+ break;
+ }
+
+ case MSG_OSD_SUBOPREPLY: { // likewise, only the first op of the reply is inspected
+ MOSDSubOpReply *r = static_cast<MOSDSubOpReply*>(op->get_req());
+ if (r->ops.size() >= 1) {
+ OSDOp &first = r->ops[0];
+ switch (first.op.op) {
+ case CEPH_OSD_OP_PUSH:
+ // continue peer recovery
+ sub_op_push_reply(op);
+ return true;
+ }
+ }
+ break;
+ }
+
+ default:
+ break;
+ }
+ return false; // not a message this backend handles
+}
+
+void ReplicatedBackend::clear_state()
+{ // empties the three recovery-tracking containers; temp_contents is not touched here
+ // clear pushing/pulling maps
+ pushing.clear();
+ pulling.clear();
+ pull_from_peer.clear();
+}
+
+void ReplicatedBackend::on_change(ObjectStore::Transaction *t)
+{ // queues removal of every tracked temp object on t, then resets recovery state
+ dout(10) << __func__ << dendl;
+ // clear temp
+ for (set<hobject_t>::iterator i = temp_contents.begin();
+ i != temp_contents.end();
+ ++i) {
+ dout(10) << __func__ << ": Removing oid "
+ << *i << " from the temp collection" << dendl;
+ t->remove(get_temp_coll(t), *i); // get_temp_coll(t) may queue creation of the collection on its first call
+ }
+ temp_contents.clear();
+ clear_state(); // also drops pushing / pulling / pull_from_peer
+}
+
+coll_t ReplicatedBackend::get_temp_coll(ObjectStore::Transaction *t)
+{
+  if (!temp_created) { // first call: queue creation on t unless the store already has it
+    if (!osd->store->collection_exists(temp_coll))
+      t->create_collection(temp_coll);
+    temp_created = true;
+  }
+  return temp_coll;
+}
+
+void ReplicatedBackend::on_flushed()
+{ // sanity check: if the temp collection was created, it must be empty at this point
+ if (have_temp_coll() &&
+ !osd->store->collection_empty(get_temp_coll())) {
+ vector<hobject_t> objects;
+ osd->store->collection_list(get_temp_coll(), objects); // listed only so the error message can name the leftovers
+ derr << __func__ << ": found objects in the temp collection: "
+ << objects << ", crashing now"
+ << dendl;
+ assert(0 == "found garbage in the temp collection"); // deliberate abort; the string only labels the failure
+ }
+}
+
+
+int ReplicatedBackend::objects_list_partial(
+ const hobject_t &begin,
+ int min,
+ int max,
+ snapid_t seq,
+ vector<hobject_t> *ls,
+ hobject_t *next)
+{ // forwards to the store's collection_list_partial on coll and converts the results to hobject_t
+ vector<ghobject_t> objects;
+ ghobject_t _next;
+ int r = osd->store->collection_list_partial(
+ coll,
+ begin,
+ min,
+ max,
+ seq,
+ &objects,
+ &_next);
+ ls->reserve(objects.size());
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ assert(i->is_degenerate()); // only the hobj part is kept below — presumably nothing else may be set; confirm is_degenerate() semantics
+ ls->push_back(i->hobj);
+ }
+ assert(_next.is_degenerate());
+ *next = _next.hobj;
+ return r; // NOTE(review): r is not checked before ls and *next are filled
+}
+
+int ReplicatedBackend::objects_list_range(
+ const hobject_t &start,
+ const hobject_t &end,
+ snapid_t seq,
+ vector<hobject_t> *ls)
+{ // forwards to the store's collection_list_range on coll and converts the results to hobject_t
+ vector<ghobject_t> objects;
+ int r = osd->store->collection_list_range(
+ coll,
+ start,
+ end,
+ seq,
+ &objects);
+ ls->reserve(objects.size());
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ assert(i->is_degenerate()); // only the hobj part is kept below — presumably nothing else may be set; confirm is_degenerate() semantics
+ ls->push_back(i->hobj);
+ }
+ return r; // NOTE(review): r is not checked before ls is filled
+}
+
+int ReplicatedBackend::objects_get_attr(
+  const hobject_t &hoid,
+  const string &attr,
+  bufferlist *out)
+{
+  bufferptr value;
+  const int rc = osd->store->getattr(coll, hoid, attr.c_str(), value);
+
+  // On failure, or when the caller passed no buffer, leave *out alone
+  // and just report the store's result.
+  if (rc < 0 || !out)
+    return rc;
+  // Replace any previous content of *out with the attribute value.
+  out->clear();
+  out->push_back(value);
+  return rc;
+}
diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h
new file mode 100644
index 00000000000..cc5f060e136
--- /dev/null
+++ b/src/osd/ReplicatedBackend.h
@@ -0,0 +1,329 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef REPBACKEND_H
+#define REPBACKEND_H
+
+#include "OSD.h"
+#include "PGBackend.h"
+#include "osd_types.h"
+
+struct C_ReplicatedBackend_OnPullComplete;
+class ReplicatedBackend : public PGBackend { // PGBackend implementation holding push/pull recovery state and a temp collection
+ struct RPGHandle : public PGBackend::RecoveryHandle { // recovery handle that batches the ops queued for one run_recovery_op() call
+ map<int, vector<PushOp> > pushes; // NOTE(review): int key is presumably the peer osd id -- confirm
+ map<int, vector<PullOp> > pulls; // NOTE(review): int key is presumably the peer osd id -- confirm
+ };
+ friend struct C_ReplicatedBackend_OnPullComplete;
+private:
+ bool temp_created; // reported by have_temp_coll(); gates temp_colls() and split_colls()
+ const coll_t temp_coll;
+ coll_t get_temp_coll() const { // const accessor: returns temp_coll without creating anything
+ return temp_coll;
+ }
+ bool have_temp_coll() const { return temp_created; }
+
+ // Track contents of temp collection, clear on reset
+ set<hobject_t> temp_contents;
+public:
+ coll_t coll;
+ OSDService *osd;
+ CephContext *cct;
+
+ ReplicatedBackend(PGBackend::Listener *pg, coll_t coll, OSDService *osd);
+
+ /// @see PGBackend::open_recovery_op
+ RPGHandle *_open_recovery_op() { // returns a newly allocated handle typed as RPGHandle
+ return new RPGHandle();
+ }
+ PGBackend::RecoveryHandle *open_recovery_op() { // same handle, returned through the base-class pointer type
+ return _open_recovery_op();
+ }
+
+ /// @see PGBackend::run_recovery_op
+ void run_recovery_op(
+ PGBackend::RecoveryHandle *h,
+ int priority);
+
+ /// @see PGBackend::recover_object
+ void recover_object(
+ const hobject_t &hoid,
+ ObjectContextRef head,
+ ObjectContextRef obc,
+ RecoveryHandle *h
+ );
+
+ void check_recovery_sources(const OSDMapRef osdmap);
+
+ /// @see PGBackend::handle_message
+ bool handle_message(
+ OpRequestRef op
+ );
+
+ void on_change(ObjectStore::Transaction *t);
+ void clear_state();
+ void on_flushed();
+
+ void temp_colls(list<coll_t> *out) { // appends temp_coll to *out, but only while temp_created is true
+ if (temp_created)
+ out->push_back(temp_coll);
+ }
+ void split_colls( // queues on t the split of temp_coll into the child's temp collection
+ pg_t child,
+ int split_bits,
+ int seed,
+ ObjectStore::Transaction *t) {
+ coll_t target = coll_t::make_temp_coll(child);
+ if (!temp_created) // no temp collection here, so nothing is queued
+ return;
+ t->create_collection(target);
+ t->split_collection(
+ temp_coll,
+ split_bits,
+ seed,
+ target);
+ }
+
+ virtual void dump_recovery_info(Formatter *f) const { // dumps pull_from_peer (with per-object pull info) and pushing to f
+ {
+ f->open_array_section("pull_from_peer");
+ for (map<int, set<hobject_t> >::const_iterator i = pull_from_peer.begin();
+ i != pull_from_peer.end();
+ ++i) {
+ f->open_object_section("pulling_from");
+ f->dump_int("pull_from", i->first);
+ {
+ f->open_array_section("pulls");
+ for (set<hobject_t>::const_iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ f->open_object_section("pull_info");
+ assert(pulling.count(*j)); // every object listed under a peer must also have an entry in pulling
+ pulling.find(*j)->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ {
+ f->open_array_section("pushing");
+ for (map<hobject_t, map<int, PushInfo> >::const_iterator i =
+ pushing.begin();
+ i != pushing.end();
+ ++i) {
+ f->open_object_section("object");
+ f->dump_stream("pushing") << i->first;
+ {
+ f->open_array_section("pushing_to");
+ for (map<int, PushInfo>::const_iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ f->open_object_section("push_progress");
+ f->dump_stream("object_pushing") << j->first; // note: the value written under this key is the inner map's int key
+ {
+ f->open_object_section("push_info");
+ j->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ }
+
+ /// List objects in collection
+ int objects_list_partial(
+ const hobject_t &begin,
+ int min,
+ int max,
+ snapid_t seq,
+ vector<hobject_t> *ls,
+ hobject_t *next);
+
+ int objects_list_range(
+ const hobject_t &start,
+ const hobject_t &end,
+ snapid_t seq,
+ vector<hobject_t> *ls);
+
+ int objects_get_attr(
+ const hobject_t &hoid,
+ const string &attr,
+ bufferlist *out);
+private:
+ // push
+ struct PushInfo { // state kept per (object, int key) entry of `pushing`
+ ObjectRecoveryProgress recovery_progress;
+ ObjectRecoveryInfo recovery_info;
+ ObjectContextRef obc;
+ object_stat_sum_t stat;
+
+ void dump(Formatter *f) const { // dumps recovery_progress and recovery_info only; obc and stat are not emitted
+ {
+ f->open_object_section("recovery_progress");
+ recovery_progress.dump(f);
+ f->close_section();
+ }
+ {
+ f->open_object_section("recovery_info");
+ recovery_info.dump(f);
+ f->close_section();
+ }
+ }
+ };
+ map<hobject_t, map<int, PushInfo> > pushing;
+
+ // pull
+ struct PullInfo { // state kept per object entry of `pulling`
+ ObjectRecoveryProgress recovery_progress;
+ ObjectRecoveryInfo recovery_info;
+ ObjectContextRef head_ctx;
+ ObjectContextRef obc;
+ object_stat_sum_t stat;
+
+ void dump(Formatter *f) const { // dumps recovery_progress and recovery_info only
+ {
+ f->open_object_section("recovery_progress");
+ recovery_progress.dump(f);
+ f->close_section();
+ }
+ {
+ f->open_object_section("recovery_info");
+ recovery_info.dump(f);
+ f->close_section();
+ }
+ }
+
+ bool is_complete() const { // delegates to the progress object, evaluated against recovery_info
+ return recovery_progress.is_complete(recovery_info);
+ }
+ };
+
+ coll_t get_temp_coll(ObjectStore::Transaction *t); // non-const overload taking a transaction; defined out of line
+ void add_temp_obj(const hobject_t &oid) { // records oid in temp_contents
+ temp_contents.insert(oid);
+ }
+ void clear_temp_obj(const hobject_t &oid) { // drops oid from temp_contents
+ temp_contents.erase(oid);
+ }
+
+ map<hobject_t, PullInfo> pulling;
+
+ // Reverse mapping from osd peer to objects being pulled from that peer
+ map<int, set<hobject_t> > pull_from_peer;
+
+ void sub_op_push(OpRequestRef op);
+ void sub_op_push_reply(OpRequestRef op);
+ void sub_op_pull(OpRequestRef op);
+
+ void _do_push(OpRequestRef op);
+ void _do_pull_response(OpRequestRef op);
+ void do_push(OpRequestRef op) { // dispatches on role: is_primary() -> _do_pull_response(), otherwise -> _do_push()
+ if (is_primary()) {
+ _do_pull_response(op);
+ } else {
+ _do_push(op);
+ }
+ }
+ void do_pull(OpRequestRef op);
+ void do_push_reply(OpRequestRef op);
+
+ bool handle_push_reply(int peer, PushReplyOp &op, PushOp *reply);
+ void handle_pull(int peer, PullOp &op, PushOp *reply);
+ bool handle_pull_response(
+ int from, PushOp &op, PullOp *response,
+ list<ObjectContextRef> *to_continue,
+ ObjectStore::Transaction *t);
+ void handle_push(int from, PushOp &op, PushReplyOp *response,
+ ObjectStore::Transaction *t);
+
+ static void trim_pushed_data(const interval_set<uint64_t> &copy_subset,
+ const interval_set<uint64_t> &intervals_received,
+ bufferlist data_received,
+ interval_set<uint64_t> *intervals_usable,
+ bufferlist *data_usable);
+ void _failed_push(int from, const hobject_t &soid);
+
+ void send_pushes(int prio, map<int, vector<PushOp> > &pushes);
+ void prep_push_op_blank(const hobject_t& soid, PushOp *op);
+ int send_push_op_legacy(int priority, int peer,
+ PushOp &pop);
+ int send_pull_legacy(int priority, int peer,
+ const ObjectRecoveryInfo& recovery_info,
+ ObjectRecoveryProgress progress);
+ void send_pulls(
+ int priority,
+ map<int, vector<PullOp> > &pulls);
+
+ int build_push_op(const ObjectRecoveryInfo &recovery_info,
+ const ObjectRecoveryProgress &progress,
+ ObjectRecoveryProgress *out_progress,
+ PushOp *out_op,
+ object_stat_sum_t *stat = 0); // stat is optional; the default is a null pointer
+ void submit_push_data(ObjectRecoveryInfo &recovery_info,
+ bool first,
+ bool complete,
+ const interval_set<uint64_t> &intervals_included,
+ bufferlist data_included,
+ bufferlist omap_header,
+ map<string, bufferptr> &attrs,
+ map<string, bufferlist> &omap_entries,
+ ObjectStore::Transaction *t);
+ void submit_push_complete(ObjectRecoveryInfo &recovery_info,
+ ObjectStore::Transaction *t);
+
+ void calc_clone_subsets(
+ SnapSet& snapset, const hobject_t& poid, const pg_missing_t& missing,
+ const hobject_t &last_backfill,
+ interval_set<uint64_t>& data_subset,
+ map<hobject_t, interval_set<uint64_t> >& clone_subsets);
+ void prepare_pull(
+ const hobject_t& soid,
+ ObjectContextRef headctx,
+ RPGHandle *h);
+ int start_pushes(
+ const hobject_t &soid,
+ ObjectContextRef obj,
+ RPGHandle *h);
+ void prep_push_to_replica(
+ ObjectContextRef obc, const hobject_t& soid, int peer,
+ PushOp *pop);
+ void prep_push(ObjectContextRef obc,
+ const hobject_t& oid, int dest,
+ PushOp *op);
+ void prep_push(ObjectContextRef obc, // overload that additionally takes the version and explicit data/clone subsets
+ const hobject_t& soid, int peer,
+ eversion_t version,
+ interval_set<uint64_t> &data_subset,
+ map<hobject_t, interval_set<uint64_t> >& clone_subsets,
+ PushOp *op);
+ void calc_head_subsets(ObjectContextRef obc, SnapSet& snapset, const hobject_t& head,
+ const pg_missing_t& missing,
+ const hobject_t &last_backfill,
+ interval_set<uint64_t>& data_subset,
+ map<hobject_t, interval_set<uint64_t> >& clone_subsets);
+ ObjectRecoveryInfo recalc_subsets(
+ const ObjectRecoveryInfo& recovery_info,
+ SnapSetContext *ssc
+ );
+};
+
+#endif
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index bca49024c0b..c4dccf68442 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -50,6 +50,9 @@
#include "include/compat.h"
#include "common/cmdparse.h"
+#include "mon/MonClient.h"
+#include "osdc/Objecter.h"
+
#include "json_spirit/json_spirit_value.h"
#include "json_spirit/json_spirit_reader.h"
#include "include/assert.h" // json_spirit clobbers it
@@ -57,8 +60,9 @@
#define dout_subsys ceph_subsys_osd
#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
#undef dout_prefix
-#define dout_prefix _prefix(_dout, this, osd->whoami, get_osdmap())
-static ostream& _prefix(std::ostream *_dout, PG *pg, int whoami, OSDMapRef osdmap) {
+#define dout_prefix _prefix(_dout, this)
+template <typename T> // templated on the pointee: any T providing gen_prefix() can supply the log prefix
+static ostream& _prefix(std::ostream *_dout, T *pg) {
return *_dout << pg->gen_prefix();
}
@@ -76,6 +80,159 @@ PGLSFilter::~PGLSFilter()
{
}
+static void log_subop_stats( // Updates the generic sub-op counters plus the caller-selected counters for one request.
+ OSDService *osd,
+ OpRequestRef op, int tag_inb, int tag_lat) // tag_inb: counter for payload bytes (0 = skip); tag_lat: counter for latency
+{
+ utime_t now = ceph_clock_now(g_ceph_context);
+ utime_t latency = now;
+ latency -= op->get_req()->get_recv_stamp(); // latency = now - time stamped on the message at receive
+
+ uint64_t inb = op->get_req()->get_data().length(); // length of the request's data payload
+
+ osd->logger->inc(l_osd_sop);
+
+ osd->logger->inc(l_osd_sop_inb, inb);
+ osd->logger->tinc(l_osd_sop_lat, latency);
+
+ if (tag_inb) // brace-less if: guards ONLY the inc() on the next line
+ osd->logger->inc(tag_inb, inb);
+ osd->logger->tinc(tag_lat, latency); // unconditional: tag_lat is always updated
+}
+
+// ======================
+// PGBackend::Listener
+
+
+void ReplicatedPG::on_local_recover_start( // Resets local state for oid and queues removal of the existing copy on t.
+ const hobject_t &oid,
+ ObjectStore::Transaction *t)
+{
+ pg_log.revise_have(oid, eversion_t()); // "have" version is reset to a default-constructed eversion_t
+ remove_snap_mapped_object(*t, oid);
+ t->remove(coll, oid); // queue deletion of the object from this PG's collection
+}
+
+void ReplicatedPG::on_local_recover( // Bookkeeping for an object recovered locally; all store work is queued on t.
+ const hobject_t &hoid,
+ const object_stat_sum_t &stat_diff,
+ const ObjectRecoveryInfo &_recovery_info,
+ ObjectContextRef obc, // must be non-null on the primary (asserted below)
+ ObjectStore::Transaction *t
+ )
+{
+ ObjectRecoveryInfo recovery_info(_recovery_info); // local copy: version/oi may be rewritten in the LOST_REVERT branch
+ if (recovery_info.soid.snap < CEPH_NOSNAP) { // NOTE(review): presumably identifies a snapshot clone -- confirm
+ assert(recovery_info.oi.snaps.size());
+ OSDriver::OSTransaction _t(osdriver.get_transaction(t));
+ set<snapid_t> snaps(
+ recovery_info.oi.snaps.begin(),
+ recovery_info.oi.snaps.end());
+ snap_mapper.add_oid( // register the object under each snap listed in its object info
+ recovery_info.soid,
+ snaps,
+ &_t);
+ }
+
+ if (pg_log.get_missing().is_missing(recovery_info.soid) && // still missing, and the needed version is newer than what arrived
+ pg_log.get_missing().missing.find(recovery_info.soid)->second.need > recovery_info.version) {
+ assert(is_primary());
+ const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second; // NOTE(review): find() result is dereferenced without an end() check
+ if (latest->op == pg_log_entry_t::LOST_REVERT &&
+ latest->reverting_to == recovery_info.version) {
+ dout(10) << " got old revert version " << recovery_info.version
+ << " for " << *latest << dendl;
+ recovery_info.version = latest->version;
+ // update the attr to the revert event version
+ recovery_info.oi.prior_version = recovery_info.oi.version;
+ recovery_info.oi.version = latest->version;
+ bufferlist bl;
+ ::encode(recovery_info.oi, bl);
+ t->setattr(coll, recovery_info.soid, OI_ATTR, bl); // rewrite the stored object info with the bumped version
+ }
+ }
+
+ // keep track of active pushes for scrub
+ ++active_pushes;
+
+ recover_got(recovery_info.soid, recovery_info.version);
+
+ if (is_primary()) {
+ info.stats.stats.sum.add(stat_diff);
+
+ assert(obc);
+ obc->obs.exists = true;
+ obc->ondisk_write_lock(); // NOTE(review): presumably released by the C_OSD_OndiskWriteUnlock registered below -- confirm
+ obc->obs.oi = recovery_info.oi; // may have been updated above
+
+
+ t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
+ t->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc));
+
+ publish_stats_to_osd();
+ if (waiting_for_missing_object.count(hoid)) {
+ dout(20) << " kicking waiters on " << hoid << dendl;
+ requeue_ops(waiting_for_missing_object[hoid]);
+ waiting_for_missing_object.erase(hoid);
+ if (pg_log.get_missing().missing.size() == 0) { // nothing is missing any more: also release ops waiting for all objects
+ requeue_ops(waiting_for_all_missing);
+ waiting_for_all_missing.clear();
+ }
+ }
+ } else {
+ t->register_on_applied( // non-primary: only the replica-side applied callback is registered
+ new C_OSD_AppliedRecoveredObjectReplica(this));
+
+ }
+
+ t->register_on_commit( // registered on both primary and replica paths
+ new C_OSD_CommittedPushedObject(
+ this,
+ get_osdmap()->get_epoch(),
+ info.last_complete));
+
+ // update pg
+ dirty_info = true;
+ write_if_dirty(*t);
+
+}
+
+void ReplicatedPG::on_global_recover( // Ends recovery tracking for soid and requeues ops that waited on its degraded state.
+ const hobject_t &soid)
+{
+ publish_stats_to_osd();
+ dout(10) << "pushed " << soid << " to all replicas" << dendl;
+ assert(recovering.count(soid)); // soid must currently be tracked in `recovering`
+ recovering.erase(soid);
+ finish_recovery_op(soid);
+ if (waiting_for_degraded_object.count(soid)) { // count() first so operator[] below cannot insert an empty entry
+ requeue_ops(waiting_for_degraded_object[soid]);
+ waiting_for_degraded_object.erase(soid);
+ }
+ finish_degraded_object(soid);
+}
+
+void ReplicatedPG::on_peer_recover( // Records that `peer` now has soid at recovery_info.version and folds `stat` into PG stats.
+ int peer,
+ const hobject_t &soid,
+ const ObjectRecoveryInfo &recovery_info,
+ const object_stat_sum_t &stat)
+{
+ info.stats.stats.sum.add(stat);
+ publish_stats_to_osd();
+ // done!
+ peer_missing[peer].got(soid, recovery_info.version); // note: operator[] creates the peer's entry if it is absent
+ if (peer == backfill_target && backfills_in_flight.count(soid)) // only the backfill target's completions clear in-flight backfills
+ backfills_in_flight.erase(soid);
+}
+
+void ReplicatedPG::begin_peer_recover( // Resets the version `peer` is recorded as having for soid.
+ int peer,
+ const hobject_t soid) // taken by value (copy), unlike the const-reference parameters of the sibling callbacks
+{
+ peer_missing[peer].revise_have(soid, eversion_t()); // "have" becomes a default-constructed eversion_t
+}
+
// =======================
// pg changes
@@ -114,18 +271,18 @@ void ReplicatedPG::wait_for_missing_object(const hobject_t& soid, OpRequestRef o
assert(g != missing.missing.end());
const eversion_t &v(g->second.need);
- map<hobject_t, PullInfo>::const_iterator p = pulling.find(soid);
- if (p != pulling.end()) {
- dout(7) << "missing " << soid << " v " << v << ", already pulling." << dendl;
+ set<hobject_t>::const_iterator p = recovering.find(soid);
+ if (p != recovering.end()) {
+ dout(7) << "missing " << soid << " v " << v << ", already recovering." << dendl;
}
else if (missing_loc.find(soid) == missing_loc.end()) {
dout(7) << "missing " << soid << " v " << v << ", is unfound." << dendl;
}
else {
- dout(7) << "missing " << soid << " v " << v << ", pulling." << dendl;
- map<int, vector<PullOp> > pulls;
- prepare_pull(soid, v, g_conf->osd_client_op_priority, &pulls);
- send_pulls(g_conf->osd_client_op_priority, pulls);
+ dout(7) << "missing " << soid << " v " << v << ", recovering." << dendl;
+ PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
+ recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
+ pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
}
waiting_for_missing_object[soid].push_back(op);
op->mark_delayed("waiting for missing object");
@@ -162,15 +319,15 @@ void ReplicatedPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef
assert(is_degraded_object(soid));
// we don't have it (yet).
- if (pushing.count(soid)) {
+ if (recovering.count(soid)) {
dout(7) << "degraded "
<< soid
- << ", already pushing"
+ << ", already recovering"
<< dendl;
} else {
dout(7) << "degraded "
<< soid
- << ", pushing"
+ << ", recovering"
<< dendl;
eversion_t v;
for (unsigned i = 1; i < acting.size(); i++) {
@@ -181,14 +338,21 @@ void ReplicatedPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef
break;
}
}
- map<int, vector<PushOp> > pushes;
- prep_object_replica_pushes(soid, v, g_conf->osd_client_op_priority, &pushes);
- send_pushes(g_conf->osd_client_op_priority, pushes);
+ PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
+ prep_object_replica_pushes(soid, v, h);
+ pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
}
waiting_for_degraded_object[soid].push_back(op);
op->mark_delayed("waiting for degraded object");
}
+void ReplicatedPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op) // Parks op on soid's blocked-object wait list.
+{
+ dout(10) << __func__ << " " << soid << " " << op << dendl;
+ waiting_for_blocked_object[soid].push_back(op); // appended at the back of the per-object queue
+ op->mark_delayed("waiting for blocked object");
+}
+
void ReplicatedPG::wait_for_backfill_pos(OpRequestRef op)
{
waiting_for_backfill_pos.push_back(op);
@@ -234,8 +398,10 @@ bool PGLSPlainFilter::filter(bufferlist& xattr_data, bufferlist& outdata)
bool ReplicatedPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata)
{
bufferlist bl;
-
- int ret = osd->store->getattr(coll_t(info.pgid), sobj, filter->get_xattr().c_str(), bl);
+ int ret = pgbackend->objects_get_attr(
+ sobj,
+ filter->get_xattr(),
+ &bl);
dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl;
if (ret < 0)
return false;
@@ -278,14 +444,14 @@ int ReplicatedPG::do_command(cmdmap_t cmdmap, ostream& ss,
string prefix;
string format;
- cmd_getval(g_ceph_context, cmdmap, "format", format);
+ cmd_getval(cct, cmdmap, "format", format);
boost::scoped_ptr<Formatter> f(new_formatter(format));
// demand that we have a formatter
if (!f)
f.reset(new_formatter("json"));
string command;
- cmd_getval(g_ceph_context, cmdmap, "cmd", command);
+ cmd_getval(cct, cmdmap, "cmd", command);
if (command == "query") {
f->open_object_section("pg");
f->dump_string("state", pg_state_string(get_state()));
@@ -312,7 +478,7 @@ int ReplicatedPG::do_command(cmdmap_t cmdmap, ostream& ss,
}
else if (command == "mark_unfound_lost") {
string mulcmd;
- cmd_getval(g_ceph_context, cmdmap, "mulcmd", mulcmd);
+ cmd_getval(cct, cmdmap, "mulcmd", mulcmd);
if (mulcmd != "revert") {
ss << "mode must be 'revert'; mark and delete not yet implemented";
return -EINVAL;
@@ -332,7 +498,7 @@ int ReplicatedPG::do_command(cmdmap_t cmdmap, ostream& ss,
if (!all_unfound_are_queried_or_lost(get_osdmap())) {
ss << "pg has " << unfound
- << " objects but we haven't probed all sources, not marking lost";
+ << " unfound objects but we haven't probed all sources, not marking lost";
return -EINVAL;
}
@@ -343,7 +509,7 @@ int ReplicatedPG::do_command(cmdmap_t cmdmap, ostream& ss,
else if (command == "list_missing") {
hobject_t offset;
string offset_json;
- if (cmd_getval(g_ceph_context, cmdmap, "offset", offset_json)) {
+ if (cmd_getval(cct, cmdmap, "offset", offset_json)) {
json_spirit::Value v;
try {
if (!json_spirit::read(offset_json, v))
@@ -367,7 +533,7 @@ int ReplicatedPG::do_command(cmdmap_t cmdmap, ostream& ss,
f->open_array_section("objects");
int32_t num = 0;
bufferlist bl;
- while (p != missing.missing.end() && num < g_conf->osd_command_max_records) {
+ while (p != missing.missing.end() && num < cct->_conf->osd_command_max_records) {
f->open_object_section("object");
{
f->open_object_section("oid");
@@ -417,7 +583,7 @@ bool ReplicatedPG::pg_op_must_wait(MOSDOp *op)
void ReplicatedPG::do_pg_op(OpRequestRef op)
{
- MOSDOp *m = static_cast<MOSDOp *>(op->request);
+ MOSDOp *m = static_cast<MOSDOp *>(op->get_req());
assert(m->get_header().type == CEPH_MSG_OSD_OP);
dout(10) << "do_pg_op " << *m << dendl;
@@ -457,7 +623,7 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
dout(10) << " pgls pg=" << m->get_pg() << " != " << info.pgid << dendl;
result = 0; // hmm?
} else {
- unsigned list_size = MIN(g_conf->osd_max_pgls, p->op.pgls.count);
+ unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);
dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
// read into a buffer
@@ -475,12 +641,13 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
hobject_t next;
hobject_t current = response.handle;
osr->flush();
- int r = osd->store->collection_list_partial(coll, current,
- list_size,
- list_size,
- snapid,
- &sentries,
- &next);
+ int r = pgbackend->objects_list_partial(
+ current,
+ list_size,
+ list_size,
+ snapid,
+ &sentries,
+ &next);
if (r != 0) {
result = -EINVAL;
break;
@@ -518,13 +685,17 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
if (snapid != CEPH_NOSNAP) {
bufferlist bl;
if (candidate.snap == CEPH_NOSNAP) {
- osd->store->getattr(coll, candidate, SS_ATTR, bl);
+ pgbackend->objects_get_attr(
+ candidate,
+ SS_ATTR,
+ &bl);
SnapSet snapset(bl);
if (snapid <= snapset.seq)
continue;
} else {
bufferlist attr_bl;
- osd->store->getattr(coll, candidate, OI_ATTR, attr_bl);
+ pgbackend->objects_get_attr(
+ candidate, OI_ATTR, &attr_bl);
object_info_t oi(attr_bl);
vector<snapid_t>::iterator i = find(oi.snaps.begin(),
oi.snaps.end(),
@@ -569,6 +740,7 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
reply->set_data(outdata);
reply->set_result(result);
+ reply->set_reply_versions(info.last_update, info.last_user_version);
osd->send_message_osd_client(reply, m->get_connection());
delete filter;
}
@@ -581,14 +753,14 @@ void ReplicatedPG::calc_trim_to()
return;
}
- size_t target = g_conf->osd_min_pg_log_entries;
+ size_t target = cct->_conf->osd_min_pg_log_entries;
if (is_degraded() ||
state_test(PG_STATE_RECOVERING |
PG_STATE_RECOVERY_WAIT |
PG_STATE_BACKFILL |
PG_STATE_BACKFILL_WAIT |
PG_STATE_BACKFILL_TOOFULL)) {
- target = g_conf->osd_max_pg_log_entries;
+ target = cct->_conf->osd_max_pg_log_entries;
}
if (min_last_complete_ondisk != eversion_t() &&
@@ -617,9 +789,10 @@ ReplicatedPG::ReplicatedPG(OSDService *o, OSDMapRef curmap,
const PGPool &_pool, pg_t p, const hobject_t& oid,
const hobject_t& ioid) :
PG(o, curmap, _pool, p, oid, ioid),
+ pgbackend(new ReplicatedBackend(this, coll_t(p), o)),
snapset_contexts_lock("ReplicatedPG::snapset_contexts"),
- temp_created(false),
- temp_coll(coll_t::make_temp_coll(p)), snap_trimmer_machine(this)
+ temp_seq(0),
+ snap_trimmer_machine(this)
{
snap_trimmer_machine.initiate();
}
@@ -631,13 +804,69 @@ void ReplicatedPG::get_src_oloc(const object_t& oid, const object_locator_t& olo
src_oloc.key = oid.name;
}
+void ReplicatedPG::do_request( // Entry point for one queued request: gate checks, backend first, then dispatch by message type.
+ OpRequestRef op,
+ ThreadPool::TPHandle &handle) // only forwarded to do_scan()
+{
+ // do any pending flush
+ do_pending_flush();
+
+ if (!op_has_sufficient_caps(op)) { // insufficient caps: reply -EPERM and stop
+ osd->reply_op_error(op, -EPERM);
+ return;
+ }
+ assert(!op_must_wait_for_map(get_osdmap(), op)); // precondition: the op must not still need to wait for a map
+ if (can_discard_request(op)) { // discardable requests are dropped here without a reply
+ return;
+ }
+ if (!flushed) {
+ dout(20) << " !flushed, waiting for active on " << op << dendl;
+ waiting_for_active.push_back(op);
+ return;
+ }
+
+ if (pgbackend->handle_message(op)) // backend gets first refusal; true means it consumed the message
+ return;
+
+ switch (op->get_req()->get_type()) {
+ case CEPH_MSG_OSD_OP:
+ if (is_replay() || !is_active()) { // note: the "replay" log text below is also printed for the !is_active() case
+ dout(20) << " replay, waiting for active on " << op << dendl;
+ waiting_for_active.push_back(op);
+ return;
+ }
+ do_op(op); // do it now
+ break;
+
+ case MSG_OSD_SUBOP:
+ do_sub_op(op);
+ break;
+
+ case MSG_OSD_SUBOPREPLY:
+ do_sub_op_reply(op);
+ break;
+
+ case MSG_OSD_PG_SCAN:
+ do_scan(op, handle);
+ break;
+
+ case MSG_OSD_PG_BACKFILL:
+ do_backfill(op);
+ break;
+
+ default:
+ assert(0 == "bad message type in do_request"); // always-false comparison: abort on any type not handled above
+ }
+}
+
+
/** do_op - do an op
* pg lock will be held (if multithreaded)
* osd_lock NOT held.
*/
void ReplicatedPG::do_op(OpRequestRef op)
{
- MOSDOp *m = static_cast<MOSDOp*>(op->request);
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
assert(m->get_header().type == CEPH_MSG_OSD_OP);
if (op->includes_pg_op()) {
if (pg_op_must_wait(m)) {
@@ -647,13 +876,21 @@ void ReplicatedPG::do_op(OpRequestRef op)
return do_pg_op(op);
}
- dout(10) << "do_op " << *m << (op->may_write() ? " may_write" : "") << dendl;
+ // order this op as a write?
+ bool write_ordered = op->may_write() || (m->get_flags() & CEPH_OSD_FLAG_RWORDERED);
+
+ dout(10) << "do_op " << *m
+ << (op->may_write() ? " may_write" : "")
+ << (op->may_read() ? " may_read" : "")
+ << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
+ << dendl;
hobject_t head(m->get_oid(), m->get_object_locator().key,
CEPH_NOSNAP, m->get_pg().ps(),
info.pgid.pool(), m->get_object_locator().nspace);
- if (op->may_write() && scrubber.write_blocked_by_scrub(head)) {
+
+ if (write_ordered && scrubber.write_blocked_by_scrub(head)) {
dout(20) << __func__ << ": waiting for scrub" << dendl;
waiting_for_active.push_back(op);
op->mark_delayed("waiting for scrub");
@@ -667,7 +904,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
}
// degraded object?
- if (op->may_write() && is_degraded_object(head)) {
+ if (write_ordered && is_degraded_object(head)) {
wait_for_degraded_object(head, op);
return;
}
@@ -687,7 +924,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
}
// degraded object?
- if (op->may_write() && is_degraded_object(snapdir)) {
+ if (write_ordered && is_degraded_object(snapdir)) {
wait_for_degraded_object(snapdir, op);
return;
}
@@ -709,27 +946,32 @@ void ReplicatedPG::do_op(OpRequestRef op)
m->get_object_locator().get_pool(),
m->get_object_locator().nspace),
&obc, can_create, &snapid);
- if (r) {
- if (r == -EAGAIN) {
- // If we're not the primary of this OSD, and we have
- // CEPH_OSD_FLAG_LOCALIZE_READS set, we just return -EAGAIN. Otherwise,
- // we have to wait for the object.
- if (is_primary() ||
- (!(m->get_flags() & CEPH_OSD_FLAG_BALANCE_READS) &&
- !(m->get_flags() & CEPH_OSD_FLAG_LOCALIZE_READS))) {
- // missing the specific snap we need; requeue and wait.
- assert(!can_create); // only happens on a read
- hobject_t soid(m->get_oid(), m->get_object_locator().key,
- snapid, m->get_pg().ps(),
- info.pgid.pool(), m->get_object_locator().nspace);
- wait_for_missing_object(soid, op);
- return;
- }
+
+ if (r == -EAGAIN) {
+ // If we're not the primary of this OSD, and we have
+ // CEPH_OSD_FLAG_LOCALIZE_READS set, we just return -EAGAIN. Otherwise,
+ // we have to wait for the object.
+ if (is_primary() ||
+ (!(m->get_flags() & CEPH_OSD_FLAG_BALANCE_READS) &&
+ !(m->get_flags() & CEPH_OSD_FLAG_LOCALIZE_READS))) {
+ // missing the specific snap we need; requeue and wait.
+ assert(!can_create); // only happens on a read
+ hobject_t soid(m->get_oid(), m->get_object_locator().key,
+ snapid, m->get_pg().ps(),
+ info.pgid.pool(), m->get_object_locator().nspace);
+ wait_for_missing_object(soid, op);
+ return;
}
+ }
+
+ if (maybe_handle_cache(op, obc, r))
+ return;
+
+ if (r) {
osd->reply_op_error(op, r);
return;
}
-
+
// make sure locator is consistent
object_locator_t oloc(obc->obs.oi.soid);
if (m->get_object_locator() != oloc) {
@@ -740,20 +982,14 @@ void ReplicatedPG::do_op(OpRequestRef op)
<< " op " << *m << "\n";
}
- if ((op->may_read()) && (obc->obs.oi.lost)) {
- // This object is lost. Reading from it returns an error.
- dout(20) << __func__ << ": object " << obc->obs.oi.soid
- << " is lost" << dendl;
- osd->reply_op_error(op, -ENFILE);
+ // io blocked on obc?
+ if (obc->is_blocked()) {
+ wait_for_blocked_object(obc->obs.oi.soid, op);
return;
}
+
dout(25) << __func__ << ": object " << obc->obs.oi.soid
<< " has oi of " << obc->obs.oi << dendl;
-
- if (!op->may_write() && !obc->obs.exists) {
- osd->reply_op_error(op, -ENOENT);
- return;
- }
// are writes blocked by another object?
if (obc->blocked_by) {
@@ -807,6 +1043,8 @@ void ReplicatedPG::do_op(OpRequestRef op)
wait_for_missing_object(wait_oid, op);
} else if (r) {
osd->reply_op_error(op, r);
+ } else if (sobc->obs.oi.is_whiteout()) {
+ osd->reply_op_error(op, -ENOENT);
} else {
if (sobc->obs.oi.soid.get_key() != obc->obs.oi.soid.get_key() &&
sobc->obs.oi.soid.get_key() != obc->obs.oi.soid.oid.name &&
@@ -836,7 +1074,6 @@ void ReplicatedPG::do_op(OpRequestRef op)
dout(10) << "no src oid specified for multi op " << osd_op << dendl;
osd->reply_op_error(op, -EINVAL);
}
- src_obc.clear();
return;
}
@@ -862,12 +1099,13 @@ void ReplicatedPG::do_op(OpRequestRef op)
wait_for_missing_object(wait_oid, op);
} else if (r) {
osd->reply_op_error(op, r);
+ } else if (sobc->obs.oi.is_whiteout()) {
+ osd->reply_op_error(op, -ENOENT);
} else {
dout(10) << " clone_oid " << clone_oid << " obc " << sobc << dendl;
src_obc[clone_oid] = sobc;
continue;
}
- src_obc.clear();
return;
} else {
continue;
@@ -875,48 +1113,114 @@ void ReplicatedPG::do_op(OpRequestRef op)
}
}
- op->mark_started();
-
- const hobject_t& soid = obc->obs.oi.soid;
OpContext *ctx = new OpContext(op, m->get_reqid(), m->ops,
&obc->obs, obc->ssc,
this);
+ if (!get_rw_locks(ctx)) {
+ op->mark_delayed("waiting for rw locks");
+ close_op_ctx(ctx);
+ return;
+ }
+
+ if ((op->may_read()) && (obc->obs.oi.is_lost())) {
+ // This object is lost. Reading from it returns an error.
+ dout(20) << __func__ << ": object " << obc->obs.oi.soid
+ << " is lost" << dendl;
+ close_op_ctx(ctx);
+ osd->reply_op_error(op, -ENFILE);
+ return;
+ }
+ if (!op->may_write() && (!obc->obs.exists ||
+ obc->obs.oi.is_whiteout())) {
+ close_op_ctx(ctx);
+ osd->reply_op_error(op, -ENOENT);
+ return;
+ }
+
+ op->mark_started();
ctx->obc = obc;
ctx->src_obc = src_obc;
- if (op->may_write()) {
- // snap
- if (pool.info.is_pool_snaps_mode()) {
- // use pool's snapc
- ctx->snapc = pool.snapc;
+ execute_ctx(ctx);
+}
+
+bool ReplicatedPG::maybe_handle_cache(OpRequestRef op, ObjectContextRef obc, // Returns true when the op was answered with a redirect, false when the caller should continue.
+ int r) // r: result code the caller obtained for obc; consulted only in READONLY mode
+{
+ switch(pool.info.cache_mode) {
+ case pg_pool_t::CACHEMODE_NONE: // no cache mode: never redirect
+ return false;
+ break; // unreachable after return
+ case pg_pool_t::CACHEMODE_WRITEBACK: // redirect only when no object context is held
+ if (obc.get()) {
+ return false;
+ } else {
+ do_cache_redirect(op, obc);
+ return true;
+ }
+ break; // unreachable: both branches return
+ case pg_pool_t::CACHEMODE_INVALIDATE_FORWARD: // always redirect
+ do_cache_redirect(op, obc);
+ return true;
+ break; // unreachable after return
+ case pg_pool_t::CACHEMODE_READONLY: // handle locally only when an object context is held and r == 0
+ if (obc.get() && !r) {
+ return false;
+ } else {
+ do_cache_redirect(op, obc);
+ return true;
+ }
+ break; // unreachable: both branches return
+ default:
+ assert(0); // any other cache_mode value aborts
+ }
+ return false; // reached only if the assert above is compiled out
+}
+
+void ReplicatedPG::do_cache_redirect(OpRequestRef op, ObjectContextRef obc) // Replies to op with -ENOENT plus a redirect to pool.info.tier_of; obc is not used in the body.
+{
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
+ int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK); // keep only the ACK/ONDISK bits of the request flags
+ MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT,
+ get_osdmap()->get_epoch(), flags);
+ request_redirect_t redir(m->get_object_locator(), pool.info.tier_of); // built from the request's own locator and the tier_of pool
+ reply->set_redirect(redir);
+ dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
+ << op << dendl;
+ m->get_connection()->get_messenger()->send_message(reply, m->get_connection()); // sent via the connection's messenger, unlike the osd->send_message_osd_client() calls used for other replies in this file
+ return;
+}
+
+void ReplicatedPG::execute_ctx(OpContext *ctx)
+{
+ dout(10) << __func__ << " " << ctx << dendl;
+ OpRequestRef op = ctx->op;
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
+ ObjectContextRef obc = ctx->obc;
+ const hobject_t& soid = obc->obs.oi.soid;
+ map<hobject_t,ObjectContextRef>& src_obc = ctx->src_obc;
+
+ // this method must be idempotent since we may call it several times
+ // before we finally apply the resulting transaction.
+ ctx->op_t = ObjectStore::Transaction();
+ ctx->local_t = ObjectStore::Transaction();
- eversion_t oldv = pg_log.get_log().get_request_version(ctx->reqid);
- if (oldv != eversion_t()) {
+ // dup/replay?
+ if (op->may_write()) {
+ const pg_log_entry_t *entry = pg_log.get_log().get_request(ctx->reqid);
+ if (entry) {
+ const eversion_t& oldv = entry->version;
dout(3) << "do_op dup " << ctx->reqid << " was " << oldv << dendl;
- delete ctx;
- src_obc.clear();
if (already_complete(oldv)) {
- osd->reply_op_error(op, 0, oldv);
+ reply_ctx(ctx, 0, oldv, entry->user_version);
} else {
+ close_op_ctx(ctx);
+
if (m->wants_ack()) {
if (already_ack(oldv)) {
MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0);
reply->add_flags(CEPH_OSD_FLAG_ACK);
+ reply->set_reply_versions(oldv, entry->user_version);
osd->send_message_osd_client(reply, m->get_connection());
} else {
dout(10) << " waiting for " << oldv << " to ack" << dendl;
@@ -932,6 +1236,24 @@ void ReplicatedPG::do_op(OpRequestRef op)
op->mark_started();
+ // snap
+ if (pool.info.is_pool_snaps_mode()) {
+ // use pool's snapc
+ ctx->snapc = pool.snapc;
+ } else {
+ // client specified snapc
+ ctx->snapc.seq = m->get_snap_seq();
+ ctx->snapc.snaps = m->get_snaps();
+ }
+ if ((m->get_flags() & CEPH_OSD_FLAG_ORDERSNAP) &&
+ ctx->snapc.seq < obc->ssc->snapset.seq) {
+ dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
+ << " < snapset seq " << obc->ssc->snapset.seq
+ << " on " << obc->obs.oi.soid << dendl;
+ reply_ctx(ctx, -EOLDSNAPC);
+ return;
+ }
+
// version
ctx->at_version = pg_log.get_head();
@@ -941,7 +1263,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
assert(ctx->at_version > pg_log.get_head());
ctx->mtime = m->get_mtime();
-
+
dout(10) << "do_op " << soid << " " << ctx->ops
<< " ov " << obc->obs.oi.version << " av " << ctx->at_version
<< " snapc " << ctx->snapc
@@ -953,14 +1275,10 @@ void ReplicatedPG::do_op(OpRequestRef op)
<< dendl;
}
- // note my stats
- utime_t now = ceph_clock_now(g_ceph_context);
+ ctx->user_at_version = obc->obs.oi.user_version;
- // note some basic context for op replication that prepare_transaction may clobber
- eversion_t old_last_update = pg_log.get_head();
- bool old_exists = obc->obs.exists;
- uint64_t old_size = obc->obs.oi.size;
- eversion_t old_version = obc->obs.oi.version;
+ // note my stats
+ utime_t now = ceph_clock_now(cct);
if (op->may_read()) {
dout(10) << " taking ondisk_read_lock" << dendl;
@@ -982,19 +1300,21 @@ void ReplicatedPG::do_op(OpRequestRef op)
p->second->ondisk_read_unlock();
}
+ if (result == -EINPROGRESS) {
+ // come back later.
+ return;
+ }
+
if (result == -EAGAIN) {
// clean up after the ctx
- delete ctx;
- src_obc.clear();
+ close_op_ctx(ctx);
return;
}
// check for full
if (ctx->delta_stats.num_bytes > 0 &&
pool.info.get_flags() & pg_pool_t::FLAG_FULL) {
- delete ctx;
- src_obc.clear();
- osd->reply_op_error(op, -ENOSPC);
+ reply_ctx(ctx, -ENOSPC);
return;
}
@@ -1021,27 +1341,30 @@ void ReplicatedPG::do_op(OpRequestRef op)
}
ctx->reply->set_result(result);
- if (result >= 0)
- ctx->reply->set_version(ctx->reply_version);
- else if (result == -ENOENT)
- ctx->reply->set_version(info.last_update);
-
// read or error?
if (ctx->op_t.empty() || result < 0) {
+ MOSDOpReply *reply = ctx->reply;
+ ctx->reply = NULL;
+
if (result >= 0) {
log_op_stats(ctx);
publish_stats_to_osd();
+
+ // on read, return the current object version
+ reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
+ } else if (result == -ENOENT) {
+ // on ENOENT, set a floor for what the next user version will be.
+ reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
}
- MOSDOpReply *reply = ctx->reply;
- ctx->reply = NULL;
reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
osd->send_message_osd_client(reply, m->get_connection());
- delete ctx;
- src_obc.clear();
+ close_op_ctx(ctx);
return;
}
+ ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
+
assert(op->may_write());
// trim log?
@@ -1050,7 +1373,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
append_log(ctx->log, pg_trim_to, ctx->local_t);
// verify that we are doing this in order?
- if (g_conf->osd_debug_op_order && m->get_source().is_client()) {
+ if (cct->_conf->osd_debug_op_order && m->get_source().is_client()) {
map<client_t,tid_t>& cm = debug_op_order[obc->obs.oi.soid];
tid_t t = m->get_tid();
client_t n = m->get_source().num();
@@ -1075,26 +1398,37 @@ void ReplicatedPG::do_op(OpRequestRef op)
repop->src_obc.swap(src_obc); // and src_obc.
- issue_repop(repop, now, old_last_update, old_exists, old_size, old_version);
+ issue_repop(repop, now);
eval_repop(repop);
repop->put();
}
+void ReplicatedPG::reply_ctx(OpContext *ctx, int r)
+{
+ osd->reply_op_error(ctx->op, r);
+ close_op_ctx(ctx);
+}
+
+void ReplicatedPG::reply_ctx(OpContext *ctx, int r, eversion_t v, version_t uv)
+{
+ osd->reply_op_error(ctx->op, r, v, uv);
+ close_op_ctx(ctx);
+}
void ReplicatedPG::log_op_stats(OpContext *ctx)
{
OpRequestRef op = ctx->op;
- MOSDOp *m = static_cast<MOSDOp*>(op->request);
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
- utime_t now = ceph_clock_now(g_ceph_context);
+ utime_t now = ceph_clock_now(cct);
utime_t latency = now;
- latency -= ctx->op->request->get_recv_stamp();
+ latency -= ctx->op->get_req()->get_recv_stamp();
utime_t rlatency;
if (ctx->readable_stamp != utime_t()) {
rlatency = ctx->readable_stamp;
- rlatency -= ctx->op->request->get_recv_stamp();
+ rlatency -= ctx->op->get_req()->get_recv_stamp();
}
uint64_t inb = ctx->bytes_written;
@@ -1131,41 +1465,16 @@ void ReplicatedPG::log_op_stats(OpContext *ctx)
<< " lat " << latency << dendl;
}
-void ReplicatedPG::log_subop_stats(OpRequestRef op, int tag_inb, int tag_lat)
-{
- utime_t now = ceph_clock_now(g_ceph_context);
- utime_t latency = now;
- latency -= op->request->get_recv_stamp();
-
- uint64_t inb = op->request->get_data().length();
-
- osd->logger->inc(l_osd_sop);
-
- osd->logger->inc(l_osd_sop_inb, inb);
- osd->logger->tinc(l_osd_sop_lat, latency);
-
- if (tag_inb)
- osd->logger->inc(tag_inb, inb);
- osd->logger->tinc(tag_lat, latency);
-}
-
-
-
void ReplicatedPG::do_sub_op(OpRequestRef op)
{
- MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request);
+ MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req());
assert(have_same_or_newer_map(m->map_epoch));
assert(m->get_header().type == MSG_OSD_SUBOP);
- dout(15) << "do_sub_op " << *op->request << dendl;
+ dout(15) << "do_sub_op " << *op->get_req() << dendl;
OSDOp *first = NULL;
if (m->ops.size() >= 1) {
first = &m->ops[0];
- switch (first->op.op) {
- case CEPH_OSD_OP_PULL:
- sub_op_pull(op);
- return;
- }
}
if (!is_active()) {
@@ -1176,9 +1485,6 @@ void ReplicatedPG::do_sub_op(OpRequestRef op)
if (first) {
switch (first->op.op) {
- case CEPH_OSD_OP_PUSH:
- sub_op_push(op);
- return;
case CEPH_OSD_OP_DELETE:
sub_op_remove(op);
return;
@@ -1202,16 +1508,11 @@ void ReplicatedPG::do_sub_op(OpRequestRef op)
void ReplicatedPG::do_sub_op_reply(OpRequestRef op)
{
- MOSDSubOpReply *r = static_cast<MOSDSubOpReply *>(op->request);
+ MOSDSubOpReply *r = static_cast<MOSDSubOpReply *>(op->get_req());
assert(r->get_header().type == MSG_OSD_SUBOPREPLY);
if (r->ops.size() >= 1) {
OSDOp& first = r->ops[0];
switch (first.op.op) {
- case CEPH_OSD_OP_PUSH:
- // continue peer recovery
- sub_op_push_reply(op);
- return;
-
case CEPH_OSD_OP_SCRUB_RESERVE:
sub_op_scrub_reserve_reply(op);
return;
@@ -1225,7 +1526,7 @@ void ReplicatedPG::do_scan(
OpRequestRef op,
ThreadPool::TPHandle &handle)
{
- MOSDPGScan *m = static_cast<MOSDPGScan*>(op->request);
+ MOSDPGScan *m = static_cast<MOSDPGScan*>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_SCAN);
dout(10) << "do_scan " << *m << dendl;
@@ -1248,10 +1549,14 @@ void ReplicatedPG::do_scan(
}
BackfillInterval bi;
- osr->flush();
+ bi.begin = m->begin;
+ // No need to flush, there won't be any in progress writes occuring
+ // past m->begin
scan_range(
- m->begin, g_conf->osd_backfill_scan_min,
- g_conf->osd_backfill_scan_max, &bi, handle);
+ cct->_conf->osd_backfill_scan_min,
+ cct->_conf->osd_backfill_scan_max,
+ &bi,
+ handle);
MOSDPGScan *reply = new MOSDPGScan(MOSDPGScan::OP_SCAN_DIGEST,
get_osdmap()->get_epoch(), m->query_epoch,
info.pgid, bi.begin, bi.end);
@@ -1297,9 +1602,9 @@ void ReplicatedPG::do_scan(
}
}
-void ReplicatedPG::_do_push(OpRequestRef op)
+void ReplicatedBackend::_do_push(OpRequestRef op)
{
- MOSDPGPush *m = static_cast<MOSDPGPush *>(op->request);
+ MOSDPGPush *m = static_cast<MOSDPGPush *>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_PUSH);
int from = m->get_source().num();
@@ -1314,52 +1619,89 @@ void ReplicatedPG::_do_push(OpRequestRef op)
MOSDPGPushReply *reply = new MOSDPGPushReply;
reply->set_priority(m->get_priority());
- reply->pgid = info.pgid;
+ reply->pgid = get_info().pgid;
reply->map_epoch = m->map_epoch;
reply->replies.swap(replies);
- reply->compute_cost(g_ceph_context);
+ reply->compute_cost(cct);
- t->register_on_complete(new C_OSD_SendMessageOnConn(
- osd, reply, m->get_connection()));
+ t->register_on_complete(
+ new C_OSD_SendMessageOnConn(
+ osd, reply, m->get_connection()));
- osd->store->queue_transaction(osr.get(), t);
+ get_parent()->queue_transaction(t);
}
-void ReplicatedPG::_do_pull_response(OpRequestRef op)
+struct C_ReplicatedBackend_OnPullComplete : GenContext<ThreadPool::TPHandle&> {
+ ReplicatedBackend *bc;
+ list<ObjectContextRef> to_continue;
+ int priority;
+ C_ReplicatedBackend_OnPullComplete(ReplicatedBackend *bc, int priority)
+ : bc(bc), priority(priority) {}
+
+ void finish(ThreadPool::TPHandle &handle) {
+ ReplicatedBackend::RPGHandle *h = bc->_open_recovery_op();
+ for (list<ObjectContextRef>::iterator i =
+ to_continue.begin();
+ i != to_continue.end();
+ ++i) {
+ if (!bc->start_pushes((*i)->obs.oi.soid, *i, h)) {
+ bc->get_parent()->on_global_recover(
+ (*i)->obs.oi.soid);
+ }
+ handle.reset_tp_timeout();
+ }
+ bc->run_recovery_op(h, priority);
+ }
+};
+
+void ReplicatedBackend::_do_pull_response(OpRequestRef op)
{
- MOSDPGPush *m = static_cast<MOSDPGPush *>(op->request);
+ MOSDPGPush *m = static_cast<MOSDPGPush *>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_PUSH);
int from = m->get_source().num();
vector<PullOp> replies(1);
ObjectStore::Transaction *t = new ObjectStore::Transaction;
+ list<ObjectContextRef> to_continue;
for (vector<PushOp>::iterator i = m->pushes.begin();
i != m->pushes.end();
++i) {
- bool more = handle_pull_response(from, *i, &(replies.back()), t);
+ bool more = handle_pull_response(from, *i, &(replies.back()), &to_continue, t);
if (more)
replies.push_back(PullOp());
}
+ if (!to_continue.empty()) {
+ C_ReplicatedBackend_OnPullComplete *c =
+ new C_ReplicatedBackend_OnPullComplete(
+ this,
+ m->get_priority());
+ c->to_continue.swap(to_continue);
+ t->register_on_complete(
+ new C_QueueInWQ(
+ &osd->push_wq,
+ get_parent()->bless_gencontext(c)));
+ }
replies.erase(replies.end() - 1);
if (replies.size()) {
MOSDPGPull *reply = new MOSDPGPull;
reply->set_priority(m->get_priority());
- reply->pgid = info.pgid;
+ reply->pgid = get_info().pgid;
reply->map_epoch = m->map_epoch;
reply->pulls.swap(replies);
- reply->compute_cost(g_ceph_context);
+ reply->compute_cost(cct);
- t->register_on_complete(new C_OSD_SendMessageOnConn(
- osd, reply, m->get_connection()));
+ t->register_on_complete(
+ new C_OSD_SendMessageOnConn(
+ osd, reply, m->get_connection()));
}
- osd->store->queue_transaction(osr.get(), t);
+ get_parent()->queue_transaction(t);
}
-void ReplicatedPG::do_pull(OpRequestRef op)
+void ReplicatedBackend::do_pull(OpRequestRef op)
{
- MOSDPGPull *m = static_cast<MOSDPGPull *>(op->request);
+ MOSDPGPull *m = static_cast<MOSDPGPull *>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_PULL);
int from = m->get_source().num();
@@ -1373,9 +1715,9 @@ void ReplicatedPG::do_pull(OpRequestRef op)
send_pushes(m->get_priority(), replies);
}
-void ReplicatedPG::do_push_reply(OpRequestRef op)
+void ReplicatedBackend::do_push_reply(OpRequestRef op)
{
- MOSDPGPushReply *m = static_cast<MOSDPGPushReply *>(op->request);
+ MOSDPGPushReply *m = static_cast<MOSDPGPushReply *>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_PUSH_REPLY);
int from = m->get_source().num();
@@ -1396,7 +1738,7 @@ void ReplicatedPG::do_push_reply(OpRequestRef op)
void ReplicatedPG::do_backfill(OpRequestRef op)
{
- MOSDPGBackfill *m = static_cast<MOSDPGBackfill*>(op->request);
+ MOSDPGBackfill *m = static_cast<MOSDPGBackfill*>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_BACKFILL);
dout(10) << "do_backfill " << *m << dendl;
@@ -1406,12 +1748,12 @@ void ReplicatedPG::do_backfill(OpRequestRef op)
case MOSDPGBackfill::OP_BACKFILL_FINISH:
{
assert(is_replica());
- assert(g_conf->osd_kill_backfill_at != 1);
+ assert(cct->_conf->osd_kill_backfill_at != 1);
MOSDPGBackfill *reply = new MOSDPGBackfill(MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
get_osdmap()->get_epoch(), m->query_epoch,
info.pgid);
- reply->set_priority(g_conf->osd_recovery_op_priority);
+ reply->set_priority(cct->_conf->osd_recovery_op_priority);
osd->send_message_osd_cluster(reply, m->get_connection());
queue_peering_event(
CephPeeringEvtRef(
@@ -1425,7 +1767,7 @@ void ReplicatedPG::do_backfill(OpRequestRef op)
case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
{
assert(is_replica());
- assert(g_conf->osd_kill_backfill_at != 2);
+ assert(cct->_conf->osd_kill_backfill_at != 2);
info.last_backfill = m->last_backfill;
if (m->compat_stat_sum) {
@@ -1445,7 +1787,7 @@ void ReplicatedPG::do_backfill(OpRequestRef op)
case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
{
assert(is_primary());
- assert(g_conf->osd_kill_backfill_at != 3);
+ assert(cct->_conf->osd_kill_backfill_at != 3);
finish_recovery_op(hobject_t::get_max());
}
break;
@@ -1494,7 +1836,7 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid)
&obc->obs,
obc->ssc,
this);
- ctx->mtime = ceph_clock_now(g_ceph_context);
+ ctx->mtime = ceph_clock_now(cct);
ctx->at_version.epoch = get_osdmap()->get_epoch();
ctx->at_version.version = pg_log.get_head().version + 1;
@@ -1567,6 +1909,7 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid)
coid,
ctx->at_version,
ctx->obs->oi.version,
+ 0,
osd_reqid_t(),
ctx->mtime)
);
@@ -1589,6 +1932,7 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid)
coid,
coi.version,
coi.prior_version,
+ 0,
osd_reqid_t(),
ctx->mtime)
);
@@ -1613,6 +1957,7 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid)
snapoid,
ctx->at_version,
ctx->snapset_obc->obs.oi.version,
+ 0,
osd_reqid_t(),
ctx->mtime)
);
@@ -1627,6 +1972,7 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid)
snapoid,
ctx->at_version,
ctx->snapset_obc->obs.oi.version,
+ 0,
osd_reqid_t(),
ctx->mtime)
);
@@ -2012,11 +2358,11 @@ int ReplicatedPG::do_tmapup(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd
return result;
}
-static int check_offset_and_length(uint64_t offset, uint64_t length)
+static int check_offset_and_length(uint64_t offset, uint64_t length, uint64_t max)
{
- if (offset >= g_conf->osd_max_object_size ||
- length > g_conf->osd_max_object_size ||
- offset + length > g_conf->osd_max_object_size)
+ if (offset >= max ||
+ length > max ||
+ offset + length > max)
return -EFBIG;
return 0;
@@ -2056,7 +2402,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
ObjectContextRef src_obc;
if (ceph_osd_op_type_multi(op.op)) {
- MOSDOp *m = static_cast<MOSDOp *>(ctx->op->request);
+ MOSDOp *m = static_cast<MOSDOp *>(ctx->op->get_req());
object_locator_t src_oloc;
get_src_oloc(soid.oid, m->get_object_locator(), src_oloc);
hobject_t src_oid(osd_op.soid, src_oloc.key, soid.hash,
@@ -2075,9 +2421,9 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
// munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
if (op.op == CEPH_OSD_OP_ZERO &&
obs.exists &&
- op.extent.offset < g_conf->osd_max_object_size &&
+ op.extent.offset < cct->_conf->osd_max_object_size &&
op.extent.length >= 1 &&
- op.extent.length <= g_conf->osd_max_object_size &&
+ op.extent.length <= cct->_conf->osd_max_object_size &&
op.extent.offset + op.extent.length >= oi.size) {
if (op.extent.offset >= oi.size) {
// no-op
@@ -2177,7 +2523,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
uint64_t last = op.extent.offset;
for (miter = m.begin(); miter != m.end(); ++miter) {
// verify hole?
- if (g_conf->osd_verify_sparse_read_holes &&
+ if (cct->_conf->osd_verify_sparse_read_holes &&
last < miter->first) {
bufferlist t;
uint64_t len = miter->first - last;
@@ -2202,7 +2548,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
}
// verify trailing hole?
- if (g_conf->osd_verify_sparse_read_holes) {
+ if (cct->_conf->osd_verify_sparse_read_holes) {
uint64_t end = MIN(op.extent.offset + op.extent.length, oi.size);
if (last < end) {
bufferlist t;
@@ -2306,13 +2652,35 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
}
break;
+ case CEPH_OSD_OP_ISDIRTY:
+ ++ctx->num_read;
+ {
+ bool is_dirty = obs.oi.is_dirty();
+ ::encode(is_dirty, osd_op.outdata);
+ ctx->delta_stats.num_rd++;
+ result = 0;
+ }
+ break;
+
+ case CEPH_OSD_OP_UNDIRTY:
+ ++ctx->num_write;
+ {
+ ctx->undirty = true; // see make_writeable()
+ ctx->modify = true;
+ ctx->delta_stats.num_wr++;
+ }
+ break;
+
case CEPH_OSD_OP_GETXATTR:
++ctx->num_read;
{
string aname;
bp.copy(op.xattr.name_len, aname);
string name = "_" + aname;
- int r = osd->store->getattr(coll, soid, name.c_str(), osd_op.outdata);
+ int r = pgbackend->objects_get_attr(
+ soid,
+ name,
+ &(osd_op.outdata));
if (r >= 0) {
op.xattr.value_len = r;
result = 0;
@@ -2355,9 +2723,15 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
bufferlist xattr;
if (op.op == CEPH_OSD_OP_CMPXATTR)
- result = osd->store->getattr(coll, soid, name.c_str(), xattr);
+ result = pgbackend->objects_get_attr(
+ soid,
+ name,
+ &xattr);
else
- result = osd->store->getattr(coll, src_obc->obs.oi.soid, name.c_str(), xattr);
+ result = pgbackend->objects_get_attr(
+ src_obc->obs.oi.soid,
+ name,
+ &xattr);
if (result < 0 && result != -EEXIST && result != -ENODATA)
break;
@@ -2417,12 +2791,12 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
uint64_t ver = op.watch.ver;
if (!ver)
result = -EINVAL;
- else if (ver < oi.user_version.version)
+ else if (ver < oi.user_version)
result = -ERANGE;
- else if (ver > oi.user_version.version)
+ else if (ver > oi.user_version)
result = -EOVERFLOW;
- break;
}
+ break;
case CEPH_OSD_OP_LIST_WATCHERS:
++ctx->num_read;
@@ -2531,12 +2905,12 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
case CEPH_OSD_OP_ASSERT_SRC_VERSION:
++ctx->num_read;
{
- uint64_t ver = op.watch.ver;
+ uint64_t ver = op.assert_ver.ver;
if (!ver)
result = -EINVAL;
- else if (ver < src_obc->obs.oi.user_version.version)
+ else if (ver < src_obc->obs.oi.user_version)
result = -ERANGE;
- else if (ver > src_obc->obs.oi.user_version.version)
+ else if (ver > src_obc->obs.oi.user_version)
result = -EOVERFLOW;
break;
}
@@ -2556,7 +2930,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
timeout = 0;
}
if (!timeout)
- timeout = g_conf->osd_default_notify_timeout;
+ timeout = cct->_conf->osd_default_notify_timeout;
notify_info_t n;
n.timeout = timeout;
@@ -2605,6 +2979,9 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
<< ", adjusting write length to " << op.extent.length << dendl;
+ bufferlist t;
+ t.substr_of(osd_op.indata, 0, op.extent.length);
+ osd_op.indata.swap(t);
}
if (op.extent.truncate_seq > seq) {
// write arrives before trimtrunc
@@ -2626,7 +3003,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
oi.truncate_size = op.extent.truncate_size;
}
}
- result = check_offset_and_length(op.extent.offset, op.extent.length);
+ result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
if (result < 0)
break;
t.write(coll, soid, op.extent.offset, op.extent.length, osd_op.indata);
@@ -2646,7 +3023,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
result = -EINVAL;
break;
}
- result = check_offset_and_length(op.extent.offset, op.extent.length);
+ result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
if (result < 0)
break;
if (obs.exists) {
@@ -2678,7 +3055,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
case CEPH_OSD_OP_ZERO:
++ctx->num_write;
{ // zero
- result = check_offset_and_length(op.extent.offset, op.extent.length);
+ result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
if (result < 0)
break;
assert(op.extent.length);
@@ -2727,7 +3104,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
}
}
break;
-
+
case CEPH_OSD_OP_TRIMTRUNC:
op.extent.offset = op.extent.truncate_size;
// falling through
@@ -2741,7 +3118,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
break;
}
- if (op.extent.offset > g_conf->osd_max_object_size) {
+ if (op.extent.offset > cct->_conf->osd_max_object_size) {
result = -EFBIG;
break;
}
@@ -2821,12 +3198,12 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
dout(10) << "watch: ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
<< " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
- dout(10) << "watch: oi.user_version=" << oi.user_version.version << dendl;
+ dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
dout(10) << "watch: peer_addr="
- << ctx->op->request->get_connection()->get_peer_addr() << dendl;
+ << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
- watch_info_t w(cookie, g_conf->osd_client_watch_timeout,
- ctx->op->request->get_connection()->get_peer_addr());
+ watch_info_t w(cookie, cct->_conf->osd_client_watch_timeout,
+ ctx->op->get_req()->get_connection()->get_peer_addr());
if (do_watch) {
if (oi.watchers.count(make_pair(cookie, entity))) {
dout(10) << " found existing watch " << w << " by " << entity << dendl;
@@ -2858,7 +3235,8 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
case CEPH_OSD_OP_SETXATTR:
++ctx->num_write;
{
- if (op.xattr.value_len > g_conf->osd_max_attr_size) {
+ if (cct->_conf->osd_max_attr_size > 0 &&
+ op.xattr.value_len > cct->_conf->osd_max_attr_size) {
result = -EFBIG;
break;
}
@@ -2956,11 +3334,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
}
}
- if (g_conf->osd_tmapput_sets_uses_tmap) {
- assert(g_conf->osd_auto_upgrade_tmap);
- oi.uses_tmap = true;
- }
-
// write it
vector<OSDOp> nops(1);
OSDOp& newop = nops[0];
@@ -3006,29 +3379,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
}
set<string> out_set;
- if (oi.uses_tmap && g_conf->osd_auto_upgrade_tmap) {
- dout(20) << "CEPH_OSD_OP_OMAPGETKEYS: "
- << " Reading " << oi.soid << " omap from tmap" << dendl;
- map<string, bufferlist> vals;
- bufferlist header;
- int r = _get_tmap(ctx, &vals, &header);
- if (r == 0) {
- map<string, bufferlist>::iterator iter =
- vals.upper_bound(start_after);
- for (uint64_t i = 0;
- i < max_return && iter != vals.end();
- ++i, iter++) {
- out_set.insert(iter->first);
- }
- ::encode(out_set, osd_op.outdata);
- ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
- ctx->delta_stats.num_rd++;
- break;
- }
- dout(10) << "failed, reading from omap" << dendl;
- // No valid tmap, use omap
- }
-
{
ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
coll, soid
@@ -3046,6 +3396,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
ctx->delta_stats.num_rd++;
}
break;
+
case CEPH_OSD_OP_OMAPGETVALS:
++ctx->num_read;
{
@@ -3063,30 +3414,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
}
map<string, bufferlist> out_set;
- if (oi.uses_tmap && g_conf->osd_auto_upgrade_tmap) {
- dout(20) << "CEPH_OSD_OP_OMAPGETVALS: "
- << " Reading " << oi.soid << " omap from tmap" << dendl;
- map<string, bufferlist> vals;
- bufferlist header;
- int r = _get_tmap(ctx, &vals, &header);
- if (r == 0) {
- map<string, bufferlist>::iterator iter = vals.upper_bound(start_after);
- if (filter_prefix > start_after) iter = vals.lower_bound(filter_prefix);
- for (uint64_t i = 0;
- i < max_return && iter != vals.end() &&
- iter->first.substr(0, filter_prefix.size()) == filter_prefix;
- ++i, iter++) {
- out_set.insert(*iter);
- }
- ::encode(out_set, osd_op.outdata);
- ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
- ctx->delta_stats.num_rd++;
- break;
- }
- // No valid tmap, use omap
- dout(10) << "failed, reading from omap" << dendl;
- }
-
{
ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
coll, soid
@@ -3110,27 +3437,16 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
ctx->delta_stats.num_rd++;
}
break;
+
case CEPH_OSD_OP_OMAPGETHEADER:
++ctx->num_read;
{
- if (oi.uses_tmap && g_conf->osd_auto_upgrade_tmap) {
- dout(20) << "CEPH_OSD_OP_OMAPGETHEADER: "
- << " Reading " << oi.soid << " omap from tmap" << dendl;
- map<string, bufferlist> vals;
- bufferlist header;
- int r = _get_tmap(ctx, &vals, &header);
- if (r == 0) {
- osd_op.outdata.claim(header);
- break;
- }
- // No valid tmap, fall through to omap
- dout(10) << "failed, reading from omap" << dendl;
- }
osd->store->omap_get_header(coll, soid, &osd_op.outdata);
ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
ctx->delta_stats.num_rd++;
}
break;
+
case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
++ctx->num_read;
{
@@ -3143,34 +3459,13 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
goto fail;
}
map<string, bufferlist> out;
- if (oi.uses_tmap && g_conf->osd_auto_upgrade_tmap) {
- dout(20) << "CEPH_OSD_OP_OMAPGET: "
- << " Reading " << oi.soid << " omap from tmap" << dendl;
- map<string, bufferlist> vals;
- bufferlist header;
- int r = _get_tmap(ctx, &vals, &header);
- if (r == 0) {
- for (set<string>::iterator iter = keys_to_get.begin();
- iter != keys_to_get.end();
- ++iter) {
- if (vals.count(*iter)) {
- out.insert(*(vals.find(*iter)));
- }
- }
- ::encode(out, osd_op.outdata);
- ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
- ctx->delta_stats.num_rd++;
- break;
- }
- // No valid tmap, use omap
- dout(10) << "failed, reading from omap" << dendl;
- }
osd->store->omap_get_values(coll, soid, keys_to_get, &out);
::encode(out, osd_op.outdata);
ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
ctx->delta_stats.num_rd++;
}
break;
+
case CEPH_OSD_OP_OMAP_CMP:
++ctx->num_read;
{
@@ -3236,13 +3531,11 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
}
}
break;
+
// OMAP Write ops
case CEPH_OSD_OP_OMAPSETVALS:
++ctx->num_write;
{
- if (oi.uses_tmap && g_conf->osd_auto_upgrade_tmap) {
- _copy_up_tmap(ctx);
- }
if (!obs.exists) {
ctx->delta_stats.num_objects++;
obs.exists = true;
@@ -3266,12 +3559,10 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
ctx->delta_stats.num_wr++;
}
break;
+
case CEPH_OSD_OP_OMAPSETHEADER:
++ctx->num_write;
{
- if (oi.uses_tmap && g_conf->osd_auto_upgrade_tmap) {
- _copy_up_tmap(ctx);
- }
if (!obs.exists) {
ctx->delta_stats.num_objects++;
obs.exists = true;
@@ -3281,6 +3572,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
ctx->delta_stats.num_wr++;
}
break;
+
case CEPH_OSD_OP_OMAPCLEAR:
++ctx->num_write;
{
@@ -3288,14 +3580,12 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
result = -ENOENT;
break;
}
- if (oi.uses_tmap && g_conf->osd_auto_upgrade_tmap) {
- _copy_up_tmap(ctx);
- }
t.touch(coll, soid);
t.omap_clear(coll, soid);
ctx->delta_stats.num_wr++;
}
break;
+
case CEPH_OSD_OP_OMAPRMKEYS:
++ctx->num_write;
{
@@ -3303,9 +3593,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
result = -ENOENT;
break;
}
- if (oi.uses_tmap && g_conf->osd_auto_upgrade_tmap) {
- _copy_up_tmap(ctx);
- }
t.touch(coll, soid);
set<string> to_rm;
try {
@@ -3319,6 +3606,130 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
ctx->delta_stats.num_wr++;
}
break;
+
+ case CEPH_OSD_OP_COPY_GET:
+ ++ctx->num_read;
+ {
+ object_copy_cursor_t cursor;
+ uint64_t out_max;
+ try {
+ ::decode(cursor, bp);
+ ::decode(out_max, bp);
+ }
+ catch (buffer::error& e) {
+ result = -EINVAL;
+ goto fail;
+ }
+
+ // size, mtime
+ ::encode(oi.size, osd_op.outdata);
+ ::encode(oi.mtime, osd_op.outdata);
+
+ // attrs
+ map<string,bufferptr> out_attrs;
+ if (!cursor.attr_complete) {
+ result = osd->store->getattrs(coll, soid, out_attrs, true);
+ if (result < 0)
+ break;
+ cursor.attr_complete = true;
+ dout(20) << " got attrs" << dendl;
+ }
+ ::encode(out_attrs, osd_op.outdata);
+
+ int64_t left = out_max - osd_op.outdata.length();
+
+ // data
+ bufferlist bl;
+ if (left > 0 && !cursor.data_complete) {
+ if (cursor.data_offset < oi.size) {
+ result = osd->store->read(coll, oi.soid, cursor.data_offset, left, bl);
+ if (result < 0)
+ return result;
+ assert(result <= left);
+ left -= result;
+ cursor.data_offset += result;
+ }
+ if (cursor.data_offset == oi.size) {
+ cursor.data_complete = true;
+ dout(20) << " got data" << dendl;
+ }
+ }
+ ::encode(bl, osd_op.outdata);
+
+ // omap
+ std::map<std::string,bufferlist> out_omap;
+ if (left > 0 && !cursor.omap_complete) {
+ ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(coll, oi.soid);
+ assert(iter);
+ if (iter->valid()) {
+ iter->upper_bound(cursor.omap_offset);
+ for (; left > 0 && iter->valid(); iter->next()) {
+ out_omap.insert(make_pair(iter->key(), iter->value()));
+ left -= iter->key().length() + 4 + iter->value().length() + 4;
+ }
+ }
+ if (iter->valid()) {
+ cursor.omap_offset = iter->key();
+ } else {
+ cursor.omap_complete = true;
+ dout(20) << " got omap" << dendl;
+ }
+ }
+ ::encode(out_omap, osd_op.outdata);
+
+ dout(20) << " cursor.is_complete=" << cursor.is_complete()
+ << " " << out_attrs.size() << " attrs"
+ << " " << bl.length() << " bytes"
+ << " " << out_omap.size() << " keys"
+ << dendl;
+ ::encode(cursor, osd_op.outdata);
+ result = 0;
+ }
+ break;
+
+ case CEPH_OSD_OP_COPY_FROM:
+ ++ctx->num_write;
+ {
+ object_t src_name;
+ object_locator_t src_oloc;
+ snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
+ version_t src_version = op.copy_from.src_version;
+ try {
+ ::decode(src_name, bp);
+ ::decode(src_oloc, bp);
+ }
+ catch (buffer::error& e) {
+ result = -EINVAL;
+ goto fail;
+ }
+ if (!ctx->copy_cb) {
+ // start
+ pg_t raw_pg;
+ get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
+ hobject_t src(src_name, src_oloc.key, src_snapid,
+ raw_pg.ps(), raw_pg.pool(),
+ src_oloc.nspace);
+ if (src == soid) {
+ dout(20) << " copy from self is invalid" << dendl;
+ result = -EINVAL;
+ break;
+ }
+ hobject_t temp_target = generate_temp_object();
+ CopyFromCallback *cb = new CopyFromCallback(ctx, temp_target);
+ ctx->copy_cb = cb;
+ result = start_copy(cb, ctx->obc, src, src_oloc, src_version,
+ temp_target);
+ if (result < 0)
+ goto fail;
+ result = -EINPROGRESS;
+ } else {
+ // finish
+ assert(ctx->copy_cb->get_result() >= 0);
+ result = finish_copyfrom(ctx);
+ }
+ }
+ break;
+
default:
dout(1) << "unrecognized osd op " << op.op
<< " " << ceph_osd_op_name(op.op)
@@ -3360,22 +3771,6 @@ int ReplicatedPG::_get_tmap(OpContext *ctx,
return 0;
}
-int ReplicatedPG::_copy_up_tmap(OpContext *ctx)
-{
- dout(20) << "copying up tmap for " << ctx->new_obs.oi.soid << dendl;
- ctx->new_obs.oi.uses_tmap = false;
- map<string, bufferlist> vals;
- bufferlist header;
- int r = _get_tmap(ctx, &vals, &header);
- if (r < 0)
- return 0;
- ctx->op_t.omap_setkeys(coll, ctx->new_obs.oi.soid,
- vals);
- ctx->op_t.omap_setheader(coll, ctx->new_obs.oi.soid,
- header);
- return 0;
-}
-
inline int ReplicatedPG::_delete_head(OpContext *ctx)
{
SnapSet& snapset = ctx->new_snapset;
@@ -3422,37 +3817,35 @@ int ReplicatedPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
int ret = find_object_context(
hobject_t(soid.oid, soid.get_key(), snapid, soid.hash, info.pgid.pool(), soid.get_namespace()),
&rollback_to, false, &cloneid);
- if (ret) {
- if (-ENOENT == ret) {
- // there's no snapshot here, or there's no object.
- // if there's no snapshot, we delete the object; otherwise, do nothing.
- dout(20) << "_rollback_to deleting head on " << soid.oid
- << " because got ENOENT on find_object_context" << dendl;
- if (ctx->obc->obs.oi.watchers.size()) {
- // Cannot delete an object with watchers
- ret = -EBUSY;
- } else {
- _delete_head(ctx);
- ret = 0;
- }
- } else if (-EAGAIN == ret) {
- /* a different problem, like degraded pool
- * with not-yet-restored object. We shouldn't have been able
- * to get here; recovery should have completed first! */
- hobject_t rollback_target(soid.oid, soid.get_key(), cloneid, soid.hash,
- info.pgid.pool(), soid.get_namespace());
- assert(is_missing_object(rollback_target));
- dout(20) << "_rollback_to attempted to roll back to a missing object "
- << rollback_target << " (requested snapid: ) " << snapid << dendl;
- wait_for_missing_object(rollback_target, ctx->op);
+ if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
+ // there's no snapshot here, or there's no object.
+ // if there's no snapshot, we delete the object; otherwise, do nothing.
+ dout(20) << "_rollback_to deleting head on " << soid.oid
+ << " because got ENOENT|whiteout on find_object_context" << dendl;
+ if (ctx->obc->obs.oi.watchers.size()) {
+ // Cannot delete an object with watchers
+ ret = -EBUSY;
} else {
- // ummm....huh? It *can't* return anything else at time of writing.
- assert(0);
- }
+ _delete_head(ctx);
+ ret = 0;
+ }
+ } else if (-EAGAIN == ret) {
+ /* a different problem, like degraded pool
+ * with not-yet-restored object. We shouldn't have been able
+ * to get here; recovery should have completed first! */
+ hobject_t rollback_target(soid.oid, soid.get_key(), cloneid, soid.hash,
+ info.pgid.pool(), soid.get_namespace());
+ assert(is_missing_object(rollback_target));
+ dout(20) << "_rollback_to attempted to roll back to a missing object "
+ << rollback_target << " (requested snapid: ) " << snapid << dendl;
+ wait_for_missing_object(rollback_target, ctx->op);
+ } else if (ret) {
+ // ummm....huh? It *can't* return anything else at time of writing.
+ assert(0 == "unexpected error code in _rollback_to");
} else { //we got our context, let's use it to do the rollback!
hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
if (is_degraded_object(rollback_to_sobject)) {
- dout(20) << "_rollback_to attempted to roll back to a degraded object "
+ dout(20) << "_rollback_to attempted to roll back to a degraded object "
<< rollback_to_sobject << " (requested snapid: ) " << snapid << dendl;
wait_for_degraded_object(rollback_to_sobject, ctx->op);
ret = -EAGAIN;
@@ -3528,6 +3921,15 @@ void ReplicatedPG::make_writeable(OpContext *ctx)
dout(20) << "make_writeable " << soid << " snapset=" << ctx->snapset
<< " snapc=" << snapc << dendl;;
+ // we will mark the object dirty
+ if (ctx->undirty) {
+ dout(20) << " clearing DIRTY flag" << dendl;
+ ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
+ } else {
+ dout(20) << " setting DIRTY flag" << dendl;
+ ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
+ }
+
// use newer snapc?
if (ctx->new_snapset.seq > snapc.seq) {
snapc.seq = ctx->new_snapset.seq;
@@ -3590,7 +3992,9 @@ void ReplicatedPG::make_writeable(OpContext *ctx)
<< " to " << coid << " v " << ctx->at_version
<< " snaps=" << snaps << dendl;
ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::CLONE, coid, ctx->at_version,
- ctx->obs->oi.version, ctx->reqid, ctx->new_obs.oi.mtime));
+ ctx->obs->oi.version,
+ ctx->obs->oi.user_version,
+ osd_reqid_t(), ctx->new_obs.oi.mtime));
::encode(snaps, ctx->log.back().snaps);
ctx->at_version.version++;
@@ -3644,7 +4048,7 @@ void ReplicatedPG::add_interval_usage(interval_set<uint64_t>& s, object_stat_sum
void ReplicatedPG::do_osd_op_effects(OpContext *ctx)
{
- ConnectionRef conn(ctx->op->request->get_connection());
+ ConnectionRef conn(ctx->op->get_req()->get_connection());
boost::intrusive_ptr<OSD::Session> session(
(OSD::Session *)conn->get_priv());
session->put(); // get_priv() takes a ref, and so does the intrusive_ptr
@@ -3707,7 +4111,7 @@ void ReplicatedPG::do_osd_op_effects(OpContext *ctx)
p->timeout,
p->cookie,
osd->get_next_id(get_osdmap()->get_epoch()),
- ctx->obc->obs.oi.user_version.version,
+ ctx->obc->obs.oi.user_version,
osd));
for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
ctx->obc->watchers.begin();
@@ -3736,19 +4140,19 @@ void ReplicatedPG::do_osd_op_effects(OpContext *ctx)
}
}
-bool ReplicatedPG::have_temp_coll()
+coll_t ReplicatedPG::get_temp_coll(ObjectStore::Transaction *t)
{
- return temp_created || osd->store->collection_exists(temp_coll);
+ return pgbackend->get_temp_coll(t);
}
-coll_t ReplicatedPG::get_temp_coll(ObjectStore::Transaction *t)
+hobject_t ReplicatedPG::generate_temp_object()
{
- if (temp_created)
- return temp_coll;
- if (!osd->store->collection_exists(temp_coll))
- t->create_collection(temp_coll);
- temp_created = true;
- return temp_coll;
+ ostringstream ss;
+ ss << "temp_" << info.pgid << "_" << get_role() << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
+ hobject_t hoid(object_t(ss.str()), "", CEPH_NOSNAP, 0, -1, "");
+ pgbackend->add_temp_obj(hoid);
+ dout(20) << __func__ << " " << hoid << dendl;
+ return hoid;
}
int ReplicatedPG::prepare_transaction(OpContext *ctx)
@@ -3779,7 +4183,6 @@ int ReplicatedPG::prepare_transaction(OpContext *ctx)
// read-op? done?
if (ctx->op_t.empty() && !ctx->modify) {
- ctx->reply_version = ctx->obs->oi.user_version;
unstable_stats.add(ctx->delta_stats, ctx->obc->obs.oi.category);
return result;
}
@@ -3805,7 +4208,7 @@ int ReplicatedPG::prepare_transaction(OpContext *ctx)
dout(10) << " removing old " << snapoid << dendl;
ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::DELETE, snapoid, ctx->at_version, old_version,
- osd_reqid_t(), ctx->mtime));
+ 0, osd_reqid_t(), ctx->mtime));
ctx->at_version.version++;
ctx->snapset_obc->obs.exists = false;
@@ -3818,7 +4221,7 @@ int ReplicatedPG::prepare_transaction(OpContext *ctx)
dout(10) << " final snapset " << ctx->new_snapset
<< " in " << snapoid << dendl;
ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, snapoid, ctx->at_version, old_version,
- osd_reqid_t(), ctx->mtime));
+ 0, osd_reqid_t(), ctx->mtime));
ctx->snapset_obc = get_object_context(snapoid, true);
ctx->snapset_obc->obs.exists = true;
@@ -3836,12 +4239,17 @@ int ReplicatedPG::prepare_transaction(OpContext *ctx)
// finish and log the op.
if (ctx->user_modify) {
- /* update the user_version for any modify ops, except for the watch op */
- ctx->new_obs.oi.user_version = ctx->at_version;
+ // update the user_version for any modify ops, except for the watch op
+ ctx->user_at_version = MAX(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
+ /* In order for new clients and old clients to interoperate properly
+ * when exchanging versions, we need to lower bound the user_version
+ * (which our new clients pay proper attention to)
+ * by the at_version (which is all the old clients can ever see). */
+ if (ctx->at_version.version > ctx->user_at_version)
+ ctx->user_at_version = ctx->at_version.version;
+ ctx->new_obs.oi.user_version = ctx->user_at_version;
}
- ctx->reply_version = ctx->new_obs.oi.user_version;
ctx->bytes_written = ctx->op_t.get_encoded_bytes();
- ctx->new_obs.oi.version = ctx->at_version;
if (ctx->new_obs.exists) {
// on the head object
@@ -3869,7 +4277,7 @@ int ReplicatedPG::prepare_transaction(OpContext *ctx)
if (!ctx->new_obs.exists)
logopcode = pg_log_entry_t::DELETE;
ctx->log.push_back(pg_log_entry_t(logopcode, soid, ctx->at_version, old_version,
- ctx->reqid, ctx->mtime));
+ ctx->user_at_version, ctx->reqid, ctx->mtime));
// apply new object state.
ctx->obc->obs = ctx->new_obs;
@@ -3893,6 +4301,248 @@ int ReplicatedPG::prepare_transaction(OpContext *ctx)
return result;
}
+// ========================================================================
+// copyfrom
+
+// Completion context for one asynchronous copy-get read issued by
+// ReplicatedPG::_copy_some().  When the read finishes, the result code is
+// handed back to the owning PG unless the PG's last_peering_reset epoch has
+// changed since the read was issued.
+struct C_Copyfrom : public Context {
+  ReplicatedPGRef pg;          // PG that issued the read
+  hobject_t oid;               // destination object; key into the PG's copy_ops
+  epoch_t last_peering_reset;  // PG's last_peering_reset when the read was issued
+  tid_t tid;                   // objecter tid, filled in once the read is submitted
+
+  C_Copyfrom(ReplicatedPG *p, hobject_t o, epoch_t lpr)
+    : pg(p),
+      oid(o),
+      last_peering_reset(lpr),
+      tid(0)
+  {}
+
+  void finish(int r) {
+    pg->lock();
+    // A different peering-reset epoch means this reply is stale; drop it.
+    const bool still_current =
+      (pg->get_last_peering_reset() == last_peering_reset);
+    if (still_current)
+      pg->process_copy_chunk(oid, tid, r);
+    pg->unlock();
+  }
+};
+
+// Begin an asynchronous copy of `src` into the object described by `obc`.
+// A copy already in flight for the same destination is cancelled first.
+// `cb` is completed when the copy finishes (process_copy_chunk) or is
+// cancelled (cancel_copy).  Always returns 0.
+int ReplicatedPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
+                             hobject_t src, object_locator_t oloc, version_t version,
+                             const hobject_t& temp_dest_oid)
+{
+  const hobject_t& dest = obc->obs.oi.soid;
+  dout(10) << __func__ << " " << dest
+           << " from " << src << " " << oloc << " v" << version
+           << dendl;
+
+  // cancel a previous in-progress copy?
+  map<hobject_t,CopyOpRef>::iterator prev = copy_ops.find(dest);
+  if (prev != copy_ops.end()) {
+    // FIXME: if the src etc match, we could avoid restarting from the
+    // beginning.
+    // Hold our own reference: cancel_copy() erases the map entry.
+    CopyOpRef stale = prev->second;
+    cancel_copy(stale);
+  }
+
+  CopyOpRef cop(new CopyOp(cb, obc, src, oloc, version, temp_dest_oid));
+  copy_ops[dest] = cop;
+  ++obc->copyfrom_readside;
+
+  // kick off the first read
+  _copy_some(obc, cop);
+
+  return 0;
+}
+
+// Issue the next copy-get read for `cop` against the source object.
+//
+// The read resumes from cop->cursor and fills cop's size/mtime/attrs/data/
+// omap/rval fields.  Completion is delivered through a C_Copyfrom context,
+// wrapped in a C_OnFinisher on osd->objecter_finisher.  The objecter tid of
+// the read is recorded on both the context and the copy op so that
+// process_copy_chunk() can match replies to the op.
+void ReplicatedPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
+{
+  dout(10) << __func__ << " " << obc << " " << cop << dendl;
+  ObjectOperation op;
+  if (cop->version) {
+    op.assert_version(cop->version);
+  } else {
+    // we should learn the version after the first chunk, if we didn't know
+    // it already!
+    assert(cop->cursor.is_initial());
+  }
+  op.copy_get(&cop->cursor, cct->_conf->osd_copyfrom_max_chunk,
+              &cop->size, &cop->mtime, &cop->attrs,
+              &cop->data, &cop->omap,
+              &cop->rval);
+
+  C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
+                                   get_last_peering_reset());
+
+  // Scoped lock (same idiom as cancel_copy) so objecter_lock is released on
+  // every exit path, including an exception out of read().
+  Mutex::Locker l(osd->objecter_lock);
+  tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
+                                  cop->src.snap, NULL, 0,
+                                  new C_OnFinisher(fin,
+                                                   &osd->objecter_finisher),
+                                  // discover the object version if we don't know it yet
+                                  cop->version ? NULL : &cop->version);
+  fin->tid = tid;
+  cop->objecter_tid = tid;
+}
+
+// Handle completion of one copy-get read for the copy keyed on `oid`.
+// `tid` is the objecter tid of the read that completed and `r` its result.
+// Replies that do not match a tracked copy op, or whose tid is not the one
+// the op is waiting on, are ignored.  If the read succeeded and the cursor
+// is not complete, the chunk received so far is written out through a repop
+// and another read is issued.  Otherwise the copy's callback is completed
+// with the result and the op is removed from copy_ops.
+void ReplicatedPG::process_copy_chunk(hobject_t oid, tid_t tid, int r)
+{
+ dout(10) << __func__ << " tid " << tid << " " << cpp_strerror(r) << dendl;
+ map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
+ if (p == copy_ops.end()) {
+ dout(10) << __func__ << " no copy_op found" << dendl;
+ return;
+ }
+ CopyOpRef cop = p->second;
+ // the op is waiting on a different read than the one that just completed
+ if (tid != cop->objecter_tid) {
+ dout(10) << __func__ << " tid " << tid << " != cop " << cop
+ << " tid " << cop->objecter_tid << dendl;
+ return;
+ }
+ ObjectContextRef obc = cop->obc;
+ // no read is outstanding for this op any more
+ cop->objecter_tid = 0;
+
+ // CopyResults slots as used here and in cancel_copy(): 0 = result code,
+ // 1 = data offset reached, 2 = temp-object flag, 3 = transaction.
+ CopyResults results;
+ if (r >= 0) {
+ assert(cop->rval >= 0);
+
+ if (!cop->cursor.is_complete()) {
+ // write out what we have so far
+ vector<OSDOp> ops;
+ tid_t rep_tid = osd->get_tid();
+ osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
+ OpContext *tctx = new OpContext(OpRequestRef(), reqid, ops, &obc->obs, obc->ssc, this);
+ tctx->mtime = ceph_clock_now(g_ceph_context);
+ RepGather *repop = new_repop(tctx, obc, rep_tid);
+
+ // nothing has been written to the temp object yet: obtain the temp
+ // collection and record the new temp object on the repop's context
+ if (cop->temp_cursor.is_initial()) {
+ cop->temp_coll = get_temp_coll(&tctx->local_t);
+ repop->ctx->new_temp_oid = cop->temp_oid;
+ }
+
+ _write_copy_chunk(cop, &tctx->op_t);
+
+ issue_repop(repop, repop->ctx->mtime);
+ eval_repop(repop);
+ repop->put();
+
+ dout(10) << __func__ << " fetching more" << dendl;
+ _copy_some(obc, cop);
+ return;
+ }
+ // last chunk: build the final transaction into slot 3 of the results
+ _build_finish_copy_transaction(cop, results.get<3>());
+ results.get<1>() = cop->temp_cursor.data_offset;
+ }
+
+ // reached on success with a complete cursor, or on error (r < 0), in which
+ // case only the result code below is filled in
+ dout(20) << __func__ << " complete; committing" << dendl;
+ results.get<0>() = r;
+ cop->cb->complete(results);
+
+ // the copy is finished: stop tracking it and let blocked ops proceed
+ copy_ops.erase(obc->obs.oi.soid);
+ --obc->copyfrom_readside;
+ kick_object_context_blocked(obc);
+}
+
+// Add the chunk currently buffered in `cop` (attrs, data, omap keys) to
+// transaction `t`, targeting cop->temp_coll / cop->temp_oid.  A section is
+// skipped once temp_cursor marks it complete.  Buffers are cleared as they
+// are consumed, and temp_cursor is then advanced to the read cursor.
+void ReplicatedPG::_write_copy_chunk(CopyOpRef cop, ObjectStore::Transaction *t)
+{
+  dout(20) << __func__ << " " << cop
+           << " " << cop->attrs.size() << " attrs"
+           << " " << cop->data.length() << " bytes"
+           << " " << cop->omap.size() << " keys"
+           << dendl;
+
+  // Sample the progress flags up front; temp_cursor is only overwritten at
+  // the very end of this function.
+  const bool attrs_pending = !cop->temp_cursor.attr_complete;
+  const bool data_pending = !cop->temp_cursor.data_complete;
+  const bool omap_pending = !cop->temp_cursor.omap_complete;
+
+  if (attrs_pending) {
+    t->touch(cop->temp_coll, cop->temp_oid);
+    map<string,bufferlist>::iterator attr = cop->attrs.begin();
+    while (attr != cop->attrs.end()) {
+      // attribute names are stored with a leading "_"
+      const string attr_name = string("_") + attr->first;
+      t->setattr(cop->temp_coll, cop->temp_oid, attr_name, attr->second);
+      ++attr;
+    }
+    cop->attrs.clear();
+  }
+
+  if (data_pending) {
+    // data lands at the offset the temp object has reached so far
+    t->write(cop->temp_coll, cop->temp_oid, cop->temp_cursor.data_offset,
+             cop->data.length(), cop->data);
+    cop->data.clear();
+  }
+
+  if (omap_pending) {
+    t->omap_setkeys(cop->temp_coll, cop->temp_oid, cop->omap);
+    cop->omap.clear();
+  }
+
+  cop->temp_cursor = cop->cursor;
+}
+
+// Fill `t` with the operations that complete the copy described by `cop`.
+// An existing destination object is removed first.  If nothing was ever
+// written to a temp object, the final chunk is written straight to the
+// destination; otherwise the final chunk goes to the temp object, which is
+// then moved over the destination and dropped from temp-object tracking.
+void ReplicatedPG::_build_finish_copy_transaction(CopyOpRef cop,
+                                                  ObjectStore::Transaction& t)
+{
+  ObjectState& dest_obs = cop->obc->obs;
+
+  if (dest_obs.exists)
+    t.remove(coll, dest_obs.oi.soid);
+
+  // Must be sampled before _write_copy_chunk(), which advances temp_cursor.
+  const bool nothing_staged = cop->temp_cursor.is_initial();
+
+  if (nothing_staged) {
+    // write directly to final object
+    cop->temp_coll = coll;
+    cop->temp_oid = dest_obs.oi.soid;
+  }
+
+  _write_copy_chunk(cop, &t);
+
+  if (!nothing_staged) {
+    // temp object is now fully written: move it into place
+    t.collection_move_rename(cop->temp_coll, cop->temp_oid, coll, dest_obs.oi.soid);
+    pgbackend->clear_temp_obj(cop->temp_oid);
+  }
+}
+
+// Fold a completed copy into the write op `ctx`: installs the copy's
+// transaction into ctx->op_t, marks the object as existing, and updates the
+// object size and the delta stats.  Always returns 0.
+int ReplicatedPG::finish_copyfrom(OpContext *ctx)
+{
+ dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
+ ObjectState& obs = ctx->new_obs;
+ // copy_cb was installed as a CopyFromCallback when the COPY_FROM op started
+ CopyFromCallback *cb = static_cast<CopyFromCallback*>(ctx->copy_cb);
+
+ // the copy creates the object if it did not exist before
+ if (!ctx->obs->exists) {
+ ctx->delta_stats.num_objects++;
+ obs.exists = true;
+ }
+ if (cb->is_temp_obj_used()) {
+ ctx->discard_temp_oid = cb->temp_obj;
+ }
+ // NOTE(review): swap-then-append appears intended to put the copy's
+ // transaction ahead of whatever op_t already held (after the swap the
+ // result slot holds op_t's previous contents, which are then appended
+ // back) -- confirm against Transaction::swap/append.
+ ctx->op_t.swap(cb->results.get<3>());
+ ctx->op_t.append(cb->results.get<3>());
+
+ // report the whole previous extent [0, old size) as modified
+ interval_set<uint64_t> ch;
+ if (obs.oi.size > 0)
+ ch.insert(0, obs.oi.size);
+ ctx->modified_ranges.union_of(ch);
+
+ // adopt the copied size and adjust the byte count by the difference
+ if (cb->get_data_size() != obs.oi.size) {
+ ctx->delta_stats.num_bytes -= obs.oi.size;
+ obs.oi.size = cb->get_data_size();
+ ctx->delta_stats.num_bytes += obs.oi.size;
+ }
+ // account the copy as one write of the (new) object size
+ ctx->delta_stats.num_wr++;
+ ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(obs.oi.size, 10);
+
+ return 0;
+}
+
+// Abort the copy `cop`: cancel its outstanding objecter read (if any), stop
+// tracking it in copy_ops, release the read-side count on its object
+// context, requeue ops that were blocked on that object, and complete the
+// copy's callback with -ECANCELED.
+void ReplicatedPG::cancel_copy(CopyOpRef cop)
+{
+  ObjectContextRef obc = cop->obc;
+  dout(10) << __func__ << " " << obc->obs.oi.soid
+           << " from " << cop->src << " " << cop->oloc << " v" << cop->version
+           << dendl;
+
+  // cancel objecter op, if we can
+  if (cop->objecter_tid) {
+    Mutex::Locker l(osd->objecter_lock);
+    osd->objecter->op_cancel(cop->objecter_tid);
+  }
+
+  copy_ops.erase(obc->obs.oi.soid);
+  --obc->copyfrom_readside;
+  kick_object_context_blocked(obc);
+
+  // A cursor that has moved past its initial state is reported to the
+  // callback as "temp object created".
+  CopyResults result(-ECANCELED, 0, !cop->cursor.is_initial(),
+                     ObjectStore::Transaction());
+  cop->cb->complete(result);
+}
+
+// Cancel every copy operation currently tracked in copy_ops.
+void ReplicatedPG::cancel_copy_ops()
+{
+  dout(10) << __func__ << dendl;
+  for (map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
+       p != copy_ops.end(); ) {
+    // cancel_copy() erases the op's entry from copy_ops, so take a
+    // reference and step past the entry before calling it.
+    CopyOpRef cop = p->second;
+    ++p;
+    cancel_copy(cop);
+  }
+}
// ========================================================================
@@ -3941,10 +4591,19 @@ void ReplicatedPG::apply_repop(RepGather *repop)
if (repop->ctx->clone_obc)
repop->ctx->clone_obc->ondisk_write_lock();
+ bool unlock_snapset_obc = false;
+ if (repop->ctx->snapset_obc && repop->ctx->snapset_obc->obs.oi.soid !=
+ repop->obc->obs.oi.soid) {
+ repop->ctx->snapset_obc->ondisk_write_lock();
+ unlock_snapset_obc = true;
+ }
+
Context *oncommit = new C_OSD_OpCommit(this, repop);
Context *onapplied = new C_OSD_OpApplied(this, repop);
- Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(repop->obc,
- repop->ctx->clone_obc);
+ Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(
+ repop->obc,
+ repop->ctx->clone_obc,
+ unlock_snapset_obc ? repop->ctx->snapset_obc : ObjectContextRef());
int r = osd->store->queue_transactions(osr.get(), repop->tls, onapplied, oncommit, onapplied_sync, repop->ctx->op);
if (r) {
derr << "apply_repop queue_transactions returned " << r << " on " << *repop << dendl;
@@ -3980,9 +4639,11 @@ void ReplicatedPG::op_applied(RepGather *repop)
repop->waitfor_disk.count(whoami) == 0); // commit before ondisk
repop->waitfor_ack.erase(whoami);
- assert(info.last_update >= repop->v);
- assert(last_update_applied < repop->v);
- last_update_applied = repop->v;
+ if (repop->v != eversion_t()) {
+ assert(info.last_update >= repop->v);
+ assert(last_update_applied < repop->v);
+ last_update_applied = repop->v;
+ }
// chunky scrub
if (scrubber.active && scrubber.is_chunky) {
@@ -4029,9 +4690,10 @@ void ReplicatedPG::op_commit(RepGather *repop)
// is no separate reply sent.
repop->waitfor_ack.erase(whoami);
- last_update_ondisk = repop->v;
-
- last_complete_ondisk = repop->pg_local_last_complete;
+ if (repop->v != eversion_t()) {
+ last_update_ondisk = repop->v;
+ last_complete_ondisk = repop->pg_local_last_complete;
+ }
eval_repop(repop);
}
@@ -4045,7 +4707,7 @@ void ReplicatedPG::eval_repop(RepGather *repop)
{
MOSDOp *m = NULL;
if (repop->ctx->op)
- m = static_cast<MOSDOp *>(repop->ctx->op->request);
+ m = static_cast<MOSDOp *>(repop->ctx->op->get_req());
if (m)
dout(10) << "eval_repop " << *repop
@@ -4072,6 +4734,8 @@ void ReplicatedPG::eval_repop(RepGather *repop)
// ondisk?
if (repop->waitfor_disk.empty()) {
+ release_op_ctx_locks(repop->ctx);
+
log_op_stats(repop->ctx);
publish_stats_to_osd();
@@ -4081,7 +4745,8 @@ void ReplicatedPG::eval_repop(RepGather *repop)
for (list<OpRequestRef>::iterator i = waiting_for_ondisk[repop->v].begin();
i != waiting_for_ondisk[repop->v].end();
++i) {
- osd->reply_op_error(*i, 0, repop->v);
+ osd->reply_op_error(*i, 0, repop->ctx->at_version,
+ repop->ctx->user_at_version);
}
waiting_for_ondisk.erase(repop->v);
}
@@ -4097,8 +4762,11 @@ void ReplicatedPG::eval_repop(RepGather *repop)
MOSDOpReply *reply = repop->ctx->reply;
if (reply)
repop->ctx->reply = NULL;
- else
+ else {
reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0);
+ reply->set_reply_versions(repop->ctx->at_version,
+ repop->ctx->user_at_version);
+ }
reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
dout(10) << " sending commit on " << *repop << " " << reply << dendl;
assert(entity_name_t::TYPE_OSD != m->get_connection()->peer_type);
@@ -4117,8 +4785,10 @@ void ReplicatedPG::eval_repop(RepGather *repop)
for (list<OpRequestRef>::iterator i = waiting_for_ack[repop->v].begin();
i != waiting_for_ack[repop->v].end();
++i) {
- MOSDOp *m = (MOSDOp*)(*i)->request;
+ MOSDOp *m = (MOSDOp*)(*i)->get_req();
MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0);
+ reply->set_reply_versions(repop->ctx->at_version,
+ repop->ctx->user_at_version);
reply->add_flags(CEPH_OSD_FLAG_ACK);
osd->send_message_osd_client(reply, m->get_connection());
}
@@ -4130,8 +4800,11 @@ void ReplicatedPG::eval_repop(RepGather *repop)
MOSDOpReply *reply = repop->ctx->reply;
if (reply)
repop->ctx->reply = NULL;
- else
+ else {
reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0);
+ reply->set_reply_versions(repop->ctx->at_version,
+ repop->ctx->user_at_version);
+ }
reply->add_flags(CEPH_OSD_FLAG_ACK);
dout(10) << " sending ack on " << *repop << " " << reply << dendl;
assert(entity_name_t::TYPE_OSD != m->get_connection()->peer_type);
@@ -4144,7 +4817,7 @@ void ReplicatedPG::eval_repop(RepGather *repop)
// _prior_ to being committed; it will not get set with
// writeahead journaling, for instance.
if (repop->ctx->readable_stamp == utime_t())
- repop->ctx->readable_stamp = ceph_clock_now(g_ceph_context);
+ repop->ctx->readable_stamp = ceph_clock_now(cct);
}
}
@@ -4173,8 +4846,7 @@ void ReplicatedPG::eval_repop(RepGather *repop)
}
}
-void ReplicatedPG::issue_repop(RepGather *repop, utime_t now,
- eversion_t old_last_update, bool old_exists, uint64_t old_size, eversion_t old_version)
+void ReplicatedPG::issue_repop(RepGather *repop, utime_t now)
{
OpContext *ctx = repop->ctx;
const hobject_t& soid = ctx->obs->oi.soid;
@@ -4209,37 +4881,34 @@ void ReplicatedPG::issue_repop(RepGather *repop, utime_t now,
get_osdmap()->get_epoch(),
repop->rep_tid, repop->ctx->at_version);
if (ctx->op &&
- ((static_cast<MOSDOp *>(ctx->op->request))->get_flags() & CEPH_OSD_FLAG_PARALLELEXEC)) {
+ ((static_cast<MOSDOp *>(ctx->op->get_req()))->get_flags() & CEPH_OSD_FLAG_PARALLELEXEC)) {
// replicate original op for parallel execution on replica
assert(0 == "broken implementation, do not use");
- wr->oloc = object_locator_t(repop->ctx->obs->oi.soid);
- wr->ops = repop->ctx->ops;
- wr->mtime = repop->ctx->mtime;
- wr->old_exists = old_exists;
- wr->old_size = old_size;
- wr->old_version = old_version;
- wr->snapset = repop->obc->ssc->snapset;
- wr->snapc = repop->ctx->snapc;
- wr->set_data(repop->ctx->op->request->get_data()); // _copy_ bufferlist
- } else {
- // ship resulting transaction, log entries, and pg_stats
- if (peer == backfill_target && soid >= backfill_pos) {
- dout(10) << "issue_repop shipping empty opt to osd." << peer << ", object beyond backfill_pos "
- << backfill_pos << ", last_backfill is " << pinfo.last_backfill << dendl;
- ObjectStore::Transaction t;
- ::encode(t, wr->get_data());
- } else {
- ::encode(repop->ctx->op_t, wr->get_data());
- }
- ::encode(repop->ctx->log, wr->logbl);
+ }
- if (backfill_target >= 0 && backfill_target == peer)
- wr->pg_stats = pinfo.stats; // reflects backfill progress
- else
- wr->pg_stats = info.stats;
+ // ship resulting transaction, log entries, and pg_stats
+ if (peer == backfill_target && soid >= backfill_pos &&
+ soid.pool == (int64_t)info.pgid.pool()) { // only skip normal (not temp pool=-1) objects
+ dout(10) << "issue_repop shipping empty opt to osd." << peer << ", object beyond backfill_pos "
+ << backfill_pos << ", last_backfill is " << pinfo.last_backfill << dendl;
+ ObjectStore::Transaction t;
+ ::encode(t, wr->get_data());
+ } else {
+ ::encode(repop->ctx->op_t, wr->get_data());
}
+
+ ::encode(repop->ctx->log, wr->logbl);
+
+ if (backfill_target >= 0 && backfill_target == peer)
+ wr->pg_stats = pinfo.stats; // reflects backfill progress
+ else
+ wr->pg_stats = info.stats;
wr->pg_trim_to = pg_trim_to;
+
+ wr->new_temp_oid = repop->ctx->new_temp_oid;
+ wr->discard_temp_oid = repop->ctx->discard_temp_oid;
+
osd->send_message_osd_cluster(peer, wr, get_osdmap()->get_epoch());
// keep peer_info up to date
@@ -4253,13 +4922,13 @@ ReplicatedPG::RepGather *ReplicatedPG::new_repop(OpContext *ctx, ObjectContextRe
tid_t rep_tid)
{
if (ctx->op)
- dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->request << dendl;
+ dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
else
dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
RepGather *repop = new RepGather(ctx, obc, rep_tid, info.last_complete);
- repop->start = ceph_clock_now(g_ceph_context);
+ repop->start = ceph_clock_now(cct);
repop_queue.push_back(&repop->queue_item);
repop_map[repop->rep_tid] = repop;
@@ -4272,6 +4941,7 @@ ReplicatedPG::RepGather *ReplicatedPG::new_repop(OpContext *ctx, ObjectContextRe
void ReplicatedPG::remove_repop(RepGather *repop)
{
+ release_op_ctx_locks(repop->ctx);
repop_map.erase(repop->rep_tid);
repop->put();
@@ -4284,7 +4954,7 @@ void ReplicatedPG::repop_ack(RepGather *repop, int result, int ack_type,
MOSDOp *m = NULL;
if (repop->ctx->op)
- m = static_cast<MOSDOp *>(repop->ctx->op->request);
+ m = static_cast<MOSDOp *>(repop->ctx->op->get_req());
if (m)
dout(7) << "repop_ack rep_tid " << repop->rep_tid << " op " << *m
@@ -4393,7 +5063,8 @@ void ReplicatedPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
void ReplicatedPG::populate_obc_watchers(ObjectContextRef obc)
{
assert(is_active());
- assert(!is_missing_object(obc->obs.oi.soid) ||
+ assert((recovering.count(obc->obs.oi.soid) ||
+ !is_missing_object(obc->obs.oi.soid)) ||
(pg_log.get_log().objects.count(obc->obs.oi.soid) && // or this is a revert... see recover_primary()
pg_log.get_log().objects.find(obc->obs.oi.soid)->second->op ==
pg_log_entry_t::LOST_REVERT &&
@@ -4458,7 +5129,7 @@ void ReplicatedPG::handle_watch_timeout(WatchRef watch)
osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
OpContext *ctx = new OpContext(OpRequestRef(), reqid, ops,
&obc->obs, obc->ssc, this);
- ctx->mtime = ceph_clock_now(g_ceph_context);
+ ctx->mtime = ceph_clock_now(cct);
ctx->at_version.epoch = get_osdmap()->get_epoch();
ctx->at_version.version = pg_log.get_head().version + 1;
@@ -4470,16 +5141,12 @@ void ReplicatedPG::handle_watch_timeout(WatchRef watch)
ObjectStore::Transaction *t = &ctx->op_t;
ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
- ctx->at_version,
- obc->obs.oi.version,
- osd_reqid_t(), ctx->mtime));
-
- eversion_t old_last_update = pg_log.get_head();
- bool old_exists = repop->obc->obs.exists;
- uint64_t old_size = repop->obc->obs.oi.size;
- eversion_t old_version = repop->obc->obs.oi.version;
+ ctx->at_version,
+ obc->obs.oi.version,
+ 0,
+ osd_reqid_t(), ctx->mtime));
- obc->obs.oi.prior_version = old_version;
+ obc->obs.oi.prior_version = repop->obc->obs.oi.version;
obc->obs.oi.version = ctx->at_version;
bufferlist bl;
::encode(obc->obs.oi, bl);
@@ -4488,9 +5155,9 @@ void ReplicatedPG::handle_watch_timeout(WatchRef watch)
append_log(repop->ctx->log, eversion_t(), repop->ctx->local_t);
// obc ref swallowed by repop!
- issue_repop(repop, repop->ctx->mtime, old_last_update, old_exists,
- old_size, old_version);
+ issue_repop(repop, repop->ctx->mtime);
eval_repop(repop);
+ repop->put();
}
ObjectContextRef ReplicatedPG::create_object_context(const object_info_t& oi,
@@ -4510,23 +5177,37 @@ ObjectContextRef ReplicatedPG::create_object_context(const object_info_t& oi,
}
ObjectContextRef ReplicatedPG::get_object_context(const hobject_t& soid,
- bool can_create)
-{
+ bool can_create,
+ map<string, bufferptr> *attrs)
+{
+ assert(
+ attrs || !pg_log.get_missing().is_missing(soid) ||
+ // or this is a revert... see recover_primary()
+ (pg_log.get_log().objects.count(soid) &&
+ pg_log.get_log().objects.find(soid)->second->op ==
+ pg_log_entry_t::LOST_REVERT));
ObjectContextRef obc = object_contexts.lookup(soid);
if (obc) {
dout(10) << "get_object_context " << obc << " " << soid << dendl;
} else {
// check disk
bufferlist bv;
- int r = osd->store->getattr(coll, soid, OI_ATTR, bv);
- if (r < 0) {
- if (!can_create)
- return ObjectContextRef(); // -ENOENT!
-
- // new object.
- object_info_t oi(soid);
- SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, true, soid.get_namespace());
- return create_object_context(oi, ssc);
+ if (attrs) {
+ assert(attrs->count(OI_ATTR));
+ bv.push_back(attrs->find(OI_ATTR)->second);
+ } else {
+ int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
+ if (r < 0) {
+ if (!can_create)
+ return ObjectContextRef(); // -ENOENT!
+
+ // new object.
+ object_info_t oi(soid);
+ SnapSetContext *ssc = get_snapset_context(
+ soid.oid, soid.get_key(), soid.hash, true, soid.get_namespace(),
+ soid.has_snapset() ? attrs : 0);
+ return create_object_context(oi, ssc);
+ }
}
object_info_t oi(bv);
@@ -4538,10 +5219,11 @@ ObjectContextRef ReplicatedPG::get_object_context(const hobject_t& soid,
obc->obs.oi = oi;
obc->obs.exists = true;
- if (can_create) {
- obc->ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, true, soid.get_namespace());
- register_snapset_context(obc->ssc);
- }
+ obc->ssc = get_snapset_context(
+ soid.oid, soid.get_key(), soid.hash,
+ true, soid.get_namespace(),
+ soid.has_snapset() ? attrs : 0);
+ register_snapset_context(obc->ssc);
populate_obc_watchers(obc);
dout(10) << "get_object_context " << obc << " " << soid << " 0 -> 1 read " << obc->obs.oi << dendl;
@@ -4689,9 +5371,6 @@ int ReplicatedPG::find_object_context(const hobject_t& oid,
void ReplicatedPG::object_context_destructor_callback(ObjectContext *obc)
{
- dout(10) << "object_context_destructor_callback " << obc << " "
- << obc->obs.oi.soid << dendl;
-
if (obc->ssc)
put_snapset_context(obc->ssc);
}
@@ -4736,21 +5415,40 @@ void ReplicatedPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t
pgstat->stats.cat_sum[oi.category].add(stat);
}
+// Requeue the ops parked in waiting_for_blocked_object for obc's object,
+// provided the object context is no longer blocked.  Does nothing when no
+// ops are waiting or the object is still blocked.
+void ReplicatedPG::kick_object_context_blocked(ObjectContextRef obc)
+{
+  const hobject_t& soid = obc->obs.oi.soid;
+  map<hobject_t, list<OpRequestRef> >::iterator waiters =
+    waiting_for_blocked_object.find(soid);
+
+  // nobody is waiting on this object
+  if (waiters == waiting_for_blocked_object.end())
+    return;
+
+  if (obc->is_blocked()) {
+    dout(10) << __func__ << " " << soid << " still blocked" << dendl;
+    return;
+  }
+
+  dout(10) << __func__ << " " << soid << " requeuing " << waiters->second.size() << " requests" << dendl;
+  requeue_ops(waiters->second);
+  waiting_for_blocked_object.erase(waiters);
+}
+
SnapSetContext *ReplicatedPG::create_snapset_context(const object_t& oid)
{
Mutex::Locker l(snapset_contexts_lock);
SnapSetContext *ssc = new SnapSetContext(oid);
- dout(10) << "create_snapset_context " << ssc << " " << ssc->oid << dendl;
_register_snapset_context(ssc);
ssc->ref++;
return ssc;
}
-SnapSetContext *ReplicatedPG::get_snapset_context(const object_t& oid,
- const string& key,
- ps_t seed,
- bool can_create,
- const string& nspace)
+SnapSetContext *ReplicatedPG::get_snapset_context(
+ const object_t& oid,
+ const string& key,
+ ps_t seed,
+ bool can_create,
+ const string& nspace,
+ map<string, bufferptr> *attrs)
{
Mutex::Locker l(snapset_contexts_lock);
SnapSetContext *ssc;
@@ -4759,27 +5457,30 @@ SnapSetContext *ReplicatedPG::get_snapset_context(const object_t& oid,
ssc = p->second;
} else {
bufferlist bv;
- hobject_t head(oid, key, CEPH_NOSNAP, seed,
- info.pgid.pool(), nspace);
- int r = osd->store->getattr(coll, head, SS_ATTR, bv);
- if (r < 0) {
- // try _snapset
- hobject_t snapdir(oid, key, CEPH_SNAPDIR, seed,
- info.pgid.pool(), nspace);
- r = osd->store->getattr(coll, snapdir, SS_ATTR, bv);
- if (r < 0 && !can_create)
- return NULL;
+ if (!attrs) {
+ hobject_t head(oid, key, CEPH_NOSNAP, seed,
+ info.pgid.pool(), nspace);
+ int r = pgbackend->objects_get_attr(head, SS_ATTR, &bv);
+ if (r < 0) {
+ // try _snapset
+ hobject_t snapdir(oid, key, CEPH_SNAPDIR, seed,
+ info.pgid.pool(), nspace);
+ r = pgbackend->objects_get_attr(snapdir, SS_ATTR, &bv);
+ if (r < 0 && !can_create)
+ return NULL;
+ }
+ } else {
+ assert(attrs->count(SS_ATTR));
+ bv.push_back(attrs->find(SS_ATTR)->second);
}
ssc = new SnapSetContext(oid);
_register_snapset_context(ssc);
- if (r >= 0) {
+ if (bv.length()) {
bufferlist::iterator bvp = bv.begin();
ssc->snapset.decode(bvp);
}
}
assert(ssc);
- dout(10) << "get_snapset_context " << ssc->oid << " "
- << ssc->ref << " -> " << (ssc->ref+1) << dendl;
ssc->ref++;
return ssc;
}
@@ -4787,8 +5488,6 @@ SnapSetContext *ReplicatedPG::get_snapset_context(const object_t& oid,
void ReplicatedPG::put_snapset_context(SnapSetContext *ssc)
{
Mutex::Locker l(snapset_contexts_lock);
- dout(10) << "put_snapset_context " << ssc->oid << " "
- << ssc->ref << " -> " << (ssc->ref-1) << dendl;
--ssc->ref;
if (ssc->ref == 0) {
if (ssc->registered)
@@ -4801,7 +5500,7 @@ void ReplicatedPG::put_snapset_context(SnapSetContext *ssc)
void ReplicatedPG::sub_op_modify(OpRequestRef op)
{
- MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request);
+ MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req());
assert(m->get_header().type == MSG_OSD_SUBOP);
const hobject_t& soid = m->poid;
@@ -4844,71 +5543,51 @@ void ReplicatedPG::sub_op_modify(OpRequestRef op)
rm->epoch_started = get_osdmap()->get_epoch();
if (!m->noop) {
- if (m->logbl.length()) {
- // shipped transaction and log entries
- vector<pg_log_entry_t> log;
-
- bufferlist::iterator p = m->get_data().begin();
-
- ::decode(rm->opt, p);
- if (!(m->get_connection()->get_features() & CEPH_FEATURE_OSD_SNAPMAPPER))
- rm->opt.set_tolerate_collection_add_enoent();
- p = m->logbl.begin();
- ::decode(log, p);
- if (m->hobject_incorrect_pool) {
- for (vector<pg_log_entry_t>::iterator i = log.begin();
- i != log.end();
- ++i) {
- if (i->soid.pool == -1)
- i->soid.pool = info.pgid.pool();
- }
- rm->opt.set_pool_override(info.pgid.pool());
+ assert(m->logbl.length());
+ // shipped transaction and log entries
+ vector<pg_log_entry_t> log;
+
+ bufferlist::iterator p = m->get_data().begin();
+
+ if (m->new_temp_oid != hobject_t()) {
+ dout(20) << __func__ << " start tracking temp " << m->new_temp_oid << dendl;
+ pgbackend->add_temp_obj(m->new_temp_oid);
+ get_temp_coll(&rm->localt);
+ }
+ if (m->discard_temp_oid != hobject_t()) {
+ dout(20) << __func__ << " stop tracking temp " << m->discard_temp_oid << dendl;
+ pgbackend->clear_temp_obj(m->discard_temp_oid);
+ }
+
+ ::decode(rm->opt, p);
+ if (!(m->get_connection()->get_features() & CEPH_FEATURE_OSD_SNAPMAPPER))
+ rm->opt.set_tolerate_collection_add_enoent();
+ p = m->logbl.begin();
+ ::decode(log, p);
+ if (m->hobject_incorrect_pool) {
+ for (vector<pg_log_entry_t>::iterator i = log.begin();
+ i != log.end();
+ ++i) {
+ if (i->soid.pool == -1)
+ i->soid.pool = info.pgid.pool();
}
- rm->opt.set_replica();
-
- info.stats = m->pg_stats;
- if (!rm->opt.empty()) {
- // If the opt is non-empty, we infer we are before
- // last_backfill (according to the primary, not our
- // not-quite-accurate value), and should update the
- // collections now. Otherwise, we do it later on push.
- update_snap_map(log, rm->localt);
- }
- append_log(log, m->pg_trim_to, rm->localt);
-
- rm->tls.push_back(&rm->localt);
- rm->tls.push_back(&rm->opt);
-
- } else {
- // do op
- assert(0);
-
- // TODO: this is severely broken because we don't know whether this object is really lost or
- // not. We just always assume that it's not right now.
- // Also, we're taking the address of a variable on the stack.
- object_info_t oi(soid);
- oi.lost = false; // I guess?
- oi.version = m->old_version;
- oi.size = m->old_size;
- ObjectState obs(oi, m->old_exists);
- SnapSetContext ssc(m->poid.oid);
-
- rm->ctx = new OpContext(op, m->reqid, m->ops, &obs, &ssc, this);
-
- rm->ctx->mtime = m->mtime;
- rm->ctx->at_version = m->version;
- rm->ctx->snapc = m->snapc;
+ rm->opt.set_pool_override(info.pgid.pool());
+ }
+ rm->opt.set_replica();
- ssc.snapset = m->snapset;
- rm->ctx->obc->ssc = &ssc;
-
- prepare_transaction(rm->ctx);
- append_log(rm->ctx->log, m->pg_trim_to, rm->ctx->local_t);
-
- rm->tls.push_back(&rm->ctx->op_t);
- rm->tls.push_back(&rm->ctx->local_t);
+ info.stats = m->pg_stats;
+ if (!rm->opt.empty()) {
+ // If the opt is non-empty, we infer we are before
+ // last_backfill (according to the primary, not our
+ // not-quite-accurate value), and should update the
+ // collections now. Otherwise, we do it later on push.
+ update_snap_map(log, rm->localt);
}
+ append_log(log, m->pg_trim_to, rm->localt);
+ rm->tls.push_back(&rm->localt);
+ rm->tls.push_back(&rm->opt);
+
rm->bytes_written = rm->opt.get_encoded_bytes();
} else {
@@ -4940,8 +5619,8 @@ void ReplicatedPG::sub_op_modify_applied(RepModify *rm)
rm->applied = true;
if (!pg_has_reset_since(rm->epoch_started)) {
- dout(10) << "sub_op_modify_applied on " << rm << " op " << *rm->op->request << dendl;
- MOSDSubOp *m = static_cast<MOSDSubOp*>(rm->op->request);
+ dout(10) << "sub_op_modify_applied on " << rm << " op " << *rm->op->get_req() << dendl;
+ MOSDSubOp *m = static_cast<MOSDSubOp*>(rm->op->get_req());
assert(m->get_header().type == MSG_OSD_SUBOP);
if (!rm->committed) {
@@ -4951,9 +5630,11 @@ void ReplicatedPG::sub_op_modify_applied(RepModify *rm)
osd->send_message_osd_cluster(rm->ackerosd, ack, get_osdmap()->get_epoch());
}
- assert(info.last_update >= m->version);
- assert(last_update_applied < m->version);
- last_update_applied = m->version;
+ if (m->version != eversion_t()) {
+ assert(info.last_update >= m->version);
+ assert(last_update_applied < m->version);
+ last_update_applied = m->version;
+ }
if (scrubber.active_rep_scrub) {
if (last_update_applied == scrubber.active_rep_scrub->scrub_to) {
osd->rep_scrub_wq.queue(scrubber.active_rep_scrub);
@@ -4961,7 +5642,7 @@ void ReplicatedPG::sub_op_modify_applied(RepModify *rm)
}
}
} else {
- dout(10) << "sub_op_modify_applied on " << rm << " op " << *rm->op->request
+ dout(10) << "sub_op_modify_applied on " << rm << " op " << *rm->op->get_req()
<< " from epoch " << rm->epoch_started << " < last_peering_reset "
<< last_peering_reset << dendl;
}
@@ -4983,24 +5664,24 @@ void ReplicatedPG::sub_op_modify_commit(RepModify *rm)
if (!pg_has_reset_since(rm->epoch_started)) {
// send commit.
- dout(10) << "sub_op_modify_commit on op " << *rm->op->request
+ dout(10) << "sub_op_modify_commit on op " << *rm->op->get_req()
<< ", sending commit to osd." << rm->ackerosd
<< dendl;
if (get_osdmap()->is_up(rm->ackerosd)) {
last_complete_ondisk = rm->last_complete;
- MOSDSubOpReply *commit = new MOSDSubOpReply(static_cast<MOSDSubOp*>(rm->op->request), 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ONDISK);
+ MOSDSubOpReply *commit = new MOSDSubOpReply(static_cast<MOSDSubOp*>(rm->op->get_req()), 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ONDISK);
commit->set_last_complete_ondisk(rm->last_complete);
commit->set_priority(CEPH_MSG_PRIO_HIGH); // this better match ack priority!
osd->send_message_osd_cluster(rm->ackerosd, commit, get_osdmap()->get_epoch());
}
} else {
- dout(10) << "sub_op_modify_commit " << rm << " op " << *rm->op->request
+ dout(10) << "sub_op_modify_commit " << rm << " op " << *rm->op->get_req()
<< " from epoch " << rm->epoch_started << " < last_peering_reset "
<< last_peering_reset << dendl;
}
- log_subop_stats(rm->op, l_osd_sop_w_inb, l_osd_sop_w_lat);
+ log_subop_stats(osd, rm->op, l_osd_sop_w_inb, l_osd_sop_w_lat);
bool done = rm->applied && rm->committed;
unlock();
if (done) {
@@ -5012,7 +5693,7 @@ void ReplicatedPG::sub_op_modify_commit(RepModify *rm)
void ReplicatedPG::sub_op_modify_reply(OpRequestRef op)
{
- MOSDSubOpReply *r = static_cast<MOSDSubOpReply*>(op->request);
+ MOSDSubOpReply *r = static_cast<MOSDSubOpReply*>(op->get_req());
assert(r->get_header().type == MSG_OSD_SUBOPREPLY);
op->mark_started();
@@ -5041,11 +5722,12 @@ void ReplicatedPG::sub_op_modify_reply(OpRequestRef op)
// ===========================================================
-void ReplicatedPG::calc_head_subsets(ObjectContextRef obc, SnapSet& snapset, const hobject_t& head,
- pg_missing_t& missing,
- const hobject_t &last_backfill,
- interval_set<uint64_t>& data_subset,
- map<hobject_t, interval_set<uint64_t> >& clone_subsets)
+void ReplicatedBackend::calc_head_subsets(
+ ObjectContextRef obc, SnapSet& snapset, const hobject_t& head,
+ const pg_missing_t& missing,
+ const hobject_t &last_backfill,
+ interval_set<uint64_t>& data_subset,
+ map<hobject_t, interval_set<uint64_t> >& clone_subsets)
{
dout(10) << "calc_head_subsets " << head
<< " clone_overlap " << snapset.clone_overlap << dendl;
@@ -5054,7 +5736,7 @@ void ReplicatedPG::calc_head_subsets(ObjectContextRef obc, SnapSet& snapset, con
if (size)
data_subset.insert(0, size);
- if (!g_conf->osd_recover_clone_overlap) {
+ if (!cct->_conf->osd_recover_clone_overlap) {
dout(10) << "calc_head_subsets " << head << " -- osd_recover_clone_overlap disabled" << dendl;
return;
}
@@ -5081,7 +5763,7 @@ void ReplicatedPG::calc_head_subsets(ObjectContextRef obc, SnapSet& snapset, con
}
- if (cloning.num_intervals() > g_conf->osd_recover_clone_overlap_limit) {
+ if (cloning.num_intervals() > cct->_conf->osd_recover_clone_overlap_limit) {
dout(10) << "skipping clone, too many holes" << dendl;
clone_subsets.clear();
cloning.clear();
@@ -5095,11 +5777,12 @@ void ReplicatedPG::calc_head_subsets(ObjectContextRef obc, SnapSet& snapset, con
<< " clone_subsets " << clone_subsets << dendl;
}
-void ReplicatedPG::calc_clone_subsets(SnapSet& snapset, const hobject_t& soid,
- const pg_missing_t& missing,
- const hobject_t &last_backfill,
- interval_set<uint64_t>& data_subset,
- map<hobject_t, interval_set<uint64_t> >& clone_subsets)
+void ReplicatedBackend::calc_clone_subsets(
+ SnapSet& snapset, const hobject_t& soid,
+ const pg_missing_t& missing,
+ const hobject_t &last_backfill,
+ interval_set<uint64_t>& data_subset,
+ map<hobject_t, interval_set<uint64_t> >& clone_subsets)
{
dout(10) << "calc_clone_subsets " << soid
<< " clone_overlap " << snapset.clone_overlap << dendl;
@@ -5108,7 +5791,7 @@ void ReplicatedPG::calc_clone_subsets(SnapSet& snapset, const hobject_t& soid,
if (size)
data_subset.insert(0, size);
- if (!g_conf->osd_recover_clone_overlap) {
+ if (!cct->_conf->osd_recover_clone_overlap) {
dout(10) << "calc_clone_subsets " << soid << " -- osd_recover_clone_overlap disabled" << dendl;
return;
}
@@ -5157,7 +5840,7 @@ void ReplicatedPG::calc_clone_subsets(SnapSet& snapset, const hobject_t& soid,
<< " overlap " << next << dendl;
}
- if (cloning.num_intervals() > g_conf->osd_recover_clone_overlap_limit) {
+ if (cloning.num_intervals() > cct->_conf->osd_recover_clone_overlap_limit) {
dout(10) << "skipping clone, too many holes" << dendl;
clone_subsets.clear();
cloning.clear();
@@ -5184,95 +5867,69 @@ void ReplicatedPG::calc_clone_subsets(SnapSet& snapset, const hobject_t& soid,
*/
enum { PULL_NONE, PULL_OTHER, PULL_YES };
-int ReplicatedPG::prepare_pull(
- const hobject_t& soid, eversion_t v,
- int priority,
- map<int, vector<PullOp> > *pulls)
-{
+void ReplicatedBackend::prepare_pull(
+ const hobject_t& soid,
+ ObjectContextRef headctx,
+ RPGHandle *h)
+{
+ assert(get_parent()->get_local_missing().missing.count(soid));
+ eversion_t v = get_parent()->get_local_missing().missing.find(
+ soid)->second.need;
+ const map<hobject_t, set<int> > &missing_loc(
+ get_parent()->get_missing_loc());
+ const map<int, pg_missing_t > &peer_missing(
+ get_parent()->get_peer_missing());
int fromosd = -1;
- map<hobject_t,set<int> >::iterator q = missing_loc.find(soid);
- if (q != missing_loc.end()) {
- // randomize the list of possible sources
- // should we take weights into account?
- vector<int> shuffle(q->second.begin(), q->second.end());
- random_shuffle(shuffle.begin(), shuffle.end());
- for (vector<int>::iterator p = shuffle.begin();
- p != shuffle.end();
- ++p) {
- if (get_osdmap()->is_up(*p)) {
- fromosd = *p;
- break;
- }
- }
- }
- if (fromosd < 0) {
- dout(7) << "pull " << soid
- << " v " << v
- << " but it is unfound" << dendl;
- return PULL_NONE;
- }
+ map<hobject_t,set<int> >::const_iterator q = missing_loc.find(soid);
+ assert(q != missing_loc.end());
+ assert(!q->second.empty());
+
+ // pick a pullee
+ vector<int> shuffle(q->second.begin(), q->second.end());
+ random_shuffle(shuffle.begin(), shuffle.end());
+ vector<int>::iterator p = shuffle.begin();
+ assert(get_osdmap()->is_up(*p));
+ fromosd = *p;
+ assert(fromosd >= 0);
+
+ dout(7) << "pull " << soid
+ << "v " << v
+ << " on osds " << *p
+ << " from osd." << fromosd
+ << dendl;
assert(peer_missing.count(fromosd));
- if (peer_missing[fromosd].is_missing(soid, v)) {
- assert(peer_missing[fromosd].missing[soid].have != v);
+ const pg_missing_t &pmissing = peer_missing.find(fromosd)->second;
+ if (pmissing.is_missing(soid, v)) {
+ assert(pmissing.missing.find(soid)->second.have != v);
dout(10) << "pulling soid " << soid << " from osd " << fromosd
- << " at version " << peer_missing[fromosd].missing[soid].have
+ << " at version " << pmissing.missing.find(soid)->second.have
<< " rather than at version " << v << dendl;
- v = peer_missing[fromosd].missing[soid].have;
- assert(pg_log.get_log().objects.count(soid) &&
- pg_log.get_log().objects.find(soid)->second->op == pg_log_entry_t::LOST_REVERT &&
- pg_log.get_log().objects.find(soid)->second->reverting_to == v);
+ v = pmissing.missing.find(soid)->second.have;
+ assert(get_parent()->get_log().get_log().objects.count(soid) &&
+ (get_parent()->get_log().get_log().objects.find(soid)->second->op ==
+ pg_log_entry_t::LOST_REVERT) &&
+ (get_parent()->get_log().get_log().objects.find(
+ soid)->second->reverting_to ==
+ v));
}
- dout(7) << "pull " << soid
- << " v " << v
- << " on osds " << missing_loc[soid]
- << " from osd." << fromosd
- << dendl;
-
ObjectRecoveryInfo recovery_info;
- // is this a snapped object? if so, consult the snapset.. we may not need the entire object!
- if (soid.snap && soid.snap < CEPH_NOSNAP) {
- // do we have the head and/or snapdir?
- hobject_t head = soid;
- head.snap = CEPH_NOSNAP;
- if (pg_log.get_missing().is_missing(head)) {
- if (pulling.count(head)) {
- dout(10) << " missing but already pulling head " << head << dendl;
- return PULL_NONE;
- } else {
- int r = prepare_pull(
- head, pg_log.get_missing().missing.find(head)->second.need, priority,
- pulls);
- if (r != PULL_NONE)
- return PULL_OTHER;
- return PULL_NONE;
- }
- }
- head.snap = CEPH_SNAPDIR;
- if (pg_log.get_missing().is_missing(head)) {
- if (pulling.count(head)) {
- dout(10) << " missing but already pulling snapdir " << head << dendl;
- return PULL_NONE;
- } else {
- int r = prepare_pull(
- head, pg_log.get_missing().missing.find(head)->second.need, priority,
- pulls);
- if (r != PULL_NONE)
- return PULL_OTHER;
- return PULL_NONE;
- }
- }
-
+ if (soid.is_snap()) {
+ assert(!get_parent()->get_local_missing().is_missing(
+ soid.get_head()) ||
+ !get_parent()->get_local_missing().is_missing(
+ soid.get_snapdir()));
+ assert(headctx);
// check snapset
- SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, false, soid.get_namespace());
+ SnapSetContext *ssc = headctx->ssc;
assert(ssc);
dout(10) << " snapset " << ssc->snapset << dendl;
- calc_clone_subsets(ssc->snapset, soid, pg_log.get_missing(), info.last_backfill,
+ calc_clone_subsets(ssc->snapset, soid, get_parent()->get_local_missing(),
+ get_info().last_backfill,
recovery_info.copy_subset,
recovery_info.clone_subset);
- put_snapset_context(ssc);
// FIXME: this may overestimate if we are pulling multiple clones in parallel...
dout(10) << " pulling " << recovery_info << dendl;
} else {
@@ -5282,8 +5939,8 @@ int ReplicatedPG::prepare_pull(
recovery_info.size = ((uint64_t)-1);
}
- (*pulls)[fromosd].push_back(PullOp());
- PullOp &op = (*pulls)[fromosd].back();
+ h->pulls[fromosd].push_back(PullOp());
+ PullOp &op = h->pulls[fromosd].back();
op.soid = soid;
op.recovery_info = recovery_info;
@@ -5297,11 +5954,78 @@ int ReplicatedPG::prepare_pull(
assert(!pulling.count(soid));
pull_from_peer[fromosd].insert(soid);
PullInfo &pi = pulling[soid];
+ pi.head_ctx = headctx;
pi.recovery_info = op.recovery_info;
pi.recovery_progress = op.recovery_progress;
- pi.priority = priority;
+}
+
+int ReplicatedPG::recover_missing(
+ const hobject_t &soid, eversion_t v,
+ int priority,
+ PGBackend::RecoveryHandle *h)
+{
+ map<hobject_t,set<int> >::iterator q = missing_loc.find(soid);
+ if (q == missing_loc.end()) {
+ dout(7) << "pull " << soid
+ << " v " << v
+ << " but it is unfound" << dendl;
+ return PULL_NONE;
+ }
+
+ // is this a snapped object? if so, consult the snapset.. we may not need the entire object!
+ ObjectContextRef obc;
+ ObjectContextRef head_obc;
+ if (soid.snap && soid.snap < CEPH_NOSNAP) {
+ // do we have the head and/or snapdir?
+ hobject_t head = soid.get_head();
+ if (pg_log.get_missing().is_missing(head)) {
+ if (recovering.count(head)) {
+ dout(10) << " missing but already recovering head " << head << dendl;
+ return PULL_NONE;
+ } else {
+ int r = recover_missing(
+ head, pg_log.get_missing().missing.find(head)->second.need, priority,
+ h);
+ if (r != PULL_NONE)
+ return PULL_OTHER;
+ return PULL_NONE;
+ }
+ }
+ head = soid.get_snapdir();
+ if (pg_log.get_missing().is_missing(head)) {
+ if (recovering.count(head)) {
+ dout(10) << " missing but already recovering snapdir " << head << dendl;
+ return PULL_NONE;
+ } else {
+ int r = recover_missing(
+ head, pg_log.get_missing().missing.find(head)->second.need, priority,
+ h);
+ if (r != PULL_NONE)
+ return PULL_OTHER;
+ return PULL_NONE;
+ }
+ }
+ // we must have one or the other
+ head_obc = get_object_context(
+ soid.get_head(),
+ false,
+ 0);
+ if (!head_obc)
+ head_obc = get_object_context(
+ soid.get_snapdir(),
+ false,
+ 0);
+ assert(head_obc);
+ }
start_recovery_op(soid);
+ assert(!recovering.count(soid));
+ recovering.insert(soid);
+ pgbackend->recover_object(
+ soid,
+ head_obc,
+ obc,
+ h);
return PULL_YES;
}
@@ -5325,15 +6049,14 @@ void ReplicatedPG::send_remove_op(const hobject_t& oid, eversion_t v, int peer)
* intelligently push an object to a replica. make use of existing
* clones/heads and dup data ranges where possible.
*/
-void ReplicatedPG::prep_push_to_replica(
+void ReplicatedBackend::prep_push_to_replica(
ObjectContextRef obc, const hobject_t& soid, int peer,
- int prio,
PushOp *pop)
{
const object_info_t& oi = obc->obs.oi;
uint64_t size = obc->obs.oi.size;
- dout(10) << __func__ << soid << " v" << oi.version
+ dout(10) << __func__ << ": " << soid << " v" << oi.version
<< " size " << size << " to osd." << peer << dendl;
map<hobject_t, interval_set<uint64_t> > clone_subsets;
@@ -5346,41 +6069,48 @@ void ReplicatedPG::prep_push_to_replica(
// try to base push off of clones that succeed/preceed poid
// we need the head (and current SnapSet) locally to do that.
- if (pg_log.get_missing().is_missing(head)) {
+ if (get_parent()->get_local_missing().is_missing(head)) {
dout(15) << "push_to_replica missing head " << head << ", pushing raw clone" << dendl;
- return prep_push(prio, obc, soid, peer, pop);
+ return prep_push(obc, soid, peer, pop);
}
hobject_t snapdir = head;
snapdir.snap = CEPH_SNAPDIR;
- if (pg_log.get_missing().is_missing(snapdir)) {
- dout(15) << "push_to_replica missing snapdir " << snapdir << ", pushing raw clone" << dendl;
- return prep_push(prio, obc, soid, peer, pop);
+ if (get_parent()->get_local_missing().is_missing(snapdir)) {
+ dout(15) << "push_to_replica missing snapdir " << snapdir
+ << ", pushing raw clone" << dendl;
+ return prep_push(obc, soid, peer, pop);
}
- SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, false, soid.get_namespace());
+ SnapSetContext *ssc = obc->ssc;
assert(ssc);
dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl;
- calc_clone_subsets(ssc->snapset, soid, peer_missing[peer],
- peer_info[peer].last_backfill,
+ map<int, pg_missing_t>::const_iterator pm =
+ get_parent()->get_peer_missing().find(peer);
+ assert(pm != get_parent()->get_peer_missing().end());
+ map<int, pg_info_t>::const_iterator pi =
+ get_parent()->get_peer_info().find(peer);
+ assert(pi != get_parent()->get_peer_info().end());
+ calc_clone_subsets(ssc->snapset, soid,
+ pm->second,
+ pi->second.last_backfill,
data_subset, clone_subsets);
- put_snapset_context(ssc);
} else if (soid.snap == CEPH_NOSNAP) {
// pushing head or unversioned object.
// base this on partially on replica's clones?
- SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, false, soid.get_namespace());
+ SnapSetContext *ssc = obc->ssc;
assert(ssc);
dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl;
- calc_head_subsets(obc, ssc->snapset, soid, peer_missing[peer],
- peer_info[peer].last_backfill,
- data_subset, clone_subsets);
- put_snapset_context(ssc);
+ calc_head_subsets(
+ obc,
+ ssc->snapset, soid, get_parent()->get_peer_missing().find(peer)->second,
+ get_parent()->get_peer_info().find(peer)->second.last_backfill,
+ data_subset, clone_subsets);
}
- prep_push(prio, obc, soid, peer, oi.version, data_subset, clone_subsets, pop);
+ prep_push(obc, soid, peer, oi.version, data_subset, clone_subsets, pop);
}
-void ReplicatedPG::prep_push(int prio,
- ObjectContextRef obc,
+void ReplicatedBackend::prep_push(ObjectContextRef obc,
const hobject_t& soid, int peer,
PushOp *pop)
{
@@ -5389,13 +6119,12 @@ void ReplicatedPG::prep_push(int prio,
data_subset.insert(0, obc->obs.oi.size);
map<hobject_t, interval_set<uint64_t> > clone_subsets;
- prep_push(prio, obc, soid, peer,
+ prep_push(obc, soid, peer,
obc->obs.oi.version, data_subset, clone_subsets,
pop);
}
-void ReplicatedPG::prep_push(
- int prio,
+void ReplicatedBackend::prep_push(
ObjectContextRef obc,
const hobject_t& soid, int peer,
eversion_t version,
@@ -5403,9 +6132,10 @@ void ReplicatedPG::prep_push(
map<hobject_t, interval_set<uint64_t> >& clone_subsets,
PushOp *pop)
{
- peer_missing[peer].revise_have(soid, eversion_t());
+ get_parent()->begin_peer_recover(peer, soid);
// take note.
PushInfo &pi = pushing[soid][peer];
+ pi.obc = obc;
pi.recovery_info.size = obc->obs.oi.size;
pi.recovery_info.copy_subset = data_subset;
pi.recovery_info.clone_subset = clone_subsets;
@@ -5416,19 +6146,20 @@ void ReplicatedPG::prep_push(
pi.recovery_progress.data_recovered_to = 0;
pi.recovery_progress.data_complete = 0;
pi.recovery_progress.omap_complete = 0;
- pi.priority = prio;
ObjectRecoveryProgress new_progress;
- build_push_op(pi.recovery_info,
- pi.recovery_progress,
- &new_progress,
- pop);
+ int r = build_push_op(pi.recovery_info,
+ pi.recovery_progress,
+ &new_progress,
+ pop,
+ &(pi.stat));
+ assert(r == 0);
pi.recovery_progress = new_progress;
}
-int ReplicatedPG::send_pull_legacy(int prio, int peer,
- const ObjectRecoveryInfo &recovery_info,
- ObjectRecoveryProgress progress)
+int ReplicatedBackend::send_pull_legacy(int prio, int peer,
+ const ObjectRecoveryInfo &recovery_info,
+ ObjectRecoveryProgress progress)
{
// send op
tid_t tid = osd->get_tid();
@@ -5441,14 +6172,14 @@ int ReplicatedPG::send_pull_legacy(int prio, int peer,
<< " from osd." << peer
<< " tid " << tid << dendl;
- MOSDSubOp *subop = new MOSDSubOp(rid, info.pgid, recovery_info.soid,
+ MOSDSubOp *subop = new MOSDSubOp(rid, get_info().pgid, recovery_info.soid,
false, CEPH_OSD_FLAG_ACK,
get_osdmap()->get_epoch(), tid,
recovery_info.version);
subop->set_priority(prio);
subop->ops = vector<OSDOp>(1);
subop->ops[0].op.op = CEPH_OSD_OP_PULL;
- subop->ops[0].op.extent.length = g_conf->osd_recovery_max_chunk;
+ subop->ops[0].op.extent.length = cct->_conf->osd_recovery_max_chunk;
subop->recovery_info = recovery_info;
subop->recovery_progress = progress;
@@ -5458,7 +6189,7 @@ int ReplicatedPG::send_pull_legacy(int prio, int peer,
return 0;
}
-void ReplicatedPG::submit_push_data(
+void ReplicatedBackend::submit_push_data(
ObjectRecoveryInfo &recovery_info,
bool first,
bool complete,
@@ -5480,9 +6211,7 @@ void ReplicatedPG::submit_push_data(
}
if (first) {
- pg_log.revise_have(recovery_info.soid, eversion_t());
- remove_snap_mapped_object(*t, recovery_info.soid);
- t->remove(coll, recovery_info.soid);
+ get_parent()->on_local_recover_start(recovery_info.soid, t);
t->remove(get_temp_coll(t), recovery_info.soid);
t->touch(target_coll, recovery_info.soid);
t->omap_setheader(target_coll, recovery_info.soid, omap_header);
@@ -5516,8 +6245,8 @@ void ReplicatedPG::submit_push_data(
}
}
-void ReplicatedPG::submit_push_complete(ObjectRecoveryInfo &recovery_info,
- ObjectStore::Transaction *t)
+void ReplicatedBackend::submit_push_complete(ObjectRecoveryInfo &recovery_info,
+ ObjectStore::Transaction *t)
{
for (map<hobject_t, interval_set<uint64_t> >::const_iterator p =
recovery_info.clone_subset.begin();
@@ -5532,67 +6261,29 @@ void ReplicatedPG::submit_push_complete(ObjectRecoveryInfo &recovery_info,
q.get_start(), q.get_len(), q.get_start());
}
}
-
- if (recovery_info.soid.snap < CEPH_NOSNAP) {
- assert(recovery_info.oi.snaps.size());
- OSDriver::OSTransaction _t(osdriver.get_transaction(t));
- set<snapid_t> snaps(
- recovery_info.oi.snaps.begin(),
- recovery_info.oi.snaps.end());
- snap_mapper.add_oid(
- recovery_info.soid,
- snaps,
- &_t);
- }
-
- if (pg_log.get_missing().is_missing(recovery_info.soid) &&
- pg_log.get_missing().missing.find(recovery_info.soid)->second.need > recovery_info.version) {
- assert(is_primary());
- const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second;
- if (latest->op == pg_log_entry_t::LOST_REVERT &&
- latest->reverting_to == recovery_info.version) {
- dout(10) << " got old revert version " << recovery_info.version
- << " for " << *latest << dendl;
- recovery_info.version = latest->version;
- // update the attr to the revert event version
- recovery_info.oi.prior_version = recovery_info.oi.version;
- recovery_info.oi.version = latest->version;
- bufferlist bl;
- ::encode(recovery_info.oi, bl);
- t->setattr(coll, recovery_info.soid, OI_ATTR, bl);
- }
- }
- recover_got(recovery_info.soid, recovery_info.version);
-
- // update pg
- dirty_info = true;
- write_if_dirty(*t);
}
-ObjectRecoveryInfo ReplicatedPG::recalc_subsets(const ObjectRecoveryInfo& recovery_info)
+ObjectRecoveryInfo ReplicatedBackend::recalc_subsets(
+ const ObjectRecoveryInfo& recovery_info,
+ SnapSetContext *ssc)
{
if (!recovery_info.soid.snap || recovery_info.soid.snap >= CEPH_NOSNAP)
return recovery_info;
-
- SnapSetContext *ssc = get_snapset_context(recovery_info.soid.oid,
- recovery_info.soid.get_key(),
- recovery_info.soid.hash,
- false,
- recovery_info.soid.get_namespace());
- assert(ssc);
ObjectRecoveryInfo new_info = recovery_info;
new_info.copy_subset.clear();
new_info.clone_subset.clear();
assert(ssc);
- calc_clone_subsets(ssc->snapset, new_info.soid, pg_log.get_missing(), info.last_backfill,
+ calc_clone_subsets(ssc->snapset, new_info.soid, get_parent()->get_local_missing(),
+ get_info().last_backfill,
new_info.copy_subset, new_info.clone_subset);
- put_snapset_context(ssc);
return new_info;
}
-bool ReplicatedPG::handle_pull_response(
+bool ReplicatedBackend::handle_pull_response(
int from, PushOp &pop, PullOp *response,
- ObjectStore::Transaction *t)
+ list<ObjectContextRef> *to_continue,
+ ObjectStore::Transaction *t
+ )
{
interval_set<uint64_t> data_included = pop.data_included;
bufferlist data;
@@ -5624,7 +6315,13 @@ bool ReplicatedPG::handle_pull_response(
pop.recovery_info.copy_subset);
}
- pi.recovery_info = recalc_subsets(pi.recovery_info);
+ bool first = pi.recovery_progress.first;
+ if (first) {
+ pi.obc = get_parent()->get_obc(pi.recovery_info.soid, pop.attrset);
+ pi.recovery_info.oi = pi.obc->obs.oi;
+ pi.recovery_info = recalc_subsets(pi.recovery_info, pi.obc->ssc);
+ }
+
interval_set<uint64_t> usable_intervals;
bufferlist usable_data;
@@ -5636,33 +6333,15 @@ bool ReplicatedPG::handle_pull_response(
data_included = usable_intervals;
data.claim(usable_data);
- info.stats.stats.sum.num_bytes_recovered += data.length();
- bool first = pi.recovery_progress.first;
pi.recovery_progress = pop.after_progress;
+ pi.stat.num_bytes_recovered += data.length();
+
dout(10) << "new recovery_info " << pi.recovery_info
<< ", new progress " << pi.recovery_progress
<< dendl;
- if (first) {
- bufferlist oibl;
- if (pop.attrset.count(OI_ATTR)) {
- oibl.push_back(pop.attrset[OI_ATTR]);
- ::decode(pi.recovery_info.oi, oibl);
- } else {
- assert(0);
- }
- bufferlist ssbl;
- if (pop.attrset.count(SS_ATTR)) {
- ssbl.push_back(pop.attrset[SS_ATTR]);
- ::decode(pi.recovery_info.ss, ssbl);
- } else {
- assert(pi.recovery_info.soid.snap != CEPH_NOSNAP &&
- pi.recovery_info.soid.snap != CEPH_SNAPDIR);
- }
- }
-
bool complete = pi.is_complete();
submit_push_data(pi.recovery_info, first,
@@ -5673,53 +6352,17 @@ bool ReplicatedPG::handle_pull_response(
pop.omap_entries,
t);
- info.stats.stats.sum.num_keys_recovered += pop.omap_entries.size();
-
- if (complete) {
- info.stats.stats.sum.num_objects_recovered++;
-
- SnapSetContext *ssc;
- if (hoid.snap == CEPH_NOSNAP || hoid.snap == CEPH_SNAPDIR) {
- ssc = create_snapset_context(hoid.oid);
- ssc->snapset = pi.recovery_info.ss;
- } else {
- ssc = get_snapset_context(hoid.oid, hoid.get_key(), hoid.hash, false,
- hoid.get_namespace());
- assert(ssc);
- }
- ObjectContextRef obc = create_object_context(pi.recovery_info.oi, ssc);
- obc->obs.exists = true;
-
- obc->ondisk_write_lock();
-
- // keep track of active pushes for scrub
- ++active_pushes;
-
- t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
- t->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc));
- t->register_on_complete(
- new C_OSD_CompletedPull(this, hoid, get_osdmap()->get_epoch()));
- }
-
- t->register_on_commit(
- new C_OSD_CommittedPushedObject(
- this,
- get_osdmap()->get_epoch(),
- info.last_complete));
+ pi.stat.num_keys_recovered += pop.omap_entries.size();
if (complete) {
+ to_continue->push_back(pi.obc);
+ pi.stat.num_objects_recovered++;
+ get_parent()->on_local_recover(
+ hoid, pi.stat, pi.recovery_info, pi.obc, t);
pulling.erase(hoid);
pull_from_peer[from].erase(hoid);
- publish_stats_to_osd();
- if (waiting_for_missing_object.count(hoid)) {
- dout(20) << " kicking waiters on " << hoid << dendl;
- requeue_ops(waiting_for_missing_object[hoid]);
- waiting_for_missing_object.erase(hoid);
- if (pg_log.get_missing().missing.size() == 0) {
- requeue_ops(waiting_for_all_missing);
- waiting_for_all_missing.clear();
- }
- }
+ if (pull_from_peer[from].empty())
+ pull_from_peer.erase(from);
return false;
} else {
response->soid = pop.soid;
@@ -5735,11 +6378,11 @@ struct C_OnPushCommit : public Context {
C_OnPushCommit(ReplicatedPG *pg, OpRequestRef op) : pg(pg), op(op) {}
void finish(int) {
op->mark_event("committed");
- pg->log_subop_stats(op, l_osd_push_inb, l_osd_sop_push_lat);
+ log_subop_stats(pg->osd, op, l_osd_push_inb, l_osd_sop_push_lat);
}
};
-void ReplicatedPG::handle_push(
+void ReplicatedBackend::handle_push(
int from, PushOp &pop, PushReplyOp *response,
ObjectStore::Transaction *t)
{
@@ -5753,12 +6396,7 @@ void ReplicatedPG::handle_push(
bool complete = pop.after_progress.data_complete &&
pop.after_progress.omap_complete;
- // keep track of active pushes for scrub
- ++active_pushes;
-
response->soid = pop.recovery_info.soid;
- t->register_on_applied(
- new C_OSD_AppliedRecoveredObjectReplica(this));
submit_push_data(pop.recovery_info,
first,
complete,
@@ -5769,14 +6407,16 @@ void ReplicatedPG::handle_push(
pop.omap_entries,
t);
- t->register_on_commit(
- new C_OSD_CommittedPushedObject(
- this,
- get_osdmap()->get_epoch(),
- info.last_complete));
+ if (complete)
+ get_parent()->on_local_recover(
+ pop.recovery_info.soid,
+ object_stat_sum_t(),
+ pop.recovery_info,
+ ObjectContextRef(), // ok, is replica
+ t);
}
-void ReplicatedPG::send_pushes(int prio, map<int, vector<PushOp> > &pushes)
+void ReplicatedBackend::send_pushes(int prio, map<int, vector<PushOp> > &pushes)
{
for (map<int, vector<PushOp> >::iterator i = pushes.begin();
i != pushes.end();
@@ -5800,28 +6440,28 @@ void ReplicatedPG::send_pushes(int prio, map<int, vector<PushOp> > &pushes)
uint64_t cost = 0;
uint64_t pushes = 0;
MOSDPGPush *msg = new MOSDPGPush();
- msg->pgid = info.pgid;
+ msg->pgid = get_info().pgid;
msg->map_epoch = get_osdmap()->get_epoch();
msg->set_priority(prio);
for (;
(j != i->second.end() &&
- cost < g_conf->osd_max_push_cost &&
- pushes < g_conf->osd_max_push_objects) ;
+ cost < cct->_conf->osd_max_push_cost &&
+ pushes < cct->_conf->osd_max_push_objects) ;
++j) {
dout(20) << __func__ << ": sending push " << *j
<< " to osd." << i->first << dendl;
- cost += j->cost(g_ceph_context);
+ cost += j->cost(cct);
pushes += 1;
msg->pushes.push_back(*j);
}
- msg->compute_cost(g_ceph_context);
+ msg->compute_cost(cct);
osd->send_message_osd_cluster(msg, con);
}
}
}
}
-void ReplicatedPG::send_pulls(int prio, map<int, vector<PullOp> > &pulls)
+void ReplicatedBackend::send_pulls(int prio, map<int, vector<PullOp> > &pulls)
{
for (map<int, vector<PullOp> >::iterator i = pulls.begin();
i != pulls.end();
@@ -5848,31 +6488,20 @@ void ReplicatedPG::send_pulls(int prio, map<int, vector<PullOp> > &pulls)
<< " to osd." << i->first << dendl;
MOSDPGPull *msg = new MOSDPGPull();
msg->set_priority(prio);
- msg->pgid = info.pgid;
+ msg->pgid = get_info().pgid;
msg->map_epoch = get_osdmap()->get_epoch();
msg->pulls.swap(i->second);
- msg->compute_cost(g_ceph_context);
+ msg->compute_cost(cct);
osd->send_message_osd_cluster(msg, con);
}
}
}
-int ReplicatedPG::send_push(int prio, int peer,
- const ObjectRecoveryInfo &recovery_info,
- const ObjectRecoveryProgress &progress,
- ObjectRecoveryProgress *out_progress)
-{
- PushOp op;
- int r = build_push_op(recovery_info, progress, out_progress, &op);
- if (r < 0)
- return r;
- return send_push_op_legacy(prio, peer, op);
-}
-
-int ReplicatedPG::build_push_op(const ObjectRecoveryInfo &recovery_info,
- const ObjectRecoveryProgress &progress,
- ObjectRecoveryProgress *out_progress,
- PushOp *out_op)
+int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info,
+ const ObjectRecoveryProgress &progress,
+ ObjectRecoveryProgress *out_progress,
+ PushOp *out_op,
+ object_stat_sum_t *stat)
{
ObjectRecoveryProgress _new_progress;
if (!out_progress)
@@ -5896,7 +6525,7 @@ int ReplicatedPG::build_push_op(const ObjectRecoveryInfo &recovery_info,
object_info_t oi(bv);
if (oi.version != recovery_info.version) {
- osd->clog.error() << info.pgid << " push "
+ osd->clog.error() << get_info().pgid << " push "
<< recovery_info.soid << " v "
<< " failed because local copy is "
<< oi.version << "\n";
@@ -5906,7 +6535,7 @@ int ReplicatedPG::build_push_op(const ObjectRecoveryInfo &recovery_info,
new_progress.first = false;
}
- uint64_t available = g_conf->osd_recovery_max_chunk;
+ uint64_t available = cct->_conf->osd_recovery_max_chunk;
if (!progress.omap_complete) {
ObjectMap::ObjectMapIterator iter =
osd->store->get_omap_iterator(coll,
@@ -5959,11 +6588,14 @@ int ReplicatedPG::build_push_op(const ObjectRecoveryInfo &recovery_info,
if (new_progress.is_complete(recovery_info)) {
new_progress.data_complete = true;
- info.stats.stats.sum.num_objects_recovered++;
+ if (stat)
+ stat->num_objects_recovered++;
}
- info.stats.stats.sum.num_keys_recovered += out_op->omap_entries.size();
- info.stats.stats.sum.num_bytes_recovered += out_op->data.length();
+ if (stat) {
+ stat->num_keys_recovered += out_op->omap_entries.size();
+ stat->num_bytes_recovered += out_op->data.length();
+ }
osd->logger->inc(l_osd_push);
osd->logger->inc(l_osd_push_outb, out_op->data.length());
@@ -5977,11 +6609,11 @@ int ReplicatedPG::build_push_op(const ObjectRecoveryInfo &recovery_info,
return 0;
}
-int ReplicatedPG::send_push_op_legacy(int prio, int peer, PushOp &pop)
+int ReplicatedBackend::send_push_op_legacy(int prio, int peer, PushOp &pop)
{
tid_t tid = osd->get_tid();
osd_reqid_t rid(osd->get_cluster_msgr_name(), 0, tid);
- MOSDSubOp *subop = new MOSDSubOp(rid, info.pgid, pop.soid,
+ MOSDSubOp *subop = new MOSDSubOp(rid, get_info().pgid, pop.soid,
false, 0, get_osdmap()->get_epoch(),
tid, pop.recovery_info.version);
subop->ops = vector<OSDOp>(1);
@@ -6002,16 +6634,16 @@ int ReplicatedPG::send_push_op_legacy(int prio, int peer, PushOp &pop)
return 0;
}
-void ReplicatedPG::prep_push_op_blank(const hobject_t& soid, PushOp *op)
+void ReplicatedBackend::prep_push_op_blank(const hobject_t& soid, PushOp *op)
{
op->recovery_info.version = eversion_t();
op->version = eversion_t();
op->soid = soid;
}
-void ReplicatedPG::sub_op_push_reply(OpRequestRef op)
+void ReplicatedBackend::sub_op_push_reply(OpRequestRef op)
{
- MOSDSubOpReply *reply = static_cast<MOSDSubOpReply*>(op->request);
+ MOSDSubOpReply *reply = static_cast<MOSDSubOpReply*>(op->get_req());
const hobject_t& soid = reply->get_poid();
assert(reply->get_header().type == MSG_OSD_SUBOPREPLY);
dout(10) << "sub_op_push_reply from " << reply->get_source() << " " << *reply << dendl;
@@ -6024,10 +6656,10 @@ void ReplicatedPG::sub_op_push_reply(OpRequestRef op)
PushOp pop;
bool more = handle_push_reply(peer, rop, &pop);
if (more)
- send_push_op_legacy(pushing[soid][peer].priority, peer, pop);
+ send_push_op_legacy(op->get_req()->get_priority(), peer, pop);
}
-bool ReplicatedPG::handle_push_reply(int peer, PushReplyOp &op, PushOp *reply)
+bool ReplicatedBackend::handle_push_reply(int peer, PushReplyOp &op, PushOp *reply)
{
const hobject_t &soid = op.soid;
if (pushing.count(soid) == 0) {
@@ -6047,32 +6679,25 @@ bool ReplicatedPG::handle_push_reply(int peer, PushReplyOp &op, PushOp *reply)
<< pi->recovery_progress.data_recovered_to
<< " of " << pi->recovery_info.copy_subset << dendl;
ObjectRecoveryProgress new_progress;
- build_push_op(
+ int r = build_push_op(
pi->recovery_info,
- pi->recovery_progress, &new_progress, reply);
+ pi->recovery_progress, &new_progress, reply,
+ &(pi->stat));
+ assert(r == 0);
pi->recovery_progress = new_progress;
return true;
} else {
// done!
- if (peer == backfill_target && backfills_in_flight.count(soid))
- backfills_in_flight.erase(soid);
- else
- peer_missing[peer].got(soid, pi->recovery_info.version);
+ get_parent()->on_peer_recover(
+ peer, soid, pi->recovery_info,
+ pi->stat);
pushing[soid].erase(peer);
pi = NULL;
- publish_stats_to_osd();
if (pushing[soid].empty()) {
- pushing.erase(soid);
- dout(10) << "pushed " << soid << " to all replicas" << dendl;
- finish_recovery_op(soid);
- if (waiting_for_degraded_object.count(soid)) {
- requeue_ops(waiting_for_degraded_object[soid]);
- waiting_for_degraded_object.erase(soid);
- }
- finish_degraded_object(soid);
+ get_parent()->on_global_recover(soid);
} else {
dout(10) << "pushed " << soid << ", still waiting for push ack from "
<< pushing[soid].size() << " others" << dendl;
@@ -6110,9 +6735,9 @@ void ReplicatedPG::finish_degraded_object(const hobject_t& oid)
* process request to pull an entire object.
* NOTE: called from opqueue.
*/
-void ReplicatedPG::sub_op_pull(OpRequestRef op)
+void ReplicatedBackend::sub_op_pull(OpRequestRef op)
{
- MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request);
+ MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req());
assert(m->get_header().type == MSG_OSD_SUBOP);
op->mark_started();
@@ -6137,16 +6762,17 @@ void ReplicatedPG::sub_op_pull(OpRequestRef op)
m->get_source().num(),
reply);
- log_subop_stats(op, 0, l_osd_sop_pull_lat);
+ log_subop_stats(osd, op, 0, l_osd_sop_pull_lat);
}
-void ReplicatedPG::handle_pull(int peer, PullOp &op, PushOp *reply)
+void ReplicatedBackend::handle_pull(int peer, PullOp &op, PushOp *reply)
{
const hobject_t &soid = op.soid;
struct stat st;
int r = osd->store->stat(coll, soid, &st);
if (r != 0) {
- osd->clog.error() << info.pgid << " " << peer << " tried to pull " << soid
+ osd->clog.error() << get_info().pgid << " "
+ << peer << " tried to pull " << soid
<< " but got " << cpp_strerror(-r) << "\n";
prep_push_op_blank(soid, reply);
} else {
@@ -6263,7 +6889,7 @@ void ReplicatedPG::recover_got(hobject_t oid, eversion_t v)
* @param intervals_usable intervals we want to keep
* @param data_usable matching data we want to keep
*/
-void ReplicatedPG::trim_pushed_data(
+void ReplicatedBackend::trim_pushed_data(
const interval_set<uint64_t> &copy_subset,
const interval_set<uint64_t> &intervals_received,
bufferlist data_received,
@@ -6301,10 +6927,10 @@ void ReplicatedPG::trim_pushed_data(
/** op_push
* NOTE: called from opqueue.
*/
-void ReplicatedPG::sub_op_push(OpRequestRef op)
+void ReplicatedBackend::sub_op_push(OpRequestRef op)
{
op->mark_started();
- MOSDSubOp *m = static_cast<MOSDSubOp *>(op->request);
+ MOSDSubOp *m = static_cast<MOSDSubOp *>(op->get_req());
PushOp pop;
pop.soid = m->recovery_info.soid;
@@ -6321,14 +6947,29 @@ void ReplicatedPG::sub_op_push(OpRequestRef op)
if (is_primary()) {
PullOp resp;
- bool more = handle_pull_response(m->get_source().num(), pop, &resp, t);
+ RPGHandle *h = _open_recovery_op();
+ list<ObjectContextRef> to_continue;
+ bool more = handle_pull_response(
+ m->get_source().num(), pop, &resp,
+ &to_continue, t);
if (more) {
send_pull_legacy(
m->get_priority(),
m->get_source().num(),
resp.recovery_info,
resp.recovery_progress);
- }
+ } else {
+ C_ReplicatedBackend_OnPullComplete *c =
+ new C_ReplicatedBackend_OnPullComplete(
+ this,
+ op->get_req()->get_priority());
+ c->to_continue.swap(to_continue);
+ t->register_on_complete(
+ new C_QueueInWQ(
+ &osd->push_wq,
+ get_parent()->bless_gencontext(c)));
+ }
+ run_recovery_op(h, op->get_req()->get_priority());
} else {
PushReplyOp resp;
MOSDSubOpReply *reply = new MOSDSubOpReply(
@@ -6337,15 +6978,16 @@ void ReplicatedPG::sub_op_push(OpRequestRef op)
assert(entity_name_t::TYPE_OSD == m->get_connection()->peer_type);
handle_push(m->get_source().num(), pop, &resp, t);
t->register_on_complete(new C_OSD_SendMessageOnConn(
- osd, reply, m->get_connection()));
+ osd, reply, m->get_connection()));
}
- t->register_on_commit(new C_OnPushCommit(this, op));
- osd->store->queue_transaction(osr.get(), t);
+ get_parent()->queue_transaction(t);
return;
}
-void ReplicatedPG::_failed_push(int from, const hobject_t &soid)
+void ReplicatedPG::failed_push(int from, const hobject_t &soid)
{
+ assert(recovering.count(soid));
+ recovering.erase(soid);
map<hobject_t,set<int> >::iterator p = missing_loc.find(soid);
if (p != missing_loc.end()) {
dout(0) << "_failed_push " << soid << " from osd." << from
@@ -6358,15 +7000,21 @@ void ReplicatedPG::_failed_push(int from, const hobject_t &soid)
dout(0) << "_failed_push " << soid << " from osd." << from
<< " but not in missing_loc ???" << dendl;
}
-
finish_recovery_op(soid); // close out this attempt,
+}
+
+void ReplicatedBackend::_failed_push(int from, const hobject_t &soid)
+{
+ get_parent()->failed_push(from, soid);
pull_from_peer[from].erase(soid);
+ if (pull_from_peer[from].empty())
+ pull_from_peer.erase(from);
pulling.erase(soid);
}
void ReplicatedPG::sub_op_remove(OpRequestRef op)
{
- MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request);
+ MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req());
assert(m->get_header().type == MSG_OSD_SUBOP);
dout(7) << "sub_op_remove " << m->poid << dendl;
@@ -6420,14 +7068,14 @@ ObjectContextRef ReplicatedPG::mark_object_lost(ObjectStore::Transaction *t,
// Add log entry
++info.last_update.version;
- pg_log_entry_t e(what, oid, info.last_update, version, osd_reqid_t(), mtime);
+ pg_log_entry_t e(what, oid, info.last_update, version, 0, osd_reqid_t(), mtime);
pg_log.add(e);
ObjectContextRef obc = get_object_context(oid, true);
obc->ondisk_write_lock();
- obc->obs.oi.lost = true;
+ obc->obs.oi.set_flag(object_info_t::FLAG_LOST);
obc->obs.oi.version = info.last_update;
obc->obs.oi.prior_version = version;
@@ -6460,7 +7108,7 @@ void ReplicatedPG::mark_all_unfound_lost(int what)
ObjectStore::Transaction *t = new ObjectStore::Transaction;
C_PG_MarkUnfoundLost *c = new C_PG_MarkUnfoundLost(this);
- utime_t mtime = ceph_clock_now(g_ceph_context);
+ utime_t mtime = ceph_clock_now(cct);
info.last_update.epoch = get_osdmap()->get_epoch();
const pg_missing_t &missing = pg_log.get_missing();
map<hobject_t, pg_missing_t::item>::const_iterator m = missing.missing.begin();
@@ -6491,7 +7139,7 @@ void ReplicatedPG::mark_all_unfound_lost(int what)
++info.last_update.version;
pg_log_entry_t e(
pg_log_entry_t::LOST_REVERT, oid, info.last_update,
- m->second.need, osd_reqid_t(), mtime);
+ m->second.need, 0, osd_reqid_t(), mtime);
e.reverting_to = prev;
pg_log.add(e);
dout(10) << e << dendl;
@@ -6508,7 +7156,7 @@ void ReplicatedPG::mark_all_unfound_lost(int what)
// log it
++info.last_update.version;
pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, info.last_update, m->second.need,
- osd_reqid_t(), mtime);
+ 0, osd_reqid_t(), mtime);
pg_log.add(e);
dout(10) << e << dendl;
@@ -6589,7 +7237,7 @@ void ReplicatedPG::apply_and_flush_repops(bool requeue)
if (requeue) {
if (repop->ctx->op) {
- dout(10) << " requeuing " << *repop->ctx->op->request << dendl;
+ dout(10) << " requeuing " << *repop->ctx->op->get_req() << dendl;
rq.push_back(repop->ctx->op);
repop->ctx->op = OpRequestRef();
}
@@ -6644,6 +7292,7 @@ void ReplicatedPG::on_shutdown()
deleting = true;
unreg_next_scrub();
+ cancel_copy_ops();
apply_and_flush_repops(false);
context_registry_on_change();
@@ -6655,20 +7304,6 @@ void ReplicatedPG::on_shutdown()
cancel_recovery();
}
-void ReplicatedPG::on_flushed()
-{
- assert(object_contexts.empty());
- if (have_temp_coll() &&
- !osd->store->collection_empty(get_temp_coll())) {
- vector<hobject_t> objects;
- osd->store->collection_list(get_temp_coll(), objects);
- derr << __func__ << ": found objects in the temp collection: "
- << objects << ", crashing now"
- << dendl;
- assert(0 == "found garbage in the temp collection");
- }
-}
-
void ReplicatedPG::on_activate()
{
for (unsigned i = 1; i<acting.size(); i++) {
@@ -6694,37 +7329,44 @@ void ReplicatedPG::on_change(ObjectStore::Transaction *t)
context_registry_on_change();
+ cancel_copy_ops();
+
// requeue object waiters
- requeue_ops(waiting_for_backfill_pos);
- requeue_object_waiters(waiting_for_missing_object);
+ if (is_primary()) {
+ requeue_ops(waiting_for_backfill_pos);
+ requeue_object_waiters(waiting_for_missing_object);
+ } else {
+ waiting_for_backfill_pos.clear();
+ waiting_for_missing_object.clear();
+ }
for (map<hobject_t,list<OpRequestRef> >::iterator p = waiting_for_degraded_object.begin();
p != waiting_for_degraded_object.end();
waiting_for_degraded_object.erase(p++)) {
- requeue_ops(p->second);
+ if (is_primary())
+ requeue_ops(p->second);
+ else
+ p->second.clear();
finish_degraded_object(p->first);
}
+ for (map<hobject_t,list<OpRequestRef> >::iterator p = waiting_for_blocked_object.begin();
+ p != waiting_for_blocked_object.end();
+ waiting_for_blocked_object.erase(p++)) {
+ if (is_primary())
+ requeue_ops(p->second);
+ else
+ p->second.clear();
+ }
- requeue_ops(waiting_for_all_missing);
- waiting_for_all_missing.clear();
+ if (is_primary())
+ requeue_ops(waiting_for_all_missing);
+ else
+ waiting_for_all_missing.clear();
// this will requeue ops we were working on but didn't finish, and
// any dups
apply_and_flush_repops(is_primary());
- // clear pushing/pulling maps
- pushing.clear();
- pulling.clear();
- pull_from_peer.clear();
-
- // clear temp
- for (set<hobject_t>::iterator i = temp_contents.begin();
- i != temp_contents.end();
- ++i) {
- dout(10) << __func__ << ": Removing oid "
- << *i << " from the temp collection" << dendl;
- t->remove(get_temp_coll(t), *i);
- }
- temp_contents.clear();
+ pgbackend->on_change(t);
// clear snap_trimmer state
snap_trimmer_machine.process_event(Reset());
@@ -6750,9 +7392,16 @@ void ReplicatedPG::_clear_recovery_state()
backfill_pos = hobject_t();
backfills_in_flight.clear();
pending_backfill_updates.clear();
- pulling.clear();
- pushing.clear();
- pull_from_peer.clear();
+ recovering.clear();
+ pgbackend->clear_state();
+}
+
+void ReplicatedPG::cancel_pull(const hobject_t &soid)
+{
+ assert(recovering.count(soid));
+ recovering.erase(soid);
+ finish_recovery_op(soid);
+ pg_log.set_last_requested(0); // get recover_primary to start over
}
void ReplicatedPG::check_recovery_sources(const OSDMapRef osdmap)
@@ -6771,26 +7420,10 @@ void ReplicatedPG::check_recovery_sources(const OSDMapRef osdmap)
}
dout(10) << "check_recovery_sources source osd." << *p << " now down" << dendl;
now_down.insert(*p);
-
- // reset pulls?
- map<int, set<hobject_t> >::iterator j = pull_from_peer.find(*p);
- if (j != pull_from_peer.end()) {
- dout(10) << "check_recovery_sources resetting pulls from osd." << *p
- << ", osdmap has it marked down" << dendl;
- for (set<hobject_t>::iterator i = j->second.begin();
- i != j->second.end();
- ++i) {
- assert(pulling.count(*i) == 1);
- pulling.erase(*i);
- finish_recovery_op(*i);
- }
- pg_log.set_last_requested(0);
- pull_from_peer.erase(j++);
- }
-
- // remove from missing_loc_sources
missing_loc_sources.erase(p++);
}
+ pgbackend->check_recovery_sources(osdmap);
+
if (now_down.empty()) {
dout(10) << "check_recovery_sources no source osds (" << missing_loc_sources << ") went down" << dendl;
} else {
@@ -6876,7 +7509,8 @@ int ReplicatedPG::start_recovery_ops(
}
bool deferred_backfill = false;
- if (state_test(PG_STATE_BACKFILL) &&
+ if (recovering.empty() &&
+ state_test(PG_STATE_BACKFILL) &&
backfill_target >= 0 && started < max &&
missing.num_missing() == 0 &&
!waiting_on_backfill) {
@@ -6904,9 +7538,11 @@ int ReplicatedPG::start_recovery_ops(
dout(10) << " started " << started << dendl;
osd->logger->inc(l_osd_rop, started);
- if (started || recovery_ops_active > 0 || deferred_backfill)
+ if (!recovering.empty() ||
+ started || recovery_ops_active > 0 || deferred_backfill)
return started;
+ assert(recovering.empty());
assert(recovery_ops_active == 0);
int unfound = get_num_unfound();
@@ -6922,6 +7558,13 @@ int ReplicatedPG::start_recovery_ops(
return started;
}
+ if (needs_recovery()) {
+ // this shouldn't happen!
+ // We already checked num_missing() so we must have missing replicas
+ osd->clog.error() << info.pgid << " recovery ending with missing replicas\n";
+ return started;
+ }
+
if (state_test(PG_STATE_RECOVERING)) {
state_clear(PG_STATE_RECOVERING);
if (needs_backfill()) {
@@ -6965,7 +7608,8 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle)
const pg_missing_t &missing = pg_log.get_missing();
- dout(10) << "recover_primary pulling " << pulling.size() << " in pg" << dendl;
+ dout(10) << "recover_primary recovering " << recovering.size()
+ << " in pg" << dendl;
dout(10) << "recover_primary " << missing << dendl;
dout(25) << "recover_primary " << missing.missing << dendl;
@@ -6974,7 +7618,7 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle)
int started = 0;
int skipped = 0;
- map<int, vector<PullOp> > pulls;
+ PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
map<version_t, hobject_t>::const_iterator p =
missing.rmissing.lower_bound(pg_log.get_log().last_requested);
while (p != missing.rmissing.end()) {
@@ -7005,8 +7649,8 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle)
<< (unfound ? " (unfound)":"")
<< (missing.is_missing(soid) ? " (missing)":"")
<< (missing.is_missing(head) ? " (missing head)":"")
- << (pulling.count(soid) ? " (pulling)":"")
- << (pulling.count(head) ? " (pulling head)":"")
+ << (recovering.count(soid) ? " (recovering)":"")
+ << (recovering.count(head) ? " (recovering head)":"")
<< dendl;
if (latest) {
@@ -7081,14 +7725,14 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle)
}
}
- if (!pulling.count(soid)) {
- if (pulling.count(head)) {
+ if (!recovering.count(soid)) {
+ if (recovering.count(head)) {
++skipped;
} else if (unfound) {
++skipped;
} else {
- int r = prepare_pull(
- soid, need, g_conf->osd_recovery_op_priority, &pulls);
+ int r = recover_missing(
+ soid, need, cct->_conf->osd_recovery_op_priority, h);
switch (r) {
case PULL_YES:
++started;
@@ -7110,14 +7754,14 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle)
if (!skipped)
pg_log.set_last_requested(v);
}
-
- send_pulls(g_conf->osd_recovery_op_priority, pulls);
+
+ pgbackend->run_recovery_op(h, cct->_conf->osd_recovery_op_priority);
return started;
}
int ReplicatedPG::prep_object_replica_pushes(
- const hobject_t& soid, eversion_t v, int prio,
- map<int, vector<PushOp> > *pushes)
+ const hobject_t& soid, eversion_t v,
+ PGBackend::RecoveryHandle *h)
{
dout(10) << __func__ << ": on " << soid << dendl;
@@ -7144,30 +7788,46 @@ int ReplicatedPG::prep_object_replica_pushes(
return 0;
}
- dout(10) << " ondisk_read_lock for " << soid << dendl;
+ start_recovery_op(soid);
+ assert(!recovering.count(soid));
+ recovering.insert(soid);
+
+ /* We need this in case there is an in progress write on the object. In fact,
+ * the only possible write is an update to the xattr due to a lost_revert --
+ * a client write would be blocked since the object is degraded.
+ * In almost all cases, therefore, this lock should be uncontended.
+ */
obc->ondisk_read_lock();
-
+ pgbackend->recover_object(
+ soid,
+ ObjectContextRef(),
+ obc, // has snapset context
+ h);
+ obc->ondisk_read_unlock();
+ return 1;
+}
+
+int ReplicatedBackend::start_pushes(
+ const hobject_t &soid,
+ ObjectContextRef obc,
+ RPGHandle *h)
+{
+ int pushes = 0;
// who needs it?
- bool started = false;
- for (unsigned i=1; i<acting.size(); i++) {
- int peer = acting[i];
- if (peer_missing.count(peer) &&
- peer_missing[peer].is_missing(soid)) {
- if (!started) {
- start_recovery_op(soid);
- started = true;
- }
- (*pushes)[peer].push_back(PushOp());
- prep_push_to_replica(obc, soid, peer, prio,
- &((*pushes)[peer].back())
+ for (unsigned i=1; i<get_parent()->get_acting().size(); i++) {
+ int peer = get_parent()->get_acting()[i];
+ map<int, pg_missing_t>::const_iterator j =
+ get_parent()->get_peer_missing().find(peer);
+ assert(j != get_parent()->get_peer_missing().end());
+ if (j->second.is_missing(soid)) {
+ ++pushes;
+ h->pushes[peer].push_back(PushOp());
+ prep_push_to_replica(obc, soid, peer,
+ &(h->pushes[peer].back())
);
}
}
-
- dout(10) << " ondisk_read_unlock on " << soid << dendl;
- obc->ondisk_read_unlock();
-
- return 1;
+ return pushes;
}
int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
@@ -7175,13 +7835,15 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
dout(10) << __func__ << "(" << max << ")" << dendl;
int started = 0;
- map<int, vector<PushOp> > pushes;
+ PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
// this is FAR from an optimal recovery order. pretty lame, really.
for (unsigned i=1; i<acting.size(); i++) {
int peer = acting[i];
map<int, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
assert(pm != peer_missing.end());
+ map<int, pg_info_t>::const_iterator pi = peer_info.find(peer);
+ assert(pi != peer_info.end());
size_t m_sz = pm->second.num_missing();
dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
@@ -7195,8 +7857,17 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
handle.reset_tp_timeout();
const hobject_t soid(p->second);
- if (pushing.count(soid)) {
- dout(10) << __func__ << ": already pushing " << soid << dendl;
+ if (soid > pi->second.last_backfill) {
+ if (!recovering.count(soid)) {
+ derr << __func__ << ": object added to missing set for backfill, but "
+ << "is not in recovering, error!" << dendl;
+ assert(0);
+ }
+ continue;
+ }
+
+ if (recovering.count(soid)) {
+ dout(10) << __func__ << ": already recovering" << soid << dendl;
continue;
}
@@ -7211,13 +7882,11 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
map<hobject_t,pg_missing_t::item>::const_iterator r = m.missing.find(soid);
started += prep_object_replica_pushes(soid, r->second.need,
- g_conf->osd_recovery_op_priority,
- &pushes);
+ h);
}
}
- send_pushes(g_conf->osd_recovery_op_priority, pushes);
-
+ pgbackend->run_recovery_op(h, cct->_conf->osd_recovery_op_priority);
return started;
}
@@ -7264,17 +7933,9 @@ int ReplicatedPG::recover_backfill(
<< " interval " << pbi.begin << "-" << pbi.end
<< " " << pbi.objects.size() << " objects" << dendl;
- int local_min = osd->store->get_ideal_list_min();
- int local_max = osd->store->get_ideal_list_max();
-
- // re-scan our local interval to cope with recent changes
- // FIXME: we could track the eversion_t when we last scanned, and invalidate
- // that way. or explicitly modify/invalidate when we actually change specific
- // objects.
- dout(10) << " rescanning local backfill_info from " << backfill_pos << dendl;
- backfill_info.clear();
- osr->flush();
- scan_range(backfill_pos, local_min, local_max, &backfill_info, handle);
+ // update our local interval to cope with recent changes
+ backfill_info.begin = backfill_pos;
+ update_range(&backfill_info, handle);
int ops = 0;
map<hobject_t, pair<eversion_t, eversion_t> > to_push;
@@ -7287,9 +7948,11 @@ int ReplicatedPG::recover_backfill(
while (ops < max) {
if (backfill_info.begin <= pbi.begin &&
!backfill_info.extends_to_end() && backfill_info.empty()) {
- osr->flush();
- scan_range(backfill_info.end, local_min, local_max, &backfill_info,
- handle);
+ hobject_t next = backfill_info.end;
+ backfill_info.clear();
+ backfill_info.begin = next;
+ backfill_info.end = hobject_t::get_max();
+ update_range(&backfill_info, handle);
backfill_info.trim();
}
backfill_pos = backfill_info.begin > pbi.begin ? pbi.begin : backfill_info.begin;
@@ -7380,15 +8043,16 @@ int ReplicatedPG::recover_backfill(
send_remove_op(i->first, i->second, backfill_target);
}
+ PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
map<int, vector<PushOp> > pushes;
for (map<hobject_t, pair<eversion_t, eversion_t> >::iterator i = to_push.begin();
i != to_push.end();
++i) {
handle.reset_tp_timeout();
prep_backfill_object_push(
- i->first, i->second.first, i->second.second, backfill_target, &pushes);
+ i->first, i->second.first, i->second.second, backfill_target, h);
}
- send_pushes(g_conf->osd_recovery_op_priority, pushes);
+ pgbackend->run_recovery_op(h, cct->_conf->osd_recovery_op_priority);
release_waiting_for_backfill_pos();
dout(5) << "backfill_pos is " << backfill_pos << " and pinfo.last_backfill is "
@@ -7434,35 +8098,104 @@ int ReplicatedPG::recover_backfill(
void ReplicatedPG::prep_backfill_object_push(
hobject_t oid, eversion_t v, eversion_t have, int peer,
- map<int, vector<PushOp> > *pushes)
+ PGBackend::RecoveryHandle *h)
{
dout(10) << "push_backfill_object " << oid << " v " << v << " to osd." << peer << dendl;
backfills_in_flight.insert(oid);
+ map<int, pg_missing_t>::iterator bpm = peer_missing.find(backfill_target);
+ assert(bpm != peer_missing.end());
+ bpm->second.add(oid, eversion_t(), eversion_t());
+
+ assert(!recovering.count(oid));
- if (!pushing.count(oid))
- start_recovery_op(oid);
+ start_recovery_op(oid);
+ recovering.insert(oid);
ObjectContextRef obc = get_object_context(oid, false);
+
+ // We need to take the read_lock here in order to flush in-progress writes
obc->ondisk_read_lock();
- (*pushes)[peer].push_back(PushOp());
- prep_push_to_replica(obc, oid, peer, g_conf->osd_recovery_op_priority,
- &((*pushes)[peer].back()));
+ pgbackend->recover_object(
+ oid,
+ ObjectContextRef(),
+ obc,
+ h);
obc->ondisk_read_unlock();
}
+void ReplicatedPG::update_range(
+ BackfillInterval *bi,
+ ThreadPool::TPHandle &handle)
+{
+ int local_min = cct->_conf->osd_backfill_scan_min;
+ int local_max = cct->_conf->osd_backfill_scan_max;
+
+ if (bi->version < info.log_tail) {
+ dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
+ << dendl;
+ if (last_update_applied >= info.log_tail) {
+ bi->version = last_update_applied;
+ } else {
+ osr->flush();
+ bi->version = info.last_update;
+ }
+ scan_range(local_min, local_max, bi, handle);
+ }
+
+ if (bi->version >= info.last_update) {
+ dout(10) << __func__<< ": bi is current " << dendl;
+ assert(bi->version == info.last_update);
+ } else if (bi->version >= info.log_tail) {
+ assert(!pg_log.get_log().empty());
+ dout(10) << __func__<< ": bi is old, (" << bi->version
+ << ") can be updated with log" << dendl;
+ list<pg_log_entry_t>::const_iterator i =
+ pg_log.get_log().log.end();
+ --i;
+ while (i != pg_log.get_log().log.begin() &&
+ i->version > bi->version) {
+ --i;
+ }
+ if (i->version == bi->version)
+ ++i;
+
+ assert(i != pg_log.get_log().log.end());
+ dout(10) << __func__ << ": updating from version " << i->version
+ << dendl;
+ for (; i != pg_log.get_log().log.end(); ++i) {
+ const hobject_t &soid = i->soid;
+ if (soid >= bi->begin && soid < bi->end) {
+ if (i->is_update()) {
+ dout(10) << __func__ << ": " << i->soid << " updated to version "
+ << i->version << dendl;
+ bi->objects.erase(i->soid);
+ bi->objects.insert(
+ make_pair(
+ i->soid,
+ i->version));
+ } else if (i->is_delete()) {
+ dout(10) << __func__ << ": " << i->soid << " removed" << dendl;
+ bi->objects.erase(i->soid);
+ }
+ }
+ }
+ bi->version = info.last_update;
+ } else {
+ assert(0 == "scan_range should have raised bi->version past log_tail");
+ }
+}
+
void ReplicatedPG::scan_range(
- hobject_t begin, int min, int max, BackfillInterval *bi,
+ int min, int max, BackfillInterval *bi,
ThreadPool::TPHandle &handle)
{
assert(is_locked());
- dout(10) << "scan_range from " << begin << dendl;
- bi->begin = begin;
+ dout(10) << "scan_range from " << bi->begin << dendl;
bi->objects.clear(); // for good measure
vector<hobject_t> ls;
ls.reserve(max);
- int r = osd->store->collection_list_partial(coll, begin, min, max,
- 0, &ls, &bi->end);
+ int r = pgbackend->objects_list_partial(bi->begin, min, max, 0, &ls, &bi->end);
assert(r >= 0);
dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
dout(20) << ls << dendl;
@@ -7477,7 +8210,7 @@ void ReplicatedPG::scan_range(
dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
} else {
bufferlist bl;
- int r = osd->store->getattr(coll, *p, OI_ATTR, bl);
+ int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
assert(r >= 0);
object_info_t oi(bl);
bi->objects[*p] = oi.version;
@@ -7497,7 +8230,7 @@ void ReplicatedPG::check_local()
assert(info.last_update >= pg_log.get_tail()); // otherwise we need some help!
- if (!g_conf->osd_debug_verify_stray_on_activate)
+ if (!cct->_conf->osd_debug_verify_stray_on_activate)
return;
// just scan the log.
@@ -7747,9 +8480,10 @@ void ReplicatedPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_t
<< "SnapTrimmer state<" << get_state_name() << ">: ")
/* NotTrimming */
-ReplicatedPG::NotTrimming::NotTrimming(my_context ctx) : my_base(ctx)
+ReplicatedPG::NotTrimming::NotTrimming(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< SnapTrimmer >().pg->cct, "NotTrimming")
{
- state_name = "NotTrimming";
context< SnapTrimmer >().requeue = false;
context< SnapTrimmer >().log_enter(state_name);
}
@@ -7788,9 +8522,10 @@ boost::statechart::result ReplicatedPG::NotTrimming::react(const SnapTrim&)
}
/* TrimmingObjects */
-ReplicatedPG::TrimmingObjects::TrimmingObjects(my_context ctx) : my_base(ctx)
+ReplicatedPG::TrimmingObjects::TrimmingObjects(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< SnapTrimmer >().pg->cct, "Trimming/TrimmingObjects")
{
- state_name = "Trimming/TrimmingObjects";
context< SnapTrimmer >().log_enter(state_name);
}
@@ -7823,24 +8558,20 @@ boost::statechart::result ReplicatedPG::TrimmingObjects::react(const SnapTrim&)
dout(10) << "TrimmingObjects react trimming " << pos << dendl;
RepGather *repop = pg->trim_object(pos);
assert(repop);
-
repop->queue_snap_trimmer = true;
- eversion_t old_last_update = pg->pg_log.get_head();
- bool old_exists = repop->obc->obs.exists;
- uint64_t old_size = repop->obc->obs.oi.size;
- eversion_t old_version = repop->obc->obs.oi.version;
pg->append_log(repop->ctx->log, eversion_t(), repop->ctx->local_t);
- pg->issue_repop(repop, repop->ctx->mtime, old_last_update, old_exists, old_size, old_version);
+ pg->issue_repop(repop, repop->ctx->mtime);
pg->eval_repop(repop);
repops.insert(repop);
return discard_event();
}
/* WaitingOnReplicasObjects */
-ReplicatedPG::WaitingOnReplicas::WaitingOnReplicas(my_context ctx) : my_base(ctx)
+ReplicatedPG::WaitingOnReplicas::WaitingOnReplicas(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< SnapTrimmer >().pg->cct, "Trimming/WaitingOnReplicas")
{
- state_name = "Trimming/WaitingOnReplicas";
context< SnapTrimmer >().log_enter(state_name);
context< SnapTrimmer >().requeue = false;
}
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index 0fbe5afd9ca..1292780d044 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -18,6 +18,7 @@
#define CEPH_REPLICATEDPG_H
#include <boost/optional.hpp>
+#include <boost/tuple/tuple.hpp>
#include "include/assert.h"
#include "common/cmdparse.h"
@@ -33,6 +34,9 @@
#include "common/sharedptr_registry.hpp"
+#include "PGBackend.h"
+#include "ReplicatedBackend.h"
+
class MOSDSubOpReply;
class ReplicatedPG;
@@ -80,10 +84,230 @@ public:
virtual bool filter(bufferlist& xattr_data, bufferlist& outdata);
};
-class ReplicatedPG : public PG {
+class ReplicatedPG : public PG, public PGBackend::Listener {
friend class OSD;
friend class Watch;
-public:
+
+public:
+
+ /*
+ * state associated with a copy operation
+ */
+ struct OpContext;
+ class CopyCallback;
+
+ /// Bookkeeping for one in-progress copy; handed around as CopyOpRef.
+ struct CopyOp {
+ CopyCallback *cb; ///< completion callback passed to the constructor
+ ObjectContextRef obc; ///< object context passed to the constructor
+ hobject_t src; ///< presumably the object being copied from -- confirm against start_copy
+ object_locator_t oloc; ///< presumably the locator for src -- confirm against start_copy
+ version_t version; ///< version passed by the caller; exact meaning not visible here
+
+ tid_t objecter_tid; ///< starts at 0; presumably the tid of the outstanding objecter request
+
+ object_copy_cursor_t cursor;
+ uint64_t size; ///< starts at 0
+ utime_t mtime;
+ map<string,bufferlist> attrs;
+ bufferlist data;
+ map<string,bufferlist> omap;
+ int rval; ///< starts at -1
+
+ coll_t temp_coll; ///< default-constructed; the constructor does not set it
+ hobject_t temp_oid; ///< initialised from the constructor's dest argument
+ object_copy_cursor_t temp_cursor;
+
+ /// Records the caller-supplied values; counters start at 0, rval at -1,
+ /// and dest becomes temp_oid.
+ CopyOp(CopyCallback *cb_, ObjectContextRef _obc, hobject_t s, object_locator_t l,
+ version_t v, const hobject_t& dest)
+ : cb(cb_), obc(_obc), src(s), oloc(l), version(v),
+ objecter_tid(0),
+ size(0),
+ rval(-1),
+ temp_oid(dest)
+ {}
+ };
+ typedef boost::shared_ptr<CopyOp> CopyOpRef;
+
+ /**
+ * The CopyCallback class defines an interface for completions to the
+ * copy_start code. Users of the copy infrastructure must implement
+ * one and give an instance of the class to start_copy.
+ *
+ * The implementer is responsible for making sure that the CopyCallback
+ * can associate itself with the correct copy operation. The presence
+ * of the closing Transaction ensures that write operations can be performed
+ * atomically with the copy being completed (which doing them in separate
+ * transactions would not allow); if you are doing the copy for a read
+ * op you will have to generate a separate op to finish the copy with.
+ */
+ /// return code, total object size, data in temp object?, final Transaction
+ typedef boost::tuple<int, size_t, bool, ObjectStore::Transaction> CopyResults;
+ class CopyCallback : public GenContext<CopyResults&> {
+ protected:
+ /// protected: only concrete subclasses can be instantiated
+ CopyCallback() {}
+ /**
+ * results.get<0>() is the return code: 0 for success; -ECANCELED if
+ * the operation was cancelled by the local OSD; -errno for other issues.
+ * results.get<1>() is the total size of the object (for updating pg stats)
+ * results.get<2>() indicates whether we have already written data to
+ * the temp object (so it needs to get cleaned up, if the return code
+ * indicates a failure)
+ * results.get<3>() is a Transaction; if non-empty you need to apply
+ * it before any other accesses to the object in order to
+ * complete the copy.
+ */
+ virtual void finish(CopyResults& results_) = 0;
+
+ public:
+ /// virtual so implementations can be destroyed through a CopyCallback pointer
+ virtual ~CopyCallback() {};
+ };
+
+  /**
+   * CopyCallback bound to an OpContext: keeps a copy of the results and then
+   * either hands the ctx to execute_ctx (non-negative return code) or fails
+   * and closes the ctx (negative return code).
+   */
+  class CopyFromCallback: public CopyCallback {
+  public:
+    CopyResults results;
+    OpContext *ctx;
+    hobject_t temp_obj;
+
+    CopyFromCallback(OpContext *ctx_, const hobject_t& temp_obj_)
+      : ctx(ctx_),
+        temp_obj(temp_obj_)
+    {}
+    ~CopyFromCallback() {}
+
+    /// return code of the copy (results.get<0>())
+    int get_result() { return results.get<0>(); }
+    /// total object size reported by the copy (results.get<1>())
+    uint64_t get_data_size() { return results.get<1>(); }
+    /// whether data was already written to the temp object (results.get<2>())
+    bool is_temp_obj_used() { return results.get<2>(); }
+
+    virtual void finish(CopyResults& results_) {
+      results = results_;
+      const int rc = results.get<0>();
+
+      if (rc >= 0) {
+        ctx->pg->execute_ctx(ctx);
+        // NOTE(review): ctx is still dereferenced after execute_ctx returns;
+        // confirm execute_ctx never frees it on this path.
+        ctx->copy_cb = NULL;
+        return;
+      }
+
+      ctx->copy_cb = NULL;
+      // on cancel just toss it out; client resends
+      if (rc != -ECANCELED)
+        ctx->pg->osd->reply_op_error(ctx->op, rc);
+      ctx->pg->close_op_ctx(ctx);
+    }
+  };
+ friend class CopyFromCallback;
+
+ boost::scoped_ptr<PGBackend> pgbackend;
+ PGBackend *get_pgbackend() {
+ return pgbackend.get();
+ }
+
+ /// Listener methods
+ void on_local_recover_start(
+ const hobject_t &oid,
+ ObjectStore::Transaction *t);
+ void on_local_recover(
+ const hobject_t &oid,
+ const object_stat_sum_t &stat_diff,
+ const ObjectRecoveryInfo &recovery_info,
+ ObjectContextRef obc,
+ ObjectStore::Transaction *t
+ );
+ void on_peer_recover(
+ int peer,
+ const hobject_t &oid,
+ const ObjectRecoveryInfo &recovery_info,
+ const object_stat_sum_t &stat
+ );
+ void begin_peer_recover(
+ int peer,
+ const hobject_t oid);
+ void on_global_recover(
+ const hobject_t &oid);
+ void failed_push(int from, const hobject_t &soid);
+ void cancel_pull(const hobject_t &soid);
+
+ /**
+ * Wraps a GenContext<T> together with a PG and an epoch. When fired it
+ * takes the PG lock and, if the PG has been reset since epoch e, deletes
+ * the wrapped context without completing it; otherwise it completes the
+ * wrapped context with the supplied argument. The lock is dropped afterwards.
+ */
+ template <typename T>
+ class BlessedGenContext : public GenContext<T> {
+ ReplicatedPG *pg; ///< PG whose lock guards the completion
+ GenContext<T> *c; ///< wrapped context; deleted or completed in finish()
+ epoch_t e; ///< epoch compared against via pg_has_reset_since()
+ public:
+ BlessedGenContext(ReplicatedPG *pg, GenContext<T> *c, epoch_t e)
+ : pg(pg), c(c), e(e) {}
+ void finish(T t) {
+ pg->lock();
+ if (pg->pg_has_reset_since(e))
+ delete c; // stale: drop the wrapped context without running it
+ else
+ c->complete(t);
+ pg->unlock();
+ }
+ };
+ /**
+ * Context counterpart of BlessedGenContext: under the PG lock, completes
+ * the wrapped Context with r unless the PG has been reset since epoch e,
+ * in which case the wrapped Context is deleted without being completed.
+ */
+ class BlessedContext : public Context {
+ ReplicatedPG *pg; ///< PG whose lock guards the completion
+ Context *c; ///< wrapped context; deleted or completed in finish()
+ epoch_t e; ///< epoch compared against via pg_has_reset_since()
+ public:
+ BlessedContext(ReplicatedPG *pg, Context *c, epoch_t e)
+ : pg(pg), c(c), e(e) {}
+ void finish(int r) {
+ pg->lock();
+ if (pg->pg_has_reset_since(e))
+ delete c; // stale: drop the wrapped context without running it
+ else
+ c->complete(r);
+ pg->unlock();
+ }
+ };
+ Context *bless_context(Context *c) {
+ return new BlessedContext(this, c, get_osdmap()->get_epoch());
+ }
+ GenContext<ThreadPool::TPHandle&> *bless_gencontext(
+ GenContext<ThreadPool::TPHandle&> *c) {
+ return new BlessedGenContext<ThreadPool::TPHandle&>(
+ this, c, get_osdmap()->get_epoch());
+ }
+
+ void send_message(int to_osd, Message *m) {
+ osd->send_message_osd_cluster(to_osd, m, get_osdmap()->get_epoch());
+ }
+ void queue_transaction(ObjectStore::Transaction *t) {
+ osd->store->queue_transaction(osr.get(), t);
+ }
+ epoch_t get_epoch() {
+ return get_osdmap()->get_epoch();
+ }
+ const vector<int> &get_acting() {
+ return acting;
+ }
+ std::string gen_dbg_prefix() const { return gen_prefix(); }
+
+ const map<hobject_t, set<int> > &get_missing_loc() {
+ return missing_loc;
+ }
+ const map<int, pg_missing_t> &get_peer_missing() {
+ return peer_missing;
+ }
+ const map<int, pg_info_t> &get_peer_info() {
+ return peer_info;
+ }
+ const pg_missing_t &get_local_missing() {
+ return pg_log.get_missing();
+ }
+ const PGLog &get_log() {
+ return pg_log;
+ }
+ bool pgb_is_primary() const {
+ return is_primary();
+ }
+ OSDMapRef pgb_get_osdmap() const {
+ return get_osdmap();
+ }
+ const pg_info_t &get_info() const {
+ return info;
+ }
+ ObjectContextRef get_obc(
+ const hobject_t &hoid,
+ map<string, bufferptr> &attrs) {
+ return get_object_context(hoid, true, &attrs);
+ }
/*
* Capture all object state associated with an in-progress read or write.
@@ -103,6 +327,7 @@ public:
bool modify; // (force) modification (even if op_t is empty)
bool user_modify; // user-visible modification
+ bool undirty; // user explicitly un-dirtying this object
// side effects
list<watch_info_t> watch_connects;
@@ -122,7 +347,7 @@ public:
utime_t mtime;
SnapContext snapc; // writer snap context
eversion_t at_version; // pg's current version pointer
- eversion_t reply_version; // the version that we report the client (depends on the op)
+ version_t user_at_version; // pg's current user version pointer
int current_osd_subop_num;
@@ -145,6 +370,12 @@ public:
int num_read; ///< count read ops
int num_write; ///< count update ops
+ CopyFromCallback *copy_cb;
+
+ hobject_t new_temp_oid, discard_temp_oid; ///< temp objects we should start/stop tracking
+
+ enum { W_LOCK, R_LOCK, NONE } lock_to_release;
+
OpContext(const OpContext& other);
const OpContext& operator=(const OpContext& other);
@@ -153,12 +384,14 @@ public:
ReplicatedPG *_pg) :
op(_op), reqid(_reqid), ops(_ops), obs(_obs), snapset(0),
new_obs(_obs->oi, _obs->exists),
- modify(false), user_modify(false),
- bytes_written(0), bytes_read(0),
+ modify(false), user_modify(false), undirty(false),
+ bytes_written(0), bytes_read(0), user_at_version(0),
current_osd_subop_num(0),
data_off(0), reply(NULL), pg(_pg),
num_read(0),
- num_write(0) {
+ num_write(0),
+ copy_cb(NULL),
+ lock_to_release(NONE) {
if (_ssc) {
new_snapset = _ssc->snapset;
snapset = &_ssc->snapset;
@@ -166,6 +399,7 @@ public:
}
~OpContext() {
assert(!clone_obc);
+ assert(lock_to_release == NONE);
if (reply)
reply->put();
}
@@ -224,7 +458,7 @@ public:
if (--nref == 0) {
assert(!obc);
assert(src_obc.empty());
- delete ctx;
+ delete ctx; // must already be unlocked
delete this;
//generic_dout(0) << "deleting " << this << dendl;
}
@@ -235,6 +469,163 @@ public:
protected:
+  /**
+   * Tracks pending readers or writers on an object.
+   *
+   * Each tracked object is in state NONE, READ or WRITE with a holder count.
+   * An op that cannot be admitted is appended to the object's waiter list and
+   * the caller is told it was queued. When the last holder is released, all
+   * waiters are handed back to the caller and the object's entry is dropped.
+   */
+  class RWTracker {
+    struct ObjState {
+      enum State {
+        NONE,
+        READ,
+        WRITE
+      };
+      State state;                 /// rw state
+      uint64_t count;              /// number of readers or writers
+      list<OpRequestRef> waiters;  /// ops waiting on state change
+
+      ObjState() : state(NONE), count(0) {}
+
+      /// Admit op as a reader; returns false after parking op if it must wait.
+      bool get_read(OpRequestRef op) {
+        // don't starve!
+        if (!waiters.empty()) {
+          waiters.push_back(op);
+          return false;
+        }
+        switch (state) {
+        case NONE:
+          assert(count == 0);
+          state = READ;
+          // fall through
+        case READ:
+          count++;
+          return true;
+        case WRITE:
+          waiters.push_back(op);
+          return false;
+        default:
+          assert(0 == "unhandled case");
+          return false;
+        }
+      }
+
+      /// Admit op as a writer; returns false after parking op if it must wait.
+      bool get_write(OpRequestRef op) {
+        if (!waiters.empty()) {
+          // don't starve!
+          waiters.push_back(op);
+          return false;
+        }
+        switch (state) {
+        case NONE:
+          assert(count == 0);
+          state = WRITE;
+          // fall through
+        case WRITE:
+          count++;
+          return true;
+        case READ:
+          waiters.push_back(op);
+          return false;
+        default:
+          assert(0 == "unhandled case");
+          return false;
+        }
+      }
+
+      /// Drop one holder; when the last one leaves, reset to NONE and move
+      /// every waiter into *requeue (which must be non-null and empty).
+      void dec(list<OpRequestRef> *requeue) {
+        assert(count > 0);
+        assert(requeue);
+        assert(requeue->empty());
+        count--;
+        if (count == 0) {
+          state = NONE;
+          requeue->swap(waiters);
+        }
+      }
+      void put_read(list<OpRequestRef> *requeue) {
+        assert(state == READ);
+        dec(requeue);
+      }
+      void put_write(list<OpRequestRef> *requeue) {
+        assert(state == WRITE);
+        dec(requeue);
+      }
+      bool empty() const { return state == NONE; }
+    };
+    map<hobject_t, ObjState > obj_state;
+    typedef map<hobject_t, ObjState >::iterator ObjStateIter;
+
+    /// Look up the entry for an object that is expected to be held. Uses find()
+    /// rather than operator[] so a release for an untracked object trips the
+    /// assert instead of silently inserting a default entry first.
+    ObjStateIter find_held(const hobject_t &hoid) {
+      ObjStateIter p = obj_state.find(hoid);
+      assert(p != obj_state.end());
+      return p;
+    }
+    /// Forget the object once nobody holds it any more.
+    void erase_if_idle(ObjStateIter p) {
+      if (p->second.empty())
+        obj_state.erase(p);
+    }
+  public:
+    /// @return true if op now holds a read slot on hoid, false if it was queued
+    bool get_read(const hobject_t &hoid, OpRequestRef op) {
+      return obj_state[hoid].get_read(op);
+    }
+    /// @return true if op now holds a write slot on hoid, false if it was queued
+    bool get_write(const hobject_t &hoid, OpRequestRef op) {
+      return obj_state[hoid].get_write(op);
+    }
+    /// Release one read slot on hoid; *to_wake receives any ops to requeue.
+    void put_read(const hobject_t &hoid, list<OpRequestRef> *to_wake) {
+      ObjStateIter p = find_held(hoid);
+      p->second.put_read(to_wake);
+      erase_if_idle(p);
+    }
+    /// Release one write slot on hoid; *to_wake receives any ops to requeue.
+    void put_write(const hobject_t &hoid, list<OpRequestRef> *to_wake) {
+      ObjStateIter p = find_held(hoid);
+      p->second.put_write(to_wake);
+      erase_if_idle(p);
+    }
+  } rw_manager;
+
+ /**
+ * Grabs locks for OpContext, should be cleaned up in close_op_ctx
+ *
+ * @param ctx [in,out] ctx to get locks for
+ * @return true on success, false if we are queued
+ */
+  bool get_rw_locks(OpContext *ctx) {
+    // Ops that may write take the write side; everything else must at least
+    // be a read and takes the read side.
+    if (ctx->op->may_write()) {
+      if (!rw_manager.get_write(ctx->obs->oi.soid, ctx->op))
+        return false;  // queued behind other holders/waiters
+      ctx->lock_to_release = OpContext::W_LOCK;
+      return true;
+    }
+
+    assert(ctx->op->may_read());
+    if (!rw_manager.get_read(ctx->obs->oi.soid, ctx->op))
+      return false;  // queued behind other holders/waiters
+    ctx->lock_to_release = OpContext::R_LOCK;
+    return true;
+  }
+
+ /**
+ * Cleans up OpContext
+ *
+ * @param ctx [in] ctx to clean up
+ */
+ void close_op_ctx(OpContext *ctx) {
+ // drop the rw lock first: ~OpContext asserts lock_to_release == NONE
+ release_op_ctx_locks(ctx);
+ delete ctx;
+ }
+
+ /**
+ * Releases ctx locks
+ *
+ * @param ctx [in] ctx to clean up
+ */
+  void release_op_ctx_locks(OpContext *ctx) {
+    list<OpRequestRef> woken;
+
+    // Give back whichever side of the rw lock this ctx recorded, collecting
+    // any ops that were waiting on it.
+    if (ctx->lock_to_release == OpContext::W_LOCK) {
+      rw_manager.put_write(ctx->obs->oi.soid, &woken);
+    } else if (ctx->lock_to_release == OpContext::R_LOCK) {
+      rw_manager.put_read(ctx->obs->oi.soid, &woken);
+    } else if (ctx->lock_to_release != OpContext::NONE) {
+      assert(0);
+    }
+
+    ctx->lock_to_release = OpContext::NONE;
+    requeue_ops(woken);
+  }
+
// replica ops
// [primary|tail]
xlist<RepGather*> repop_queue;
@@ -244,8 +635,7 @@ protected:
void op_applied(RepGather *repop);
void op_commit(RepGather *repop);
void eval_repop(RepGather*);
- void issue_repop(RepGather *repop, utime_t now,
- eversion_t old_last_update, bool old_exists, uint64_t old_size, eversion_t old_version);
+ void issue_repop(RepGather *repop, utime_t now);
RepGather *new_repop(OpContext *ctx, ObjectContextRef obc, tid_t rep_tid);
void remove_repop(RepGather *repop);
void repop_ack(RepGather *repop,
@@ -299,7 +689,11 @@ public:
protected:
ObjectContextRef create_object_context(const object_info_t& oi, SnapSetContext *ssc);
- ObjectContextRef get_object_context(const hobject_t& soid, bool can_create);
+ ObjectContextRef get_object_context(
+ const hobject_t& soid,
+ bool can_create,
+ map<string, bufferptr> *attrs = 0
+ );
void context_registry_on_change();
void object_context_destructor_callback(ObjectContext *obc);
@@ -322,8 +716,11 @@ protected:
void get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc);
SnapSetContext *create_snapset_context(const object_t& oid);
- SnapSetContext *get_snapset_context(const object_t& oid, const string &key,
- ps_t seed, bool can_create, const string &nspace);
+ SnapSetContext *get_snapset_context(
+ const object_t& oid, const string &key,
+ ps_t seed, bool can_create, const string &nspace,
+ map<string, bufferptr> *attrs = 0
+ );
void register_snapset_context(SnapSetContext *ssc) {
Mutex::Locker l(snapset_contexts_lock);
_register_snapset_context(ssc);
@@ -338,93 +735,7 @@ protected:
}
void put_snapset_context(SnapSetContext *ssc);
- // push
- struct PushInfo {
- ObjectRecoveryProgress recovery_progress;
- ObjectRecoveryInfo recovery_info;
- int priority;
-
- void dump(Formatter *f) const {
- {
- f->open_object_section("recovery_progress");
- recovery_progress.dump(f);
- f->close_section();
- }
- {
- f->open_object_section("recovery_info");
- recovery_info.dump(f);
- f->close_section();
- }
- }
- };
- map<hobject_t, map<int, PushInfo> > pushing;
-
- // pull
- struct PullInfo {
- ObjectRecoveryProgress recovery_progress;
- ObjectRecoveryInfo recovery_info;
- int priority;
-
- void dump(Formatter *f) const {
- {
- f->open_object_section("recovery_progress");
- recovery_progress.dump(f);
- f->close_section();
- }
- {
- f->open_object_section("recovery_info");
- recovery_info.dump(f);
- f->close_section();
- }
- }
-
- bool is_complete() const {
- return recovery_progress.is_complete(recovery_info);
- }
- };
- map<hobject_t, PullInfo> pulling;
-
- // Track contents of temp collection, clear on reset
- set<hobject_t> temp_contents;
-
- ObjectRecoveryInfo recalc_subsets(const ObjectRecoveryInfo& recovery_info);
- static void trim_pushed_data(const interval_set<uint64_t> &copy_subset,
- const interval_set<uint64_t> &intervals_received,
- bufferlist data_received,
- interval_set<uint64_t> *intervals_usable,
- bufferlist *data_usable);
- bool handle_pull_response(
- int from, PushOp &op, PullOp *response,
- ObjectStore::Transaction *t);
- void handle_push(
- int from, PushOp &op, PushReplyOp *response,
- ObjectStore::Transaction *t);
- void send_pushes(int prio, map<int, vector<PushOp> > &pushes);
- int send_push(int priority, int peer,
- const ObjectRecoveryInfo& recovery_info,
- const ObjectRecoveryProgress &progress,
- ObjectRecoveryProgress *out_progress = 0);
- int build_push_op(const ObjectRecoveryInfo &recovery_info,
- const ObjectRecoveryProgress &progress,
- ObjectRecoveryProgress *out_progress,
- PushOp *out_op);
- int send_push_op_legacy(int priority, int peer,
- PushOp &pop);
-
- int send_pull_legacy(int priority, int peer,
- const ObjectRecoveryInfo& recovery_info,
- ObjectRecoveryProgress progress);
- void submit_push_data(ObjectRecoveryInfo &recovery_info,
- bool first,
- bool complete,
- const interval_set<uint64_t> &intervals_included,
- bufferlist data_included,
- bufferlist omap_header,
- map<string, bufferptr> &attrs,
- map<string, bufferlist> &omap_entries,
- ObjectStore::Transaction *t);
- void submit_push_complete(ObjectRecoveryInfo &recovery_info,
- ObjectStore::Transaction *t);
+ set<hobject_t> recovering;
/*
* Backfill
@@ -467,54 +778,17 @@ protected:
f->close_section();
}
{
- f->open_array_section("pull_from_peer");
- for (map<int, set<hobject_t> >::const_iterator i = pull_from_peer.begin();
- i != pull_from_peer.end();
+ f->open_array_section("recovering");
+ for (set<hobject_t>::const_iterator i = recovering.begin();
+ i != recovering.end();
++i) {
- f->open_object_section("pulling_from");
- f->dump_int("pull_from", i->first);
- {
- f->open_array_section("pulls");
- for (set<hobject_t>::const_iterator j = i->second.begin();
- j != i->second.end();
- ++j) {
- f->open_object_section("pull_info");
- assert(pulling.count(*j));
- pulling.find(*j)->second.dump(f);
- f->close_section();
- }
- f->close_section();
- }
- f->close_section();
+ f->dump_stream("object") << *i;
}
f->close_section();
}
{
- f->open_array_section("pushing");
- for (map<hobject_t, map<int, PushInfo> >::const_iterator i =
- pushing.begin();
- i != pushing.end();
- ++i) {
- f->open_object_section("object");
- f->dump_stream("pushing") << i->first;
- {
- f->open_array_section("pushing_to");
- for (map<int, PushInfo>::const_iterator j = i->second.begin();
- j != i->second.end();
- ++j) {
- f->open_object_section("push_progress");
- f->dump_stream("object_pushing") << j->first;
- {
- f->open_object_section("push_info");
- j->second.dump(f);
- f->close_section();
- }
- f->close_section();
- }
- f->close_section();
- }
- f->close_section();
- }
+ f->open_object_section("pg_backend");
+ pgbackend->dump_recovery_info(f);
f->close_section();
}
}
@@ -522,66 +796,38 @@ protected:
/// leading edge of backfill
hobject_t backfill_pos;
- // Reverse mapping from osd peer to objects beging pulled from that peer
- map<int, set<hobject_t> > pull_from_peer;
-
int prep_object_replica_pushes(const hobject_t& soid, eversion_t v,
- int priority,
- map<int, vector<PushOp> > *pushes);
- void calc_head_subsets(ObjectContextRef obc, SnapSet& snapset, const hobject_t& head,
- pg_missing_t& missing,
- const hobject_t &last_backfill,
- interval_set<uint64_t>& data_subset,
- map<hobject_t, interval_set<uint64_t> >& clone_subsets);
- void calc_clone_subsets(SnapSet& snapset, const hobject_t& poid, const pg_missing_t& missing,
- const hobject_t &last_backfill,
- interval_set<uint64_t>& data_subset,
- map<hobject_t, interval_set<uint64_t> >& clone_subsets);
- void prep_push_to_replica(
- ObjectContextRef obc,
- const hobject_t& oid,
- int dest,
- int priority,
- PushOp *push_op);
- void prep_push(int priority,
- ObjectContextRef obc,
- const hobject_t& oid, int dest,
- PushOp *op);
- void prep_push(int priority,
- ObjectContextRef obc,
- const hobject_t& soid, int peer,
- eversion_t version,
- interval_set<uint64_t> &data_subset,
- map<hobject_t, interval_set<uint64_t> >& clone_subsets,
- PushOp *op);
- void prep_push_op_blank(const hobject_t& soid, PushOp *op);
+ PGBackend::RecoveryHandle *h);
void finish_degraded_object(const hobject_t& oid);
// Cancels/resets pulls from peer
void check_recovery_sources(const OSDMapRef map);
- void send_pulls(
- int priority,
- map<int, vector<PullOp> > &pulls);
- int prepare_pull(
- const hobject_t& oid, eversion_t v,
+ int recover_missing(
+ const hobject_t& oid,
+ eversion_t v,
int priority,
- map<int, vector<PullOp> > *pulls
- );
+ PGBackend::RecoveryHandle *h);
// low level ops
void _make_clone(ObjectStore::Transaction& t,
const hobject_t& head, const hobject_t& coid,
object_info_t *poi);
+ void execute_ctx(OpContext *ctx);
+ void reply_ctx(OpContext *ctx, int err);
+ void reply_ctx(OpContext *ctx, int err, eversion_t v, version_t uv);
void make_writeable(OpContext *ctx);
void log_op_stats(OpContext *ctx);
void write_update_size_and_usage(object_stat_sum_t& stats, object_info_t& oi,
SnapSet& ss, interval_set<uint64_t>& modified,
uint64_t offset, uint64_t length, bool count_bytes);
- void add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& st);
+ void add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& st);
+
+ inline bool maybe_handle_cache(OpRequestRef op, ObjectContextRef obc, int r);
+ void do_cache_redirect(OpRequestRef op, ObjectContextRef obc);
int prepare_transaction(OpContext *ctx);
@@ -608,13 +854,19 @@ protected:
* @bi [out] resulting map of objects to eversion_t's
*/
void scan_range(
- hobject_t begin, int min, int max, BackfillInterval *bi,
+ int min, int max, BackfillInterval *bi,
ThreadPool::TPHandle &handle
);
+ /// Update a hash range to reflect changes since the last scan
+ void update_range(
+ BackfillInterval *bi, ///< [in,out] interval to update
+ ThreadPool::TPHandle &handle ///< [in] tp handle
+ );
+
void prep_backfill_object_push(
hobject_t oid, eversion_t v, eversion_t have, int peer,
- map<int, vector<PushOp> > *pushes);
+ PGBackend::RecoveryHandle *h);
void send_remove_op(const hobject_t& oid, eversion_t v, int peer);
@@ -651,12 +903,17 @@ protected:
}
};
struct C_OSD_OndiskWriteUnlock : public Context {
- ObjectContextRef obc, obc2;
- C_OSD_OndiskWriteUnlock(ObjectContextRef o, ObjectContextRef o2 = ObjectContextRef()) : obc(o), obc2(o2) {}
+ ObjectContextRef obc, obc2, obc3;
+ C_OSD_OndiskWriteUnlock(
+ ObjectContextRef o,
+ ObjectContextRef o2 = ObjectContextRef(),
+ ObjectContextRef o3 = ObjectContextRef()) : obc(o), obc2(o2), obc3(o3) {}
void finish(int r) {
obc->ondisk_write_unlock();
if (obc2)
obc2->ondisk_write_unlock();
+ if (obc3)
+ obc3->ondisk_write_unlock();
}
};
struct C_OSD_OndiskWriteUnlockList : public Context {
@@ -688,35 +945,6 @@ protected:
pg->_committed_pushed_object(epoch, last_complete);
}
};
- struct C_OSD_SendMessageOnConn: public Context {
- OSDService *osd;
- Message *reply;
- ConnectionRef conn;
- C_OSD_SendMessageOnConn(
- OSDService *osd,
- Message *reply,
- ConnectionRef conn) : osd(osd), reply(reply), conn(conn) {}
- void finish(int) {
- osd->send_message_osd_cluster(reply, conn.get());
- }
- };
- struct C_OSD_CompletedPull : public Context {
- ReplicatedPGRef pg;
- hobject_t hoid;
- epoch_t epoch;
- C_OSD_CompletedPull(
- ReplicatedPG *pg,
- const hobject_t &hoid,
- epoch_t epoch) : pg(pg), hoid(hoid), epoch(epoch) {}
- void finish(int) {
- pg->lock();
- if (!pg->pg_has_reset_since(epoch)) {
- pg->finish_recovery_op(hoid);
- }
- pg->unlock();
- }
- };
- friend struct C_OSD_CompletedPull;
struct C_OSD_AppliedRecoveredObjectReplica : public Context {
ReplicatedPGRef pg;
C_OSD_AppliedRecoveredObjectReplica(ReplicatedPG *p) :
@@ -737,15 +965,23 @@ protected:
void _applied_recovered_object_replica();
void _committed_pushed_object(epoch_t epoch, eversion_t lc);
void recover_got(hobject_t oid, eversion_t v);
- void sub_op_push(OpRequestRef op);
- void _failed_push(int from, const hobject_t &soid);
- void sub_op_push_reply(OpRequestRef op);
- bool handle_push_reply(int peer, PushReplyOp &op, PushOp *reply);
- void sub_op_pull(OpRequestRef op);
- void handle_pull(int peer, PullOp &op, PushOp *reply);
- void log_subop_stats(OpRequestRef op, int tag_inb, int tag_lat);
+ // -- copyfrom --
+ map<hobject_t, CopyOpRef> copy_ops;
+
+ int start_copy(CopyCallback *cb, ObjectContextRef obc, hobject_t src,
+ object_locator_t oloc, version_t version,
+ const hobject_t& temp_dest_oid);
+ void process_copy_chunk(hobject_t oid, tid_t tid, int r);
+ void _write_copy_chunk(CopyOpRef cop, ObjectStore::Transaction *t);
+ void _copy_some(ObjectContextRef obc, CopyOpRef cop);
+ void _build_finish_copy_transaction(CopyOpRef cop,
+ ObjectStore::Transaction& t);
+ int finish_copyfrom(OpContext *ctx);
+ void cancel_copy(CopyOpRef cop);
+ void cancel_copy_ops();
+ friend class C_Copyfrom;
// -- scrub --
virtual void _scrub(ScrubMap& map);
@@ -772,6 +1008,9 @@ public:
int do_command(cmdmap_t cmdmap, ostream& ss, bufferlist& idata,
bufferlist& odata);
+ void do_request(
+ OpRequestRef op,
+ ThreadPool::TPHandle &handle);
void do_op(OpRequestRef op);
bool pg_op_must_wait(MOSDOp *op);
void do_pg_op(OpRequestRef op);
@@ -781,17 +1020,7 @@ public:
OpRequestRef op,
ThreadPool::TPHandle &handle);
void do_backfill(OpRequestRef op);
- void _do_push(OpRequestRef op);
- void _do_pull_response(OpRequestRef op);
- void do_push(OpRequestRef op) {
- if (is_primary()) {
- _do_pull_response(op);
- } else {
- _do_push(op);
- }
- }
- void do_pull(OpRequestRef op);
- void do_push_reply(OpRequestRef op);
+
RepGather *trim_object(const hobject_t &coid);
void snap_trimmer();
int do_osd_ops(OpContext *ctx, vector<OSDOp>& ops);
@@ -801,13 +1030,27 @@ public:
void do_osd_op_effects(OpContext *ctx);
private:
- bool temp_created;
- coll_t temp_coll;
+ uint64_t temp_seq; ///< last id for naming temp objects
coll_t get_temp_coll(ObjectStore::Transaction *t);
+ hobject_t generate_temp_object(); ///< generate a new temp object name
public:
- bool have_temp_coll();
- coll_t get_temp_coll() {
- return temp_coll;
+  /// Append this PG's collection to *out, then let the backend append its
+  /// temp collections.
+  void get_colls(list<coll_t> *out) {
+    out->push_back(coll);
+    pgbackend->temp_colls(out);
+  }
+ void split_colls(
+ pg_t child,
+ int split_bits,
+ int seed,
+ ObjectStore::Transaction *t) {
+ coll_t target = coll_t(child);
+ t->create_collection(target);
+ t->split_collection(
+ coll,
+ split_bits,
+ seed,
+ target);
+ pgbackend->split_colls(child, split_bits, seed, t);
}
private:
struct NotTrimming;
@@ -863,7 +1106,6 @@ private:
int _get_tmap(OpContext *ctx, map<string, bufferlist> *out,
bufferlist *header);
- int _copy_up_tmap(OpContext *ctx);
int _delete_head(OpContext *ctx);
int _rollback_to(OpContext *ctx, ceph_osd_op& op);
public:
@@ -880,6 +1122,9 @@ public:
bool is_degraded_object(const hobject_t& oid);
void wait_for_degraded_object(const hobject_t& oid, OpRequestRef op);
+ void wait_for_blocked_object(const hobject_t& soid, OpRequestRef op);
+ void kick_object_context_blocked(ObjectContextRef obc);
+
void mark_all_unfound_lost(int what);
eversion_t pick_newest_available(const hobject_t& oid);
ObjectContextRef mark_object_lost(ObjectStore::Transaction *t,
@@ -890,7 +1135,10 @@ public:
void on_role_change();
void on_change(ObjectStore::Transaction *t);
void on_activate();
- void on_flushed();
+ void on_flushed() {
+ assert(object_contexts.empty());
+ pgbackend->on_flushed();
+ }
void on_removal(ObjectStore::Transaction *t);
void on_shutdown();
};
@@ -906,7 +1154,7 @@ inline ostream& operator<<(ostream& out, ReplicatedPG::RepGather& repop)
//<< " wfnvram=" << repop.waitfor_nvram
<< " wfdisk=" << repop.waitfor_disk;
if (repop.ctx->op)
- out << " op=" << *(repop.ctx->op->request);
+ out << " op=" << *(repop.ctx->op->get_req());
out << ")";
return out;
}
diff --git a/src/osd/SnapMapper.h b/src/osd/SnapMapper.h
index 560cc43497f..f0d0baa2190 100644
--- a/src/osd/SnapMapper.h
+++ b/src/osd/SnapMapper.h
@@ -21,7 +21,7 @@
#include <string.h>
#include "common/map_cacher.hpp"
-#include "os/hobject.h"
+#include "common/hobject.h"
#include "include/buffer.h"
#include "include/encoding.h"
#include "include/object.h"
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index ea3e5d5c3eb..1a9dde665cf 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -101,6 +101,41 @@ void object_locator_t::generate_test_instances(list<object_locator_t*>& o)
o.push_back(new object_locator_t(12, "n1", "key2"));
}
+// -- request_redirect_t --
+/// Append this redirect to bl: redirect_locator, redirect_object and
+/// osd_instructions, in that order, inside an ENCODE_START/ENCODE_FINISH
+/// envelope. decode() reads the fields back in the same order.
+void request_redirect_t::encode(bufferlist& bl) const
+{
+ // (1, 1): presumably struct version and compat version -- confirm in encoding.h
+ ENCODE_START(1, 1, bl);
+ ::encode(redirect_locator, bl);
+ ::encode(redirect_object, bl);
+ ::encode(osd_instructions, bl);
+ ENCODE_FINISH(bl);
+}
+
+/// Read back what encode() wrote: redirect_locator, redirect_object and
+/// osd_instructions, in that order, advancing the iterator bl.
+void request_redirect_t::decode(bufferlist::iterator& bl)
+{
+ DECODE_START(1, bl);
+ ::decode(redirect_locator, bl);
+ ::decode(redirect_object, bl);
+ ::decode(osd_instructions, bl);
+ DECODE_FINISH(bl);
+}
+
+/// Emit the redirect target to f: the object name as "object" and the
+/// locator inside a "locator" section.
+/// NOTE(review): osd_instructions is encoded/decoded but not dumped here --
+/// confirm that omission is intentional.
+void request_redirect_t::dump(Formatter *f) const
+{
+ f->dump_string("object", redirect_object);
+ f->open_object_section("locator");
+ redirect_locator.dump(f);
+ f->close_section(); // locator
+}
+
+/// Append heap-allocated sample instances to o: a default-constructed one
+/// and three built from the same locator with different constructor
+/// arguments. The caller receives raw pointers.
+void request_redirect_t::generate_test_instances(list<request_redirect_t*>& o)
+{
+ object_locator_t loc(1, "redir_obj");
+ o.push_back(new request_redirect_t());
+ o.push_back(new request_redirect_t(loc, 0));
+ o.push_back(new request_redirect_t(loc, "redir_obj"));
+ o.push_back(new request_redirect_t(loc));
+}
// -- pow2_hist_t --
void pow2_hist_t::dump(Formatter *f) const
@@ -358,6 +393,8 @@ ostream& operator<<(ostream& out, const pg_t &pg)
// -- coll_t --
+const coll_t coll_t::META_COLL("meta");
+
bool coll_t::is_temp(pg_t& pgid) const
{
const char *cstr(str.c_str());
@@ -618,6 +655,7 @@ void pool_snap_info_t::generate_test_instances(list<pool_snap_info_t*>& o)
void pg_pool_t::dump(Formatter *f) const
{
f->dump_unsigned("flags", get_flags());
+ f->dump_string("flags_names", get_flags_string());
f->dump_int("type", get_type());
f->dump_int("size", get_size());
f->dump_int("min_size", get_min_size());
@@ -641,6 +679,22 @@ void pg_pool_t::dump(Formatter *f) const
f->dump_stream("removed_snaps") << removed_snaps;
f->dump_int("quota_max_bytes", quota_max_bytes);
f->dump_int("quota_max_objects", quota_max_objects);
+ f->open_array_section("tiers");
+ for (set<uint64_t>::const_iterator p = tiers.begin(); p != tiers.end(); ++p)
+ f->dump_int("pool_id", *p);
+ f->close_section();
+ f->dump_int("tier_of", tier_of);
+ f->dump_int("read_tier", read_tier);
+ f->dump_int("write_tier", write_tier);
+ f->dump_string("cache_mode", get_cache_mode_name());
+ f->open_array_section("properties");
+ for (map<string,string>::const_iterator i = properties.begin();
+ i != properties.end();
+ ++i) {
+ string name = i->first;
+ f->dump_string(name.c_str(), i->second);
+ }
+ f->close_section();
}
@@ -845,7 +899,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
return;
}
- ENCODE_START(8, 5, bl);
+ ENCODE_START(10, 5, bl);
::encode(type, bl);
::encode(size, bl);
::encode(crush_ruleset, bl);
@@ -866,6 +920,13 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
::encode(min_size, bl);
::encode(quota_max_bytes, bl);
::encode(quota_max_objects, bl);
+ ::encode(tiers, bl);
+ ::encode(tier_of, bl);
+ __u8 c = cache_mode;
+ ::encode(c, bl);
+ ::encode(read_tier, bl);
+ ::encode(write_tier, bl);
+ ::encode(properties, bl);
ENCODE_FINISH(bl);
}
@@ -924,6 +985,18 @@ void pg_pool_t::decode(bufferlist::iterator& bl)
::decode(quota_max_bytes, bl);
::decode(quota_max_objects, bl);
}
+ if (struct_v >= 9) {
+ ::decode(tiers, bl);
+ ::decode(tier_of, bl);
+ __u8 v;
+ ::decode(v, bl);
+ cache_mode = (cache_mode_t)v;
+ ::decode(read_tier, bl);
+ ::decode(write_tier, bl);
+ }
+ if (struct_v >= 10) {
+ ::decode(properties, bl);
+ }
DECODE_FINISH(bl);
calc_pg_masks();
}
@@ -959,6 +1032,14 @@ void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
a.removed_snaps.insert(2); // not quite valid to combine with snaps!
a.quota_max_bytes = 2473;
a.quota_max_objects = 4374;
+ a.tiers.insert(0);
+ a.tiers.insert(1);
+ a.tier_of = 2;
+ a.cache_mode = CACHEMODE_WRITEBACK;
+ a.read_tier = 1;
+ a.write_tier = 1;
+ a.properties["p-1"] = "v-1";
+ a.properties["empty"] = string();
o.push_back(new pg_pool_t(a));
}
@@ -974,13 +1055,23 @@ ostream& operator<<(ostream& out, const pg_pool_t& p)
<< " last_change " << p.get_last_change()
<< " owner " << p.get_auid();
if (p.flags)
- out << " flags " << p.flags;
+ out << " flags " << p.get_flags_string();
if (p.crash_replay_interval)
out << " crash_replay_interval " << p.crash_replay_interval;
if (p.quota_max_bytes)
out << " max_bytes " << p.quota_max_bytes;
if (p.quota_max_objects)
out << " max_objects " << p.quota_max_objects;
+ if (p.tiers.size())
+ out << " tiers " << p.tiers;
+ if (p.is_tier())
+ out << " tier_of " << p.tier_of;
+ if (p.has_read_tier())
+ out << " read_tier " << p.read_tier;
+ if (p.has_write_tier())
+ out << " write_tier " << p.write_tier;
+ if (p.cache_mode)
+ out << " cache_mode " << p.get_cache_mode_name();
return out;
}
@@ -1553,7 +1644,7 @@ void pg_history_t::generate_test_instances(list<pg_history_t*>& o)
void pg_info_t::encode(bufferlist &bl) const
{
- ENCODE_START(27, 26, bl);
+ ENCODE_START(28, 26, bl);
::encode(pgid, bl);
::encode(last_update, bl);
::encode(last_complete, bl);
@@ -1563,12 +1654,13 @@ void pg_info_t::encode(bufferlist &bl) const
history.encode(bl);
::encode(purged_snaps, bl);
::encode(last_epoch_started, bl);
+ ::encode(last_user_version, bl);
ENCODE_FINISH(bl);
}
void pg_info_t::decode(bufferlist::iterator &bl)
{
- DECODE_START_LEGACY_COMPAT_LEN(27, 26, 26, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(28, 26, 26, bl);
if (struct_v < 23) {
old_pg_t opgid;
::decode(opgid, bl);
@@ -1598,6 +1690,10 @@ void pg_info_t::decode(bufferlist::iterator &bl)
} else {
::decode(last_epoch_started, bl);
}
+ if (struct_v >= 28)
+ ::decode(last_user_version, bl);
+ else
+ last_user_version = last_update.version;
DECODE_FINISH(bl);
}
@@ -1609,6 +1705,7 @@ void pg_info_t::dump(Formatter *f) const
f->dump_stream("last_update") << last_update;
f->dump_stream("last_complete") << last_complete;
f->dump_stream("log_tail") << log_tail;
+ f->dump_int("last_user_version", last_user_version);
f->dump_stream("last_backfill") << last_backfill;
f->dump_stream("purged_snaps") << purged_snaps;
f->open_object_section("history");
@@ -1634,6 +1731,7 @@ void pg_info_t::generate_test_instances(list<pg_info_t*>& o)
o.back()->pgid = pg_t(1, 2, -1);
o.back()->last_update = eversion_t(3, 4);
o.back()->last_complete = eversion_t(5, 6);
+ o.back()->last_user_version = 2;
o.back()->log_tail = eversion_t(7, 8);
o.back()->last_backfill = hobject_t(object_t("objname"), "key", 123, 456, -1, "");
list<pg_stat_t*> s;
@@ -1912,7 +2010,7 @@ void pg_log_entry_t::decode_with_checksum(bufferlist::iterator& p)
void pg_log_entry_t::encode(bufferlist &bl) const
{
- ENCODE_START(7, 4, bl);
+ ENCODE_START(8, 4, bl);
::encode(op, bl);
::encode(soid, bl);
::encode(version, bl);
@@ -1934,12 +2032,13 @@ void pg_log_entry_t::encode(bufferlist &bl) const
if (op == LOST_REVERT)
::encode(prior_version, bl);
::encode(snaps, bl);
+ ::encode(user_version, bl);
ENCODE_FINISH(bl);
}
void pg_log_entry_t::decode(bufferlist::iterator &bl)
{
- DECODE_START_LEGACY_COMPAT_LEN(7, 4, 4, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(8, 4, 4, bl);
::decode(op, bl);
if (struct_v < 2) {
sobject_t old_soid;
@@ -1976,6 +2075,11 @@ void pg_log_entry_t::decode(bufferlist::iterator &bl)
::decode(snaps, bl);
}
+ if (struct_v >= 8)
+ ::decode(user_version, bl);
+ else
+ user_version = version.version;
+
DECODE_FINISH(bl);
}
@@ -2008,7 +2112,8 @@ void pg_log_entry_t::generate_test_instances(list<pg_log_entry_t*>& o)
o.push_back(new pg_log_entry_t());
hobject_t oid(object_t("objname"), "key", 123, 456, 0, "");
o.push_back(new pg_log_entry_t(MODIFY, oid, eversion_t(1,2), eversion_t(3,4),
- osd_reqid_t(entity_name_t::CLIENT(777), 8, 999), utime_t(8,9)));
+ 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
+ utime_t(8,9)));
}
ostream& operator<<(ostream& out, const pg_log_entry_t& e)
@@ -2369,6 +2474,55 @@ void pg_missing_t::split_into(
}
}
+// -- object_copy_cursor_t --
+
+// Serialize this cursor into bl. The five fields are written in exactly the
+// order decode() reads them back, so the two functions must change together.
+void object_copy_cursor_t::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ ::encode(attr_complete, bl);
+ ::encode(data_offset, bl);
+ ::encode(data_complete, bl);
+ ::encode(omap_offset, bl);
+ ::encode(omap_complete, bl);
+ ENCODE_FINISH(bl);
+}
+
+// Restore a cursor from the iterator. Fields are read in the same order
+// encode() writes them: attr_complete, data_offset, data_complete,
+// omap_offset, omap_complete.
+void object_copy_cursor_t::decode(bufferlist::iterator &bl)
+{
+ DECODE_START(1, bl);
+ ::decode(attr_complete, bl);
+ ::decode(data_offset, bl);
+ ::decode(data_complete, bl);
+ ::decode(omap_offset, bl);
+ ::decode(omap_complete, bl);
+ DECODE_FINISH(bl);
+}
+
+// Write all five cursor fields to the formatter. The boolean completion
+// flags are cast to int and emitted through dump_unsigned().
+void object_copy_cursor_t::dump(Formatter *f) const
+{
+  const int attrs_done = (int)attr_complete;
+  const int data_done = (int)data_complete;
+  const int omap_done = (int)omap_complete;
+
+  f->dump_unsigned("attr_complete", attrs_done);
+  f->dump_unsigned("data_offset", data_offset);
+  f->dump_unsigned("data_complete", data_done);
+  f->dump_string("omap_offset", omap_offset);
+  f->dump_unsigned("omap_complete", omap_done);
+}
+
+// Append four sample cursors to o, in this order: a default-constructed one,
+// one with attrs done and a data offset, one with attrs and data done plus an
+// omap offset, and one with every completion flag set.
+void object_copy_cursor_t::generate_test_instances(list<object_copy_cursor_t*>& o)
+{
+  object_copy_cursor_t *c;
+
+  // 1: default-constructed
+  o.push_back(new object_copy_cursor_t);
+
+  // 2: attrs complete, data_offset = 123
+  o.push_back(new object_copy_cursor_t);
+  c = o.back();
+  c->attr_complete = true;
+  c->data_offset = 123;
+
+  // 3: attrs and data complete, omap_offset = "foo"
+  o.push_back(new object_copy_cursor_t);
+  c = o.back();
+  c->attr_complete = true;
+  c->data_complete = true;
+  c->omap_offset = "foo";
+
+  // 4: all three completion flags set
+  o.push_back(new object_copy_cursor_t);
+  c = o.back();
+  c->attr_complete = true;
+  c->data_complete = true;
+  c->omap_complete = true;
+}
+
// -- pg_create_t --
void pg_create_t::encode(bufferlist &bl) const
@@ -2643,9 +2797,8 @@ void object_info_t::copy_user_bits(const object_info_t& other)
last_reqid = other.last_reqid;
truncate_seq = other.truncate_seq;
truncate_size = other.truncate_size;
- lost = other.lost;
+ flags = other.flags;
category = other.category;
- uses_tmap = other.uses_tmap;
}
ps_t object_info_t::legacy_object_locator_to_ps(const object_t &oid,
@@ -2671,7 +2824,7 @@ void object_info_t::encode(bufferlist& bl) const
++i) {
old_watchers.insert(make_pair(i->first.second, i->second));
}
- ENCODE_START(11, 8, bl);
+ ENCODE_START(12, 8, bl);
::encode(soid, bl);
::encode(myoloc, bl); //Retained for compatibility
::encode(category, bl);
@@ -2686,10 +2839,15 @@ void object_info_t::encode(bufferlist& bl) const
::encode(snaps, bl);
::encode(truncate_seq, bl);
::encode(truncate_size, bl);
- ::encode(lost, bl);
+ __u8 flags_lo = flags & 0xff;
+ __u8 flags_hi = (flags & 0xff00) >> 8;
+ ::encode(flags_lo, bl);
::encode(old_watchers, bl);
- ::encode(user_version, bl);
- ::encode(uses_tmap, bl);
+ /* shenanigans to avoid breaking backwards compatibility in the disk format.
+ * When we can, switch this out for simply putting the version_t on disk. */
+ eversion_t user_eversion(0, user_version);
+ ::encode(user_eversion, bl);
+ ::encode(flags_hi, bl);
::encode(watchers, bl);
ENCODE_FINISH(bl);
}
@@ -2697,7 +2855,7 @@ void object_info_t::encode(bufferlist& bl) const
void object_info_t::decode(bufferlist::iterator& bl)
{
object_locator_t myoloc;
- DECODE_START_LEGACY_COMPAT_LEN(11, 8, 8, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(12, 8, 8, bl);
map<entity_name_t, watch_info_t> old_watchers;
if (struct_v >= 2 && struct_v <= 5) {
sobject_t obj;
@@ -2727,18 +2885,26 @@ void object_info_t::decode(bufferlist::iterator& bl)
::decode(snaps, bl);
::decode(truncate_seq, bl);
::decode(truncate_size, bl);
- if (struct_v >= 3)
- ::decode(lost, bl);
- else
- lost = false;
+ if (struct_v >= 3) {
+ __u8 lo;
+ ::decode(lo, bl);
+ flags = (flag_t)lo;
+ } else {
+ flags = (flag_t)0;
+ }
if (struct_v >= 4) {
::decode(old_watchers, bl);
- ::decode(user_version, bl);
+ eversion_t user_eversion;
+ ::decode(user_eversion, bl);
+ user_version = user_eversion.version;
+ }
+ if (struct_v >= 9) {
+ __u8 hi;
+ ::decode(hi, bl);
+ flags = (flag_t)(flags | ((unsigned)hi << 8));
+ } else {
+ set_flag(FLAG_USES_TMAP);
}
- if (struct_v >= 9)
- ::decode(uses_tmap, bl);
- else
- uses_tmap = true;
if (struct_v < 10)
soid.pool = myoloc.pool;
if (struct_v >= 11) {
@@ -2766,7 +2932,8 @@ void object_info_t::dump(Formatter *f) const
f->dump_stream("last_reqid") << last_reqid;
f->dump_unsigned("size", size);
f->dump_stream("mtime") << mtime;
- f->dump_unsigned("lost", lost);
+ f->dump_unsigned("lost", (int)is_lost());
+ f->dump_unsigned("flags", (int)flags);
f->dump_stream("wrlock_by") << wrlock_by;
f->open_array_section("snaps");
for (vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p)
@@ -2802,8 +2969,8 @@ ostream& operator<<(ostream& out, const object_info_t& oi)
out << " wrlock_by=" << oi.wrlock_by;
else
out << " " << oi.snaps;
- if (oi.lost)
- out << " LOST";
+ if (oi.flags)
+ out << " " << oi.get_flag_string();
out << ")";
return out;
}
@@ -3357,6 +3524,11 @@ ostream& operator<<(ostream& out, const OSDOp& op)
case CEPH_OSD_OP_DELETE:
case CEPH_OSD_OP_LIST_WATCHERS:
case CEPH_OSD_OP_LIST_SNAPS:
+ case CEPH_OSD_OP_UNDIRTY:
+ case CEPH_OSD_OP_ISDIRTY:
+ break;
+ case CEPH_OSD_OP_ASSERT_VER:
+ out << " v" << op.op.assert_ver.ver;
break;
case CEPH_OSD_OP_TRUNCATE:
out << " " << op.op.extent.offset;
@@ -3372,6 +3544,11 @@ ostream& operator<<(ostream& out, const OSDOp& op)
out << (op.op.watch.flag ? " add":" remove")
<< " cookie " << op.op.watch.cookie << " ver " << op.op.watch.ver;
break;
+ case CEPH_OSD_OP_COPY_GET:
+ out << " max " << op.op.copy_get.max;
+ // stop here: falling through would also print the COPY_FROM field
+ // (copy_from.src_version) for a COPY_GET op
+ break;
+ case CEPH_OSD_OP_COPY_FROM:
+ out << " ver " << op.op.copy_from.src_version;
+ break;
default:
out << " " << op.op.extent.offset << "~" << op.op.extent.length;
if (op.op.extent.truncate_seq)
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 6cdacc9902c..8ceeb539c1a 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -23,10 +23,11 @@
#include "include/types.h"
#include "include/utime.h"
#include "include/CompatSet.h"
+#include "include/histogram.h"
#include "include/interval_set.h"
#include "common/snap_types.h"
#include "common/Formatter.h"
-#include "os/hobject.h"
+#include "common/hobject.h"
#include "Watch.h"
#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026"
@@ -41,10 +42,12 @@
#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo")
#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog")
#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
+#define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
typedef hobject_t collection_list_handle_t;
+typedef uint8_t shard_id_t;
/**
* osd request identifier
@@ -129,6 +132,10 @@ struct object_locator_t {
nspace = "";
}
+ /// true when pool is -1 (presumably the "no pool set" sentinel -- confirm
+ /// against the constructors, which are outside this hunk)
+ bool empty() const {
+ return pool == -1;
+ }
+
void encode(bufferlist& bl) const;
void decode(bufferlist::iterator& p);
void dump(Formatter *f) const;
@@ -153,6 +160,47 @@ inline ostream& operator<<(ostream& out, const object_locator_t& loc)
return out;
}
+/**
+ * request_redirect_t
+ *
+ * Describes where a request should be re-sent: a replacement object
+ * locator, optionally a replacement object name, and an opaque bufferlist
+ * of instructions that is carried along for the OSDs.
+ */
+struct request_redirect_t {
+private:
+  object_locator_t redirect_locator; ///< this is authoritative
+  string redirect_object; ///< If non-empty, the request goes to this object name
+  bufferlist osd_instructions; ///< a bufferlist for the OSDs, passed but not interpreted by clients
+
+  friend ostream& operator<<(ostream& out, const request_redirect_t& redir);
+public:
+
+  /// empty redirect: default locator, no object name, no instructions
+  request_redirect_t() {}
+  /// same locator as orig, but with the pool replaced by rpool
+  explicit request_redirect_t(const object_locator_t& orig, int64_t rpool) :
+    redirect_locator(orig) { redirect_locator.pool = rpool; }
+  /// redirect to exactly the locator rloc
+  explicit request_redirect_t(const object_locator_t& rloc) :
+    redirect_locator(rloc) {}
+  /// keep the locator orig, but send the request to the object named robj
+  explicit request_redirect_t(const object_locator_t& orig,
+                              const string& robj) :
+    redirect_locator(orig), redirect_object(robj) {}
+
+  void set_instructions(const bufferlist& bl) { osd_instructions = bl; }
+  /// read-only view of the OSD instructions; const so that it can also be
+  /// called on a const request_redirect_t
+  const bufferlist& get_instructions() const { return osd_instructions; }
+
+  /// true when neither a locator nor an object name has been set
+  bool empty() const { return redirect_locator.empty() &&
+                              redirect_object.empty(); }
+
+  /**
+   * Apply this redirect to a request's target.
+   *
+   * @param orig overwritten unconditionally with the redirect locator
+   * @param obj  overwritten with the redirect object name only when that
+   *             name is non-empty; otherwise left untouched
+   */
+  void combine_with_locator(object_locator_t& orig, string& obj) const {
+    orig = redirect_locator;
+    if (!redirect_object.empty())
+      obj = redirect_object;
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::iterator& bl);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<request_redirect_t*>& o);
+};
+WRITE_CLASS_ENCODER(request_redirect_t)
+
+/// print a redirect as: object <name>, locator{<locator>}
+inline ostream& operator<<(ostream& out, const request_redirect_t& redir) {
+  return out << "object " << redir.redirect_object
+             << ", locator{" << redir.redirect_locator << "}";
+}
// Internal OSD op flags - set by the OSD based on the op types
enum {
@@ -508,67 +556,6 @@ inline ostream& operator<<(ostream& out, const eversion_t e) {
return out << e.epoch << "'" << e.version;
}
-
-/**
- * power of 2 histogram
- */
-struct pow2_hist_t {
- /**
- * histogram
- *
- * bin size is 2^index
- * value is count of elements that are <= the current bin but > the previous bin.
- */
- vector<int32_t> h;
-
-private:
- /// expand to at least another's size
- void _expand_to(unsigned s) {
- if (s > h.size())
- h.resize(s, 0);
- }
- /// drop useless trailing 0's
- void _contract() {
- unsigned p = h.size();
- while (p > 0 && h[p-1] == 0)
- --p;
- h.resize(p);
- }
-
-public:
- void clear() {
- h.clear();
- }
- void set(int bin, int32_t v) {
- _expand_to(bin + 1);
- h[bin] = v;
- _contract();
- }
-
- void add(const pow2_hist_t& o) {
- _expand_to(o.h.size());
- for (unsigned p = 0; p < o.h.size(); ++p)
- h[p] += o.h[p];
- _contract();
- }
- void sub(const pow2_hist_t& o) {
- _expand_to(o.h.size());
- for (unsigned p = 0; p < o.h.size(); ++p)
- h[p] -= o.h[p];
- _contract();
- }
-
- int32_t upper_bound() const {
- return 1 << h.size();
- }
-
- void dump(Formatter *f) const;
- void encode(bufferlist &bl) const;
- void decode(bufferlist::iterator &bl);
- static void generate_test_instances(std::list<pow2_hist_t*>& o);
-};
-WRITE_CLASS_ENCODER(pow2_hist_t)
-
/**
* filestore_perf_stat_t
*
@@ -722,11 +709,6 @@ struct pg_pool_t {
TYPE_REP = 1, // replication
TYPE_RAID4 = 2, // raid4 (never implemented)
};
- enum {
- FLAG_HASHPSPOOL = 1, // hash pg seed and pool together (instead of adding)
- FLAG_FULL = 2, // pool is full
- };
-
static const char *get_type_name(int t) {
switch (t) {
case TYPE_REP: return "rep";
@@ -738,6 +720,63 @@ struct pg_pool_t {
return get_type_name(type);
}
+ enum {
+ FLAG_HASHPSPOOL = 1, // hash pg seed and pool together (instead of adding)
+ FLAG_FULL = 2, // pool is full
+ };
+
+ /// name of one FLAG_* value; "???" for anything that is not a known flag
+ static const char *get_flag_name(int f) {
+   if (f == FLAG_HASHPSPOOL)
+     return "hashpspool";
+   if (f == FLAG_FULL)
+     return "full";
+   return "???";
+ }
+ /// comma-separated get_flag_name() of every bit set in f, lowest bit
+ /// first; empty string when no bit is set
+ static string get_flags_string(uint64_t f) {
+   string names;
+   if (!f)
+     return names;
+   for (unsigned bit = 0; bit < 64; ++bit) {
+     if (!(f & (1ull << bit)))
+       continue;
+     if (names.length())
+       names += ",";
+     names += get_flag_name(1ull << bit);
+   }
+   return names;
+ }
+ /// convenience overload: format this pool's own flags field
+ string get_flags_string() const {
+ return get_flags_string(flags);
+ }
+
+ typedef enum {
+ CACHEMODE_NONE = 0, ///< no caching
+ CACHEMODE_WRITEBACK = 1, ///< write to cache, flush later
+ CACHEMODE_INVALIDATE_FORWARD = 2, ///< delete from cache, forward write
+ CACHEMODE_READONLY = 3, ///< handle reads, forward writes [not strongly consistent]
+ } cache_mode_t;
+ /// printable name of a cache mode; "unknown" for values outside the enum
+ static const char *get_cache_mode_name(cache_mode_t m) {
+   if (m == CACHEMODE_NONE)
+     return "none";
+   if (m == CACHEMODE_WRITEBACK)
+     return "writeback";
+   if (m == CACHEMODE_INVALIDATE_FORWARD)
+     return "invalidate+forward";
+   if (m == CACHEMODE_READONLY)
+     return "readonly";
+   return "unknown";
+ }
+ /// parse a cache mode name (exact match); returns (cache_mode_t)-1 when
+ /// the string is not one of the four recognized names
+ static cache_mode_t get_cache_mode_from_str(const string& s) {
+   static const struct {
+     const char *name;
+     cache_mode_t mode;
+   } known[] = {
+     { "none", CACHEMODE_NONE },
+     { "writeback", CACHEMODE_WRITEBACK },
+     { "invalidate+forward", CACHEMODE_INVALIDATE_FORWARD },
+     { "readonly", CACHEMODE_READONLY }
+   };
+   for (unsigned i = 0; i < sizeof(known) / sizeof(known[0]); ++i) {
+     if (s == known[i].name)
+       return known[i].mode;
+   }
+   return (cache_mode_t)-1;
+ }
+ /// convenience overload: name of this pool's own cache_mode
+ const char *get_cache_mode_name() const {
+ return get_cache_mode_name(cache_mode);
+ }
+
uint64_t flags; /// FLAG_*
__u8 type; /// TYPE_*
__u8 size, min_size; /// number of osds in each pg
@@ -745,7 +784,10 @@ struct pg_pool_t {
__u8 object_hash; /// hash mapping object name to ps
private:
__u32 pg_num, pgp_num; /// number of pgs
+
+
public:
+ map<string,string> properties; /// interpreted according to the pool type
epoch_t last_change; /// most recent epoch changed, exclusing snapshot changes
snapid_t snap_seq; /// seq for per-pool snapshot
epoch_t snap_epoch; /// osdmap epoch of last snap
@@ -771,6 +813,21 @@ public:
int pg_num_mask, pgp_num_mask;
+ set<uint64_t> tiers; ///< pools that are tiers of us
+ int64_t tier_of; ///< pool for which we are a tier
+ // Note that write wins for read+write ops
+ int64_t read_tier; ///< pool/tier for objecter to direct reads to
+ int64_t write_tier; ///< pool/tier for objecter to direct writes to
+ cache_mode_t cache_mode; ///< cache pool mode
+
+
+ // tier_of, read_tier and write_tier treat a negative value as "unset":
+ // the is_/has_ tests check for >= 0 and the clear_*() helpers store -1.
+ bool is_tier() const { return tier_of >= 0; }
+ void clear_tier() { tier_of = -1; }
+ bool has_read_tier() const { return read_tier >= 0; }
+ void clear_read_tier() { read_tier = -1; }
+ bool has_write_tier() const { return write_tier >= 0; }
+ void clear_write_tier() { write_tier = -1; }
+
pg_pool_t()
: flags(0), type(0), size(0), min_size(0),
crush_ruleset(0), object_hash(0),
@@ -780,7 +837,10 @@ public:
auid(0),
crash_replay_interval(0),
quota_max_bytes(0), quota_max_objects(0),
- pg_num_mask(0), pgp_num_mask(0) { }
+ pg_num_mask(0), pgp_num_mask(0),
+ tier_of(-1), read_tier(-1), write_tier(-1),
+ cache_mode(CACHEMODE_NONE)
+ { }
void dump(Formatter *f) const;
@@ -1267,6 +1327,8 @@ struct pg_info_t {
eversion_t last_complete; // last version pg was complete through.
epoch_t last_epoch_started;// last epoch at which this pg started on this osd
+ version_t last_user_version; // last user object version applied to store
+
eversion_t log_tail; // oldest log entry.
hobject_t last_backfill; // objects >= this and < last_complete may be missing
@@ -1278,11 +1340,13 @@ struct pg_info_t {
pg_history_t history;
pg_info_t()
- : last_epoch_started(0), last_backfill(hobject_t::get_max())
+ : last_epoch_started(0), last_user_version(0),
+ last_backfill(hobject_t::get_max())
{ }
pg_info_t(pg_t p)
: pgid(p),
- last_epoch_started(0), last_backfill(hobject_t::get_max())
+ last_epoch_started(0), last_user_version(0),
+ last_backfill(hobject_t::get_max())
{ }
bool is_empty() const { return last_update.version == 0; }
@@ -1481,6 +1545,7 @@ struct pg_log_entry_t {
__s32 op;
hobject_t soid;
eversion_t version, prior_version, reverting_to;
+ version_t user_version; // the user version for this entry
osd_reqid_t reqid; // caller+tid to uniquely identify request
utime_t mtime; // this is the _user_ mtime, mind you
bufferlist snaps; // only for clone entries
@@ -1490,12 +1555,14 @@ struct pg_log_entry_t {
uint64_t offset; // [soft state] my offset on disk
pg_log_entry_t()
- : op(0), invalid_hash(false), invalid_pool(false), offset(0) {}
+ : op(0), user_version(0),
+ invalid_hash(false), invalid_pool(false), offset(0) {}
pg_log_entry_t(int _op, const hobject_t& _soid,
const eversion_t& v, const eversion_t& pv,
+ version_t uv,
const osd_reqid_t& rid, const utime_t& mt)
: op(_op), soid(_soid), version(v),
- prior_version(pv),
+ prior_version(pv), user_version(uv),
reqid(rid), mtime(mt), invalid_hash(false), invalid_pool(false),
offset(0) {}
@@ -1759,6 +1826,37 @@ struct pg_ls_response_t {
WRITE_CLASS_ENCODER(pg_ls_response_t)
+/**
+ * object_copy_cursor_t
+ */
+struct object_copy_cursor_t {
+  // Progress markers for the three parts tracked by the cursor: attrs,
+  // data and omap. (Exact meaning of the two offsets is defined by the
+  // code that advances the cursor, which is not part of this struct.)
+  bool attr_complete;
+  uint64_t data_offset;
+  bool data_complete;
+  string omap_offset;
+  bool omap_complete;
+
+  /// starts with nothing complete, data_offset 0 and an empty omap_offset
+  object_copy_cursor_t()
+    : attr_complete(false), data_offset(0),
+      data_complete(false), omap_complete(false)
+  {}
+
+  /// true while attrs are not complete and both offsets are still at
+  /// their starting values
+  bool is_initial() const {
+    if (attr_complete)
+      return false;
+    if (data_offset != 0)
+      return false;
+    return omap_offset.empty();
+  }
+  /// true once all three completion flags are set
+  bool is_complete() const {
+    if (!attr_complete)
+      return false;
+    if (!data_complete)
+      return false;
+    return omap_complete;
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::iterator &bl);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<object_copy_cursor_t*>& o);
+};
+WRITE_CLASS_ENCODER(object_copy_cursor_t)
+
/**
* pg creation info
@@ -1952,27 +2050,73 @@ struct object_info_t {
string category;
eversion_t version, prior_version;
- eversion_t user_version;
+ version_t user_version;
osd_reqid_t last_reqid;
uint64_t size;
utime_t mtime;
- bool lost;
+
+ // note: these are currently encoded into a total 16 bits; see
+ // encode()/decode() for the weirdness.
+ typedef enum {
+ FLAG_LOST = 1<<0,
+ FLAG_WHITEOUT = 1<<1, // object logically does not exist
+ FLAG_DIRTY = 1<<2, // object has been modified since last flushed or undirtied
+ // ...
+ FLAG_USES_TMAP = 1<<8, // deprecated; no longer used.
+ } flag_t;
+
+ flag_t flags;
+
+ /// '|'-separated names of the known flags set in flags, in the fixed
+ /// order lost, whiteout, dirty, uses_tmap; empty string when none is set
+ static string get_flag_string(flag_t flags) {
+   static const struct {
+     flag_t bit;
+     const char *label;   // carries its leading '|' separator
+   } known[] = {
+     { FLAG_LOST, "|lost" },
+     { FLAG_WHITEOUT, "|whiteout" },
+     { FLAG_DIRTY, "|dirty" },
+     { FLAG_USES_TMAP, "|uses_tmap" }
+   };
+   string joined;
+   for (unsigned i = 0; i < sizeof(known) / sizeof(known[0]); ++i) {
+     if (flags & known[i].bit)
+       joined += known[i].label;
+   }
+   // drop the separator in front of the first label, if there is one
+   return joined.length() ? joined.substr(1) : joined;
+ }
+ /// convenience overload: format this object's own flags field
+ string get_flag_string() const {
+ return get_flag_string(flags);
+ }
osd_reqid_t wrlock_by; // [head]
vector<snapid_t> snaps; // [clone]
uint64_t truncate_seq, truncate_size;
-
map<pair<uint64_t, entity_name_t>, watch_info_t> watchers;
- bool uses_tmap;
void copy_user_bits(const object_info_t& other);
static ps_t legacy_object_locator_to_ps(const object_t &oid,
const object_locator_t &loc);
+ /// true only when every bit of f is set in flags
+ bool test_flag(flag_t f) const {
+ return (flags & f) == f;
+ }
+ /// turn on the bits of f, leaving all other flags untouched
+ void set_flag(flag_t f) {
+ flags = (flag_t)(flags | f);
+ }
+ /// turn off the bits of f, leaving all other flags untouched
+ void clear_flag(flag_t f) {
+ flags = (flag_t)(flags & ~f);
+ }
+ /// FLAG_LOST is set
+ bool is_lost() const {
+ return test_flag(FLAG_LOST);
+ }
+ /// FLAG_WHITEOUT is set
+ bool is_whiteout() const {
+ return test_flag(FLAG_WHITEOUT);
+ }
+ /// FLAG_DIRTY is set
+ bool is_dirty() const {
+ return test_flag(FLAG_DIRTY);
+ }
+
void encode(bufferlist& bl) const;
void decode(bufferlist::iterator& bl);
void decode(bufferlist& bl) {
@@ -1983,13 +2127,14 @@ struct object_info_t {
static void generate_test_instances(list<object_info_t*>& o);
explicit object_info_t()
- : size(0), lost(false),
- truncate_seq(0), truncate_size(0), uses_tmap(false)
+ : user_version(0), size(0), flags((flag_t)0),
+ truncate_seq(0), truncate_size(0)
{}
object_info_t(const hobject_t& s)
- : soid(s), size(0),
- lost(false), truncate_seq(0), truncate_size(0), uses_tmap(false) {}
+ : soid(s),
+ user_version(0), size(0), flags((flag_t)0),
+ truncate_seq(0), truncate_size(0) {}
object_info_t(bufferlist& bl) {
decode(bl);
@@ -1999,7 +2144,7 @@ WRITE_CLASS_ENCODER(object_info_t)
struct ObjectState {
object_info_t oi;
- bool exists;
+ bool exists; ///< the stored object exists (i.e., we will remember the object_info_t)
ObjectState() : exists(false) {}
@@ -2042,6 +2187,9 @@ public:
Cond cond;
int unstable_writes, readers, writers_waiting, readers_waiting;
+ /// in-progress copyfrom ops for this object
+ int copyfrom_readside;
+
// set if writes for this object are blocked on another objects recovery
ObjectContextRef blocked_by; // object blocking our writes
set<ObjectContextRef> blocking; // objects whose writes we block
@@ -2053,12 +2201,18 @@ public:
: ssc(NULL),
destructor_callback(0),
lock("ReplicatedPG::ObjectContext::lock"),
- unstable_writes(0), readers(0), writers_waiting(0), readers_waiting(0) {}
+ unstable_writes(0), readers(0), writers_waiting(0), readers_waiting(0),
+ copyfrom_readside(0) {}
~ObjectContext() {
if (destructor_callback)
destructor_callback->complete(0);
}
+
+ /// true while at least one copyfrom op is counted in copyfrom_readside
+ bool is_blocked() const {
+ return copyfrom_readside > 0;
+ }
+
// do simple synchronous mutual exclusion, for now. now waitqueues or anything fancy.
void ondisk_write_lock() {
lock.Lock();
diff --git a/src/osdc/Makefile.am b/src/osdc/Makefile.am
new file mode 100644
index 00000000000..3a8a2165aaa
--- /dev/null
+++ b/src/osdc/Makefile.am
@@ -0,0 +1,17 @@
+# libosdc.la is built from the osdc/ sources below and is listed under
+# noinst_LTLIBRARIES, so it is built but not installed.
+libosdc_la_SOURCES = \
+ osdc/Objecter.cc \
+ osdc/ObjectCacher.cc \
+ osdc/Filer.cc \
+ osdc/Striper.cc \
+ osdc/Journaler.cc
+noinst_LTLIBRARIES += libosdc.la
+
+# osdc headers; noinst_HEADERS are not installed either
+noinst_HEADERS += \
+ osdc/Blinker.h \
+ osdc/Filer.h \
+ osdc/Journaler.h \
+ osdc/ObjectCacher.h \
+ osdc/Objecter.h \
+ osdc/Striper.h \
+ osdc/WritebackHandler.h
+
diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc
index 01eeccc03be..81335b7957f 100644
--- a/src/osdc/ObjectCacher.cc
+++ b/src/osdc/ObjectCacher.cc
@@ -11,6 +11,8 @@
#include "include/assert.h"
+#define MAX_FLUSH_UNDER_LOCK 20 ///< max bh's we start writeback on while holding the lock
+
/*** ObjectCacher::BufferHead ***/
@@ -899,11 +901,10 @@ void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid, loff_t start,
ob->last_commit_tid = tid;
// waiters?
+ list<Context*> ls;
if (ob->waitfor_commit.count(tid)) {
- list<Context*> ls;
ls.splice(ls.begin(), ob->waitfor_commit[tid]);
ob->waitfor_commit.erase(tid);
- finish_contexts(cct, ls, r);
}
// is the entire object set now clean and fully committed?
@@ -915,6 +916,9 @@ void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid, loff_t start,
oset->dirty_or_tx == 0) { // nothing dirty/tx
flush_set_callback(flush_set_callback_arg, oset);
}
+
+ if (!ls.empty())
+ finish_contexts(cct, ls, r);
}
}
@@ -1446,8 +1450,10 @@ void ObjectCacher::flusher_entry()
utime_t cutoff = ceph_clock_now(cct);
cutoff -= max_dirty_age;
BufferHead *bh = 0;
+ int max = MAX_FLUSH_UNDER_LOCK;
while ((bh = static_cast<BufferHead*>(bh_lru_dirty.lru_get_next_expire())) != 0 &&
- bh->last_write < cutoff) {
+ bh->last_write < cutoff &&
+ --max > 0) {
ldout(cct, 10) << "flusher flushing aged dirty bh " << *bh << dendl;
bh_write(bh);
}
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index f94dc7baf6c..d2c574d982e 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -229,7 +229,8 @@ void Objecter::init_locked()
assert(!initialized);
schedule_tick();
- maybe_request_map();
+ if (osdmap->get_epoch() == 0)
+ maybe_request_map();
initialized = true;
}
@@ -296,7 +297,7 @@ void Objecter::send_linger(LingerOp *info)
if (ops.count(info->register_tid)) {
Op *o = ops[info->register_tid];
op_cancel_map_check(o);
- cancel_op(o);
+ cancel_linger_op(o);
}
info->register_tid = _op_submit(o);
} else {
@@ -357,7 +358,7 @@ tid_t Objecter::linger_mutate(const object_t& oid, const object_locator_t& oloc,
const SnapContext& snapc, utime_t mtime,
bufferlist& inbl, int flags,
Context *onack, Context *oncommit,
- eversion_t *objver)
+ version_t *objver)
{
LingerOp *info = new LingerOp;
info->oid = oid;
@@ -388,7 +389,7 @@ tid_t Objecter::linger_read(const object_t& oid, const object_locator_t& oloc,
ObjectOperation& op,
snapid_t snap, bufferlist& inbl, bufferlist *poutbl, int flags,
Context *onfinish,
- eversion_t *objver)
+ version_t *objver)
{
LingerOp *info = new LingerOp;
info->oid = oid;
@@ -658,7 +659,7 @@ void Objecter::handle_osd_map(MOSDMap *m)
send_op(op);
}
} else {
- cancel_op(op);
+ cancel_linger_op(op);
}
}
for (list<LingerOp*>::iterator p = need_resend_linger.begin(); p != need_resend_linger.end(); ++p) {
@@ -1002,7 +1003,7 @@ void Objecter::kick_requests(OSDSession *session)
if (op->should_resend) {
resend[op->tid] = op;
} else {
- cancel_op(op);
+ cancel_linger_op(op);
}
}
while (!resend.empty()) {
@@ -1242,8 +1243,8 @@ tid_t Objecter::_op_submit(Op *op)
}
// send?
- ldout(cct, 10) << "op_submit oid " << op->oid
- << " " << op->oloc
+ ldout(cct, 10) << "op_submit oid " << op->base_oid
+ << " " << op->base_oloc << " " << op->target_oloc
<< " " << op->ops << " tid " << op->tid
<< " osd." << (op->session ? op->session->osd : -1)
<< dendl;
@@ -1280,6 +1281,32 @@ tid_t Objecter::_op_submit(Op *op)
return op->tid;
}
+/**
+ * Cancel an in-flight op.
+ *
+ * Any ack/commit callback still pending on the op is completed with
+ * -ECANCELED, after which the op is dropped from map checking and finished.
+ * The caller must hold client_lock and the Objecter must be initialized
+ * (both are asserted).
+ *
+ * @param tid id of the op to cancel
+ * @return 0 on success, -ENOENT if no op with that tid is registered
+ */
+int Objecter::op_cancel(tid_t tid)
+{
+  assert(client_lock.is_locked());
+  assert(initialized);
+
+  map<tid_t, Op*>::iterator p = ops.find(tid);
+  if (p == ops.end()) {
+    ldout(cct, 10) << __func__ << " tid " << tid << " dne" << dendl;
+    return -ENOENT;
+  }
+
+  ldout(cct, 10) << __func__ << " tid " << tid << dendl;
+  Op *op = p->second;
+  // Each callback cleared here must also be taken out of the matching
+  // outstanding counter, the same way unregister_op() and
+  // handle_osd_op_reply() do; otherwise the counters never drop back
+  // after a cancel.
+  if (op->onack) {
+    op->onack->complete(-ECANCELED);
+    op->onack = NULL;
+    num_unacked--;
+  }
+  if (op->oncommit) {
+    op->oncommit->complete(-ECANCELED);
+    op->oncommit = NULL;
+    num_uncommitted--;
+  }
+  op_cancel_map_check(op);
+  finish_op(op);
+  return 0;
+}
+
bool Objecter::is_pg_changed(vector<int>& o, vector<int>& n, bool any_change)
{
if (o.empty() && n.empty())
@@ -1297,12 +1324,37 @@ int Objecter::recalc_op_target(Op *op)
{
vector<int> acting;
pg_t pgid = op->pgid;
+
+ bool is_read = op->flags & CEPH_OSD_FLAG_READ;
+ bool is_write = op->flags & CEPH_OSD_FLAG_WRITE;
+
+ bool need_check_tiering = false;
+ if (op->target_oid.name.empty()) {
+ op->target_oid = op->base_oid;
+ need_check_tiering = true;
+ }
+ if (op->target_oloc.empty()) {
+ op->target_oloc = op->base_oloc;
+ need_check_tiering = true;
+ }
+
+ if (honor_cache_redirects && need_check_tiering) {
+ const pg_pool_t *pi = osdmap->get_pg_pool(op->base_oloc.pool);
+ if (pi) {
+ if (is_read && pi->has_read_tier())
+ op->target_oloc.pool = pi->read_tier;
+ if (is_write && pi->has_write_tier())
+ op->target_oloc.pool = pi->write_tier;
+ }
+ }
+
if (op->precalc_pgid) {
+ assert(op->base_oid.name.empty()); // make sure this is a listing op
ldout(cct, 10) << "recalc_op_target have " << pgid << " pool " << osdmap->have_pg_pool(pgid.pool()) << dendl;
if (!osdmap->have_pg_pool(pgid.pool()))
return RECALC_OP_TARGET_POOL_DNE;
} else {
- int ret = osdmap->object_locator_to_pg(op->oid, op->oloc, pgid);
+ int ret = osdmap->object_locator_to_pg(op->target_oid, op->target_oloc, pgid);
if (ret == -ENOENT)
return RECALC_OP_TARGET_POOL_DNE;
}
@@ -1318,7 +1370,7 @@ int Objecter::recalc_op_target(Op *op)
op->used_replica = false;
if (!acting.empty()) {
int osd;
- bool read = (op->flags & CEPH_OSD_FLAG_READ) && (op->flags & CEPH_OSD_FLAG_WRITE) == 0;
+ bool read = is_read && !is_write;
if (read && (op->flags & CEPH_OSD_FLAG_BALANCE_READS)) {
int p = rand() % acting.size();
if (p)
@@ -1388,12 +1440,10 @@ bool Objecter::recalc_linger_op_target(LingerOp *linger_op)
return RECALC_OP_TARGET_NO_ACTION;
}
-void Objecter::cancel_op(Op *op)
+void Objecter::cancel_linger_op(Op *op)
{
ldout(cct, 15) << "cancel_op " << op->tid << dendl;
- // currently this only works for linger registrations, since we just
- // throw out the callbacks.
assert(!op->should_resend);
delete op->onack;
delete op->oncommit;
@@ -1444,7 +1494,8 @@ void Objecter::send_op(Op *op)
op->stamp = ceph_clock_now(cct);
MOSDOp *m = new MOSDOp(client_inc, op->tid,
- op->oid, op->oloc, op->pgid, osdmap->get_epoch(),
+ op->target_oid, op->target_oloc, op->pgid,
+ osdmap->get_epoch(),
flags);
m->set_snapid(op->snapid);
@@ -1455,8 +1506,8 @@ void Objecter::send_op(Op *op)
m->set_mtime(op->mtime);
m->set_retry_attempt(op->attempts++);
- if (op->version != eversion_t())
- m->set_version(op->version); // we're replaying this op!
+ if (op->replay_version != eversion_t())
+ m->set_version(op->replay_version); // we're replaying this op!
if (op->priority)
m->set_priority(op->priority);
@@ -1505,6 +1556,15 @@ void Objecter::throttle_op(Op *op, int op_budget)
}
}
+// Remove op from the ops table and subtract it from num_unacked /
+// num_uncommitted for each callback it still carries.
+void Objecter::unregister_op(Op *op)
+{
+  const bool wants_ack = op->onack;
+  const bool wants_commit = op->oncommit;
+
+  ops.erase(op->tid);
+  if (wants_commit)
+    num_uncommitted--;
+  if (wants_ack)
+    num_unacked--;
+}
+
/* This function DOES put the passed message before returning */
void Objecter::handle_osd_op_reply(MOSDOpReply *m)
{
@@ -1525,7 +1585,8 @@ void Objecter::handle_osd_op_reply(MOSDOpReply *m)
ldout(cct, 7) << "handle_osd_op_reply " << tid
<< (m->is_ondisk() ? " ondisk":(m->is_onnvram() ? " onnvram":" ack"))
- << " v " << m->get_version() << " in " << m->get_pg()
+ << " v " << m->get_replay_version() << " uv " << m->get_user_version()
+ << " in " << m->get_pg()
<< " attempt " << m->get_retry_attempt()
<< dendl;
Op *op = ops[tid];
@@ -1550,19 +1611,25 @@ void Objecter::handle_osd_op_reply(MOSDOpReply *m)
int rc = m->get_result();
+ if (m->is_redirect_reply()) {
+ ldout(cct, 5) << " got redirect reply; redirecting" << dendl;
+ unregister_op(op);
+ m->get_redirect().combine_with_locator(op->target_oloc, op->target_oid.name);
+ op_submit(op);
+ m->put();
+ return;
+ }
+
if (rc == -EAGAIN) {
ldout(cct, 7) << " got -EAGAIN, resubmitting" << dendl;
- if (op->onack)
- num_unacked--;
- if (op->oncommit)
- num_uncommitted--;
+ unregister_op(op);
op_submit(op);
m->put();
return;
}
if (op->objver)
- *op->objver = m->get_version();
+ *op->objver = m->get_user_version();
if (op->reply_epoch)
*op->reply_epoch = m->get_map_epoch();
@@ -1602,7 +1669,7 @@ void Objecter::handle_osd_op_reply(MOSDOpReply *m)
// ack|commit -> ack
if (op->onack) {
ldout(cct, 15) << "handle_osd_op_reply ack" << dendl;
- op->version = m->get_version();
+ op->replay_version = m->get_replay_version();
onack = op->onack;
op->onack = 0; // only do callback once
num_unacked--;
@@ -2177,7 +2244,7 @@ void Objecter::dump_active()
for (map<tid_t,Op*>::iterator p = ops.begin(); p != ops.end(); ++p) {
Op *op = p->second;
ldout(cct, 20) << op->tid << "\t" << op->pgid << "\tosd." << (op->session ? op->session->osd : -1)
- << "\t" << op->oid << "\t" << op->ops << dendl;
+ << "\t" << op->base_oid << "\t" << op->ops << dendl;
}
}
@@ -2208,8 +2275,9 @@ void Objecter::dump_ops(Formatter *fmt) const
fmt->dump_int("osd", op->session ? op->session->osd : -1);
fmt->dump_stream("last_sent") << op->stamp;
fmt->dump_int("attempts", op->attempts);
- fmt->dump_stream("object_id") << op->oid;
- fmt->dump_stream("object_locator") << op->oloc;
+ fmt->dump_stream("object_id") << op->base_oid;
+ fmt->dump_stream("object_locator") << op->base_oloc;
+ fmt->dump_stream("target_object_locator") << op->target_oloc;
fmt->dump_stream("snapid") << op->snapid;
fmt->dump_stream("snap_context") << op->snapc;
fmt->dump_stream("mtime") << op->mtime;
diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h
index 090fb331611..938c97a4f31 100644
--- a/src/osdc/Objecter.h
+++ b/src/osdc/Objecter.h
@@ -386,7 +386,6 @@ struct ObjectOperation {
pwatchers->push_back(ow);
}
}
- *prval = 0;
}
catch (buffer::error& e) {
if (prval)
@@ -424,8 +423,6 @@ struct ObjectOperation {
}
psnaps->seq = resp.seq;
}
- if (prval)
- *prval = 0;
}
catch (buffer::error& e) {
if (prval)
@@ -567,6 +564,118 @@ struct ObjectOperation {
}
}
+ struct C_ObjectOperation_copyget : public Context { // completion handler that decodes a COPY_GET reply into caller-supplied outputs
+ bufferlist bl; // raw reply payload; copy_get() points out_bl[p] at this buffer
+ object_copy_cursor_t *cursor; // decoded last from the reply; dereferenced unconditionally in finish()
+ uint64_t *out_size; // optional (may be NULL)
+ utime_t *out_mtime; // optional (may be NULL)
+ std::map<std::string,bufferlist> *out_attrs; // optional (may be NULL)
+ bufferlist *out_data; // optional (may be NULL); decoded data is appended to it
+ std::map<std::string,bufferlist> *out_omap; // optional (may be NULL)
+ int *prval; // optional (may be NULL); set to -EIO if decoding fails
+ C_ObjectOperation_copyget(object_copy_cursor_t *c,
+ uint64_t *s,
+ utime_t *m,
+ std::map<std::string,bufferlist> *a,
+ bufferlist *d,
+ std::map<std::string,bufferlist> *o,
+ int *r)
+ : cursor(c),
+ out_size(s), out_mtime(m), out_attrs(a),
+ out_data(d), out_omap(o), prval(r) {}
+ void finish(int r) { // r < 0: nothing is decoded and no output is touched
+ if (r < 0)
+ return;
+ try {
+ bufferlist::iterator p = bl.begin(); // iterates over the member reply buffer
+ uint64_t size;
+ ::decode(size, p);
+ if (out_size)
+ *out_size = size;
+ utime_t mtime;
+ ::decode(mtime, p);
+ if (out_mtime)
+ *out_mtime = mtime;
+ if (out_attrs) {
+ ::decode_noclear(*out_attrs, p); // presumably merges into the caller's map without clearing it -- confirm
+ } else {
+ std::map<std::string,bufferlist> t; // field must still be consumed to advance p
+ ::decode(t, p);
+ }
+ bufferlist data_bl; // named distinctly so it does not shadow the member bl that p is reading
+ ::decode(data_bl, p);
+ if (out_data)
+ out_data->claim_append(data_bl);
+ if (out_omap) {
+ ::decode_noclear(*out_omap, p);
+ } else {
+ std::map<std::string,bufferlist> t; // field must still be consumed to advance p
+ ::decode(t, p);
+ }
+ ::decode(*cursor, p); // cursor is not NULL-checked, unlike the out_* pointers
+ } catch (buffer::error& e) { // outputs already written above are not rolled back
+ if (prval)
+ *prval = -EIO;
+ }
+ }
+ };
+
+ void copy_get(object_copy_cursor_t *cursor, // encoded into the request here; also handed to the reply handler
+ uint64_t max,
+ uint64_t *out_size,
+ utime_t *out_mtime,
+ std::map<std::string,bufferlist> *out_attrs,
+ bufferlist *out_data,
+ std::map<std::string,bufferlist> *out_omap,
+ int *prval) {
+ OSDOp& osd_op = add_op(CEPH_OSD_OP_COPY_GET);
+ osd_op.op.copy_get.max = max;
+ ::encode(*cursor, osd_op.indata); // cursor is dereferenced unconditionally, so it must be non-NULL
+ ::encode(max, osd_op.indata); // max is sent both in the op struct and in indata
+ unsigned p = ops.size() - 1; // presumably the index of the op just added by add_op()
+ out_rval[p] = prval;
+ C_ObjectOperation_copyget *h =
+ new C_ObjectOperation_copyget(cursor, out_size, out_mtime, out_attrs, out_data, out_omap, prval);
+ out_bl[p] = &h->bl; // reply payload lands in the handler's own buffer
+ out_handler[p] = h; // handler decodes that buffer in its finish()
+ }
+
+ void undirty() { // queue a CEPH_OSD_OP_UNDIRTY op; no payload, output buffer or handler is attached
+ add_op(CEPH_OSD_OP_UNDIRTY);
+ }
+
+ struct C_ObjectOperation_isdirty : public Context { // completion handler that decodes an ISDIRTY reply
+ bufferlist bl; // raw reply payload; is_dirty() points out_bl[p] at this buffer
+ bool *pisdirty; // optional (may be NULL); receives the decoded flag
+ int *prval; // optional (may be NULL); set to -EIO if decoding fails
+ C_ObjectOperation_isdirty(bool *p, int *r)
+ : pisdirty(p), prval(r) {}
+ void finish(int r) { // r < 0: nothing is decoded and *pisdirty is left untouched
+ if (r < 0)
+ return;
+ try {
+ bufferlist::iterator p = bl.begin();
+ bool isdirty;
+ ::decode(isdirty, p);
+ if (pisdirty)
+ *pisdirty = isdirty;
+ } catch (buffer::error& e) {
+ if (prval)
+ *prval = -EIO;
+ }
+ }
+ };
+
+ void is_dirty(bool *pisdirty, int *prval) { // queue a CEPH_OSD_OP_ISDIRTY op and attach a handler that decodes the reply
+ add_op(CEPH_OSD_OP_ISDIRTY);
+ unsigned p = ops.size() - 1; // presumably the index of the op just added by add_op()
+ out_rval[p] = prval;
+ C_ObjectOperation_isdirty *h =
+ new C_ObjectOperation_isdirty(pisdirty, prval);
+ out_bl[p] = &h->bl; // reply payload lands in the handler's own buffer
+ out_handler[p] = h; // handler decodes that buffer in its finish()
+ }
+
void omap_get_header(bufferlist *bl, int *prval) {
add_op(CEPH_OSD_OP_OMAPGETHEADER);
unsigned p = ops.size() - 1;
@@ -647,8 +756,8 @@ struct ObjectOperation {
}
void assert_version(uint64_t ver) {
- bufferlist bl;
- add_watch(CEPH_OSD_OP_ASSERT_VER, 0, ver, 0, bl);
+ OSDOp& osd_op = add_op(CEPH_OSD_OP_ASSERT_VER);
+ osd_op.op.assert_ver.ver = ver;
}
void assert_src_version(const object_t& srcoid, snapid_t srcsnapid, uint64_t ver) {
bufferlist bl;
@@ -677,6 +786,14 @@ struct ObjectOperation {
OSDOp& osd_op = add_op(CEPH_OSD_OP_ROLLBACK);
osd_op.op.snap.snapid = snapid;
}
+
+ void copy_from(object_t src, snapid_t snapid, object_locator_t src_oloc, version_t src_version) { // queue a CEPH_OSD_OP_COPY_FROM op
+ OSDOp& osd_op = add_op(CEPH_OSD_OP_COPY_FROM);
+ osd_op.op.copy_from.snapid = snapid; // snapid and src_version travel in the fixed op struct
+ osd_op.op.copy_from.src_version = src_version;
+ ::encode(src, osd_op.indata); // source object name, then its locator, travel in indata (in this order)
+ ::encode(src_oloc, osd_op.indata);
+ }
};
@@ -701,6 +818,7 @@ class Objecter {
int global_op_flags; // flags which are applied to each IO op
bool keep_balanced_budget;
bool honor_osdmap_full;
+ bool honor_cache_redirects;
void maybe_request_map();
@@ -744,8 +862,10 @@ public:
xlist<Op*>::item session_item;
int incarnation;
- object_t oid;
- object_locator_t oloc;
+ object_t base_oid;
+ object_locator_t base_oloc;
+ object_t target_oid;
+ object_locator_t target_oloc;
pg_t pgid;
vector<int> acting;
@@ -768,12 +888,12 @@ public:
Context *onack, *oncommit;
tid_t tid;
- eversion_t version; // for op replay
+ eversion_t replay_version; // for op replay
int attempts;
bool paused;
- eversion_t *objver;
+ version_t *objver;
epoch_t *reply_epoch;
utime_t stamp;
@@ -787,9 +907,9 @@ public:
bool should_resend;
Op(const object_t& o, const object_locator_t& ol, vector<OSDOp>& op,
- int f, Context *ac, Context *co, eversion_t *ov) :
+ int f, Context *ac, Context *co, version_t *ov) :
session(NULL), session_item(this), incarnation(0),
- oid(o), oloc(ol),
+ base_oid(o), base_oloc(ol),
used_replica(false), con(NULL),
snapid(CEPH_NOSNAP),
outbl(NULL),
@@ -811,8 +931,8 @@ public:
out_rval[i] = NULL;
}
- if (oloc.key == o)
- oloc.key.clear();
+ if (base_oloc.key == o)
+ base_oloc.key.clear();
}
~Op() {
while (!out_handler.empty()) {
@@ -1005,7 +1125,7 @@ public:
vector<OSDOp> ops;
bufferlist inbl;
bufferlist *poutbl;
- eversion_t *pobjver;
+ version_t *pobjver;
bool registered;
Context *on_reg_ack, *on_reg_commit;
@@ -1101,7 +1221,7 @@ public:
map<epoch_t,list< pair<Context*, int> > > waiting_for_map;
void send_op(Op *op);
- void cancel_op(Op *op);
+ void cancel_linger_op(Op *op);
void finish_op(Op *op);
bool is_pg_changed(vector<int>& a, vector<int>& b, bool any_change=false);
enum recalc_op_target_result {
@@ -1174,6 +1294,7 @@ public:
num_unacked(0), num_uncommitted(0),
global_op_flags(0),
keep_balanced_budget(false), honor_osdmap_full(true),
+ honor_cache_redirects(true),
last_seen_osdmap_version(0),
last_seen_pgmap_version(0),
client_lock(l), timer(t),
@@ -1207,6 +1328,9 @@ public:
void set_honor_osdmap_full() { honor_osdmap_full = true; }
void unset_honor_osdmap_full() { honor_osdmap_full = false; }
+ void set_honor_cache_redirects() { honor_cache_redirects = true; }
+ void unset_honor_cache_redirects() { honor_cache_redirects = false; }
+
void scan_requests(bool skipped_map,
map<tid_t, Op*>& need_resend,
list<LingerOp*>& need_resend_linger,
@@ -1223,6 +1347,7 @@ private:
// low-level
tid_t op_submit(Op *op);
tid_t _op_submit(Op *op);
+ inline void unregister_op(Op *op);
// public interface
public:
@@ -1254,6 +1379,9 @@ private:
/** Clear the passed flags from the global op flag set */
void clear_global_op_flag(int flags) { global_op_flags &= ~flags; }
+ /// cancel an in-progress request
+ int op_cancel(tid_t tid);
+
// commands
int osd_command(int osd, vector<string>& cmd, bufferlist& inbl, tid_t *ptid,
bufferlist *poutbl, string *prs, Context *onfinish) {
@@ -1283,7 +1411,7 @@ private:
tid_t mutate(const object_t& oid, const object_locator_t& oloc,
ObjectOperation& op,
const SnapContext& snapc, utime_t mtime, int flags,
- Context *onack, Context *oncommit, eversion_t *objver = NULL) {
+ Context *onack, Context *oncommit, version_t *objver = NULL) {
Op *o = new Op(oid, oloc, op.ops, flags | global_op_flags | CEPH_OSD_FLAG_WRITE, onack, oncommit, objver);
o->priority = op.priority;
o->mtime = mtime;
@@ -1293,7 +1421,7 @@ private:
tid_t read(const object_t& oid, const object_locator_t& oloc,
ObjectOperation& op,
snapid_t snapid, bufferlist *pbl, int flags,
- Context *onack, eversion_t *objver = NULL) {
+ Context *onack, version_t *objver = NULL) {
Op *o = new Op(oid, oloc, op.ops, flags | global_op_flags | CEPH_OSD_FLAG_READ, onack, NULL, objver);
o->priority = op.priority;
o->snapid = snapid;
@@ -1308,12 +1436,12 @@ private:
const SnapContext& snapc, utime_t mtime,
bufferlist& inbl, int flags,
Context *onack, Context *onfinish,
- eversion_t *objver);
+ version_t *objver);
tid_t linger_read(const object_t& oid, const object_locator_t& oloc,
ObjectOperation& op,
snapid_t snap, bufferlist& inbl, bufferlist *poutbl, int flags,
Context *onack,
- eversion_t *objver);
+ version_t *objver);
void unregister_linger(uint64_t linger_id);
/**
@@ -1347,7 +1475,7 @@ private:
tid_t stat(const object_t& oid, const object_locator_t& oloc, snapid_t snap,
uint64_t *psize, utime_t *pmtime, int flags,
Context *onfinish,
- eversion_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
vector<OSDOp> ops;
int i = init_ops(ops, 1, extra_ops);
ops[i].op.op = CEPH_OSD_OP_STAT;
@@ -1361,7 +1489,7 @@ private:
tid_t read(const object_t& oid, const object_locator_t& oloc,
uint64_t off, uint64_t len, snapid_t snap, bufferlist *pbl, int flags,
Context *onfinish,
- eversion_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
vector<OSDOp> ops;
int i = init_ops(ops, 1, extra_ops);
ops[i].op.op = CEPH_OSD_OP_READ;
@@ -1379,7 +1507,7 @@ private:
uint64_t off, uint64_t len, snapid_t snap, bufferlist *pbl, int flags,
uint64_t trunc_size, __u32 trunc_seq,
Context *onfinish,
- eversion_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
vector<OSDOp> ops;
int i = init_ops(ops, 1, extra_ops);
ops[i].op.op = CEPH_OSD_OP_READ;
@@ -1395,7 +1523,7 @@ private:
tid_t mapext(const object_t& oid, const object_locator_t& oloc,
uint64_t off, uint64_t len, snapid_t snap, bufferlist *pbl, int flags,
Context *onfinish,
- eversion_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
vector<OSDOp> ops;
int i = init_ops(ops, 1, extra_ops);
ops[i].op.op = CEPH_OSD_OP_MAPEXT;
@@ -1411,7 +1539,7 @@ private:
tid_t getxattr(const object_t& oid, const object_locator_t& oloc,
const char *name, snapid_t snap, bufferlist *pbl, int flags,
Context *onfinish,
- eversion_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
vector<OSDOp> ops;
int i = init_ops(ops, 1, extra_ops);
ops[i].op.op = CEPH_OSD_OP_GETXATTR;
@@ -1428,7 +1556,7 @@ private:
tid_t getxattrs(const object_t& oid, const object_locator_t& oloc, snapid_t snap,
map<string,bufferlist>& attrset,
int flags, Context *onfinish,
- eversion_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
vector<OSDOp> ops;
int i = init_ops(ops, 1, extra_ops);
ops[i].op.op = CEPH_OSD_OP_GETXATTRS;
@@ -1442,7 +1570,7 @@ private:
tid_t read_full(const object_t& oid, const object_locator_t& oloc,
snapid_t snap, bufferlist *pbl, int flags,
Context *onfinish,
- eversion_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
return read(oid, oloc, 0, 0, snap, pbl, flags | global_op_flags | CEPH_OSD_FLAG_READ, onfinish, objver);
}
@@ -1451,7 +1579,7 @@ private:
vector<OSDOp>& ops, utime_t mtime,
const SnapContext& snapc, int flags,
Context *onack, Context *oncommit,
- eversion_t *objver = NULL) {
+ version_t *objver = NULL) {
Op *o = new Op(oid, oloc, ops, flags | global_op_flags | CEPH_OSD_FLAG_WRITE, onack, oncommit, objver);
o->mtime = mtime;
o->snapc = snapc;
@@ -1461,7 +1589,7 @@ private:
uint64_t off, uint64_t len, const SnapContext& snapc, const bufferlist &bl,
utime_t mtime, int flags,
Context *onack, Context *oncommit,
- eversion_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
vector<OSDOp> ops;
int i = init_ops(ops, 1, extra_ops);
ops[i].op.op = CEPH_OSD_OP_WRITE;
@@ -1479,7 +1607,7 @@ private:
uint64_t len, const SnapContext& snapc, const bufferlist &bl,
utime_t mtime, int flags,
Context *onack, Context *oncommit,
- eversion_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
vector<OSDOp> ops;
int i = init_ops(ops, 1, extra_ops);
ops[i].op.op = CEPH_OSD_OP_APPEND;
@@ -1498,7 +1626,7 @@ private:
utime_t mtime, int flags,
uint64_t trunc_size, __u32 trunc_seq,
Context *onack, Context *oncommit,
- eversion_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
vector<OSDOp> ops;
int i = init_ops(ops, 1, extra_ops);
ops[i].op.op = CEPH_OSD_OP_WRITE;
@@ -1515,7 +1643,7 @@ private:
tid_t write_full(const object_t& oid, const object_locator_t& oloc,
const SnapContext& snapc, const bufferlist &bl, utime_t mtime, int flags,
Context *onack, Context *oncommit,
- eversion_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
vector<OSDOp> ops;
int i = init_ops(ops, 1, extra_ops);
ops[i].op.op = CEPH_OSD_OP_WRITEFULL;
@@ -1532,7 +1660,7 @@ private:
utime_t mtime, int flags,
uint64_t trunc_size, __u32 trunc_seq,
Context *onack, Context *oncommit,
- eversion_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
vector<OSDOp> ops;
int i = init_ops(ops, 1, extra_ops);
ops[i].op.op = CEPH_OSD_OP_TRUNCATE;
@@ -1547,7 +1675,7 @@ private:
tid_t zero(const object_t& oid, const object_locator_t& oloc,
uint64_t off, uint64_t len, const SnapContext& snapc, utime_t mtime, int flags,
Context *onack, Context *oncommit,
- eversion_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
vector<OSDOp> ops;
int i = init_ops(ops, 1, extra_ops);
ops[i].op.op = CEPH_OSD_OP_ZERO;
@@ -1561,7 +1689,7 @@ private:
tid_t rollback_object(const object_t& oid, const object_locator_t& oloc,
const SnapContext& snapc, snapid_t snapid,
utime_t mtime, Context *onack, Context *oncommit,
- eversion_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
vector<OSDOp> ops;
int i = init_ops(ops, 1, extra_ops);
ops[i].op.op = CEPH_OSD_OP_ROLLBACK;
@@ -1575,7 +1703,7 @@ private:
const SnapContext& snapc, utime_t mtime,
int global_flags, int create_flags,
Context *onack, Context *oncommit,
- eversion_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
vector<OSDOp> ops;
int i = init_ops(ops, 1, extra_ops);
ops[i].op.op = CEPH_OSD_OP_CREATE;
@@ -1588,7 +1716,7 @@ private:
tid_t remove(const object_t& oid, const object_locator_t& oloc,
const SnapContext& snapc, utime_t mtime, int flags,
Context *onack, Context *oncommit,
- eversion_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
vector<OSDOp> ops;
int i = init_ops(ops, 1, extra_ops);
ops[i].op.op = CEPH_OSD_OP_DELETE;
@@ -1599,7 +1727,7 @@ private:
}
tid_t lock(const object_t& oid, const object_locator_t& oloc, int op, int flags,
- Context *onack, Context *oncommit, eversion_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ Context *onack, Context *oncommit, version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
SnapContext snapc; // no snapc for lock ops
vector<OSDOp> ops;
int i = init_ops(ops, 1, extra_ops);
@@ -1612,7 +1740,7 @@ private:
const char *name, const SnapContext& snapc, const bufferlist &bl,
utime_t mtime, int flags,
Context *onack, Context *oncommit,
- eversion_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
vector<OSDOp> ops;
int i = init_ops(ops, 1, extra_ops);
ops[i].op.op = CEPH_OSD_OP_SETXATTR;
@@ -1630,7 +1758,7 @@ private:
const char *name, const SnapContext& snapc,
utime_t mtime, int flags,
Context *onack, Context *oncommit,
- eversion_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
vector<OSDOp> ops;
int i = init_ops(ops, 1, extra_ops);
ops[i].op.op = CEPH_OSD_OP_RMXATTR;
diff --git a/src/perfglue/Makefile.am b/src/perfglue/Makefile.am
new file mode 100644
index 00000000000..f2b8d5030e6
--- /dev/null
+++ b/src/perfglue/Makefile.am
@@ -0,0 +1,23 @@
+libperfglue_la_SOURCES =
+
+if WITH_TCMALLOC
+libperfglue_la_SOURCES += perfglue/heap_profiler.cc
+libperfglue_la_LIBADD = -ltcmalloc
+AM_CFLAGS += -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
+AM_CXXFLAGS += -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
+else
+libperfglue_la_SOURCES += perfglue/disabled_heap_profiler.cc
+endif # WITH_TCMALLOC
+
+if WITH_PROFILER
+libperfglue_la_SOURCES += perfglue/cpu_profiler.cc
+else
+libperfglue_la_SOURCES += perfglue/disabled_stubs.cc
+endif # WITH_PROFILER
+
+noinst_LTLIBRARIES += libperfglue.la
+
+noinst_HEADERS += \
+ perfglue/cpu_profiler.h \
+ perfglue/heap_profiler.h
+
diff --git a/src/perfglue/heap_profiler.cc b/src/perfglue/heap_profiler.cc
index 550f7f924c6..6b079b865fa 100644
--- a/src/perfglue/heap_profiler.cc
+++ b/src/perfglue/heap_profiler.cc
@@ -88,7 +88,7 @@ void ceph_heap_profiler_dump(const char *reason)
void ceph_heap_profiler_handle_command(const std::vector<std::string>& cmd,
ostream& out)
{
- if (cmd.size() == 2 && cmd[1] == "dump") {
+ if (cmd.size() == 1 && cmd[0] == "dump") {
if (!ceph_heap_profiler_running()) {
out << "heap profiler not running; can't dump";
return;
@@ -98,16 +98,16 @@ void ceph_heap_profiler_handle_command(const std::vector<std::string>& cmd,
out << g_conf->name << "dumping heap profile now.\n"
<< heap_stats;
ceph_heap_profiler_dump("admin request");
- } else if (cmd.size() == 2 && cmd[1] == "start_profiler") {
+ } else if (cmd.size() == 1 && cmd[0] == "start_profiler") {
ceph_heap_profiler_start();
out << g_conf->name << " started profiler";
- } else if (cmd.size() == 2 && cmd[1] == "stop_profiler") {
+ } else if (cmd.size() == 1 && cmd[0] == "stop_profiler") {
ceph_heap_profiler_stop();
out << g_conf->name << " stopped profiler";
- } else if (cmd.size() == 2 && cmd[1] == "release") {
+ } else if (cmd.size() == 1 && cmd[0] == "release") {
ceph_heap_release_free_memory();
out << g_conf->name << " releasing free RAM back to system.";
- } else if (cmd.size() == 2 && cmd[1] == "stats") {
+ } else if (cmd.size() == 1 && cmd[0] == "stats") {
char *heap_stats = new char[1024];
ceph_heap_profiler_stats(heap_stats, 1024);
out << g_conf->name << "tcmalloc heap stats:"
diff --git a/src/pybind/ceph_argparse.py b/src/pybind/ceph_argparse.py
index 427a4621216..1f6e90b6c1d 100644
--- a/src/pybind/ceph_argparse.py
+++ b/src/pybind/ceph_argparse.py
@@ -275,12 +275,26 @@ class CephIPAddr(CephArgtype):
class CephEntityAddr(CephIPAddr):
"""
- EntityAddress, that is, IP address/nonce
+ EntityAddress, that is, IP address[/nonce]
"""
def valid(self, s, partial=False):
- ip, nonce = s.split('/')
+ nonce = None  # stays None when s carries no '/nonce' suffix
+ if '/' in s:
+ ip, nonce = s.split('/', 1)  # split once so extra '/' fails nonce validation instead of a bare unpack ValueError
+ else:
+ ip = s
super(self.__class__, self).valid(ip)
- self.nonce = nonce
+ if nonce:  # an empty nonce (trailing '/') is not validated
+ nonce_long = None
+ try:
+ nonce_long = long(nonce)
+ except ValueError:
+ pass  # leave nonce_long as None so the check below reports it
+ if nonce_long is None or nonce_long < 0:  # zero is accepted
+ raise ArgumentValid(
+ '{0}: invalid entity, nonce {1} not integer >= 0'.\
+ format(s, nonce)
+ )
self.val = s
def __str__(self):
@@ -829,6 +843,11 @@ def validate(args, signature, partial=False):
# wanted n, got too few
if partial:
return d
+ # special-case the "0 expected 1" case
+ if desc.numseen == 0 and desc.n == 1:
+ raise ArgumentNumber(
+ 'missing required parameter {0}'.format(desc)
+ )
raise ArgumentNumber(
'saw {0} of {1}, expected {2}'.\
format(desc.numseen, desc, desc.n)
@@ -937,6 +956,7 @@ def validate_command(sigdict, args, verbose=False):
# Stop now, because we have the right command but
# some other input is invalid
print >> sys.stderr, "Invalid command: ", str(e)
+ print >> sys.stderr, concise_sig(sig), ': ', cmd['help']
return {}
if found:
break
diff --git a/src/pybind/ceph_rest_api.py b/src/pybind/ceph_rest_api.py
index 421cc59edcc..75e61060544 100755
--- a/src/pybind/ceph_rest_api.py
+++ b/src/pybind/ceph_rest_api.py
@@ -1,10 +1,10 @@
-#!/usr/bin/python
# vim: ts=4 sw=4 smarttab expandtab
import errno
import json
import logging
import logging.handlers
+import os
import rados
import textwrap
import xml.etree.ElementTree
@@ -26,6 +26,7 @@ DEFAULT_ID = 'restapi'
DEFAULT_BASEURL = '/api/v0.1'
DEFAULT_LOG_LEVEL = 'warning'
+DEFAULT_LOGDIR = '/var/log/ceph'
# default client name will be 'client.<DEFAULT_ID>'
# 'app' must be global for decorators, etc.
@@ -117,7 +118,18 @@ def api_setup(app, conf, cluster, clientname, clientid, args):
loglevel = app.ceph_cluster.conf_get('restapi_log_level') \
or DEFAULT_LOG_LEVEL
+ # ceph has a default log file for daemons only; clients (like this)
+ # default to "". Override that for this particular client.
logfile = app.ceph_cluster.conf_get('log_file')
+ if not logfile:
+ logfile = os.path.join(
+ DEFAULT_LOGDIR,
+ '{cluster}-{clientname}.{pid}.log'.format(
+ cluster=cluster,
+ clientname=clientname,
+ pid=os.getpid()
+ )
+ )
app.logger.addHandler(logging.handlers.WatchedFileHandler(logfile))
app.logger.setLevel(LOGLEVELS[loglevel.lower()])
for h in app.logger.handlers:
diff --git a/src/rbd.cc b/src/rbd.cc
index 7d5d46611ec..147eb2c5138 100644
--- a/src/rbd.cc
+++ b/src/rbd.cc
@@ -8,6 +8,7 @@
* LGPL2. See file COPYING.
*
*/
+#include "include/int_types.h"
#include "mon/MonClient.h"
#include "mon/MonMap.h"
@@ -33,7 +34,6 @@
#include <boost/scoped_ptr.hpp>
#include <dirent.h>
#include <errno.h>
-#include <inttypes.h>
#include <iostream>
#include <memory>
#include <sstream>
@@ -68,6 +68,7 @@ static string dir_info_oid = RBD_INFO;
bool udevadm_settle = true;
bool progress = true;
bool resize_allow_shrink = false;
+bool read_only = false;
#define dout_subsys ceph_subsys_rbd
@@ -151,6 +152,7 @@ void usage()
" --pretty-format make json or xml output more readable\n"
" --no-settle do not wait for udevadm to settle on map/unmap\n"
" --no-progress do not show progress for long-running commands\n"
+" --read-only set device readonly when mapping image\n"
" --allow-shrink allow shrinking of an image when resizing\n";
}
@@ -1640,8 +1642,13 @@ static int do_kernel_add(const char *poolname, const char *imgname,
oss << ",";
}
+ if (read_only)
+ oss << " ro";
+ else
+ oss << " rw";
+
const char *user = g_conf->name.get_id().c_str();
- oss << " name=" << user;
+ oss << ",name=" << user;
char key_name[strlen(user) + strlen("client.") + 1];
snprintf(key_name, sizeof(key_name), "client.%s", user);
@@ -2200,6 +2207,8 @@ int main(int argc, const char **argv)
lock_tag = strdup(val.c_str());
} else if (ceph_argparse_flag(args, i, "--no-settle", (char *)NULL)) {
udevadm_settle = false;
+ } else if (ceph_argparse_flag(args, i, "--read-only", (char *)NULL)) {
+ read_only = true;
} else if (ceph_argparse_flag(args, i, "--no-progress", (char *)NULL)) {
progress = false;
} else if (ceph_argparse_flag(args, i , "--allow-shrink", (char *)NULL)) {
@@ -2247,7 +2256,7 @@ int main(int argc, const char **argv)
opt_cmd = get_cmd(*i, false, false);
}
if (opt_cmd == OPT_NO_CMD) {
- cerr << "rbd: error parsing command '" << *i << "'" << std::endl;
+ cerr << "rbd: error parsing command '" << *i << "'; -h or --help for usage" << std::endl;
return EXIT_FAILURE;
}
@@ -2484,7 +2493,7 @@ if (!set_conf_param(v, p1, p2, p3)) { \
if (r < 0) {
cerr << "rbd: error opening pool " << poolname << ": "
<< cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
}
@@ -2511,7 +2520,7 @@ if (!set_conf_param(v, p1, p2, p3)) { \
if (r < 0) {
cerr << "rbd: error opening image " << imgname << ": "
<< cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
}
@@ -2526,7 +2535,7 @@ if (!set_conf_param(v, p1, p2, p3)) { \
if (r < 0) {
cerr << "rbd: error setting snapshot context: " << cpp_strerror(-r)
<< std::endl;
- return EXIT_FAILURE;
+ return -r;
}
}
@@ -2535,14 +2544,14 @@ if (!set_conf_param(v, p1, p2, p3)) { \
if (r < 0) {
cerr << "rbd: error opening pool " << dest_poolname << ": "
<< cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
}
if (opt_cmd == OPT_CREATE || opt_cmd == OPT_RESIZE) {
if (!size_set) {
cerr << "rbd: must specify --size <MB>" << std::endl;
- return EXIT_FAILURE;
+ return EINVAL;
}
}
@@ -2558,7 +2567,7 @@ if (!set_conf_param(v, p1, p2, p3)) { \
default:
cerr << "rbd: list: " << cpp_strerror(-r) << std::endl;
}
- return EXIT_FAILURE;
+ return -r;
}
break;
@@ -2566,19 +2575,19 @@ if (!set_conf_param(v, p1, p2, p3)) { \
if (order && (order < 12 || order > 25)) {
cerr << "rbd: order must be between 12 (4 KB) and 25 (32 MB)"
<< std::endl;
- return EXIT_FAILURE;
+ return EINVAL;
}
if ((stripe_unit && !stripe_count) || (!stripe_unit && stripe_count)) {
cerr << "must specify both (or neither) of stripe-unit and stripe-count"
<< std::endl;
usage();
- return EXIT_FAILURE;
+ return EINVAL;
}
r = do_create(rbd, io_ctx, imgname, size, &order, format, features,
stripe_unit, stripe_count);
if (r < 0) {
cerr << "rbd: create error: " << cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
@@ -2586,14 +2595,14 @@ if (!set_conf_param(v, p1, p2, p3)) { \
if (order && (order < 12 || order > 25)) {
cerr << "rbd: order must be between 12 (4 KB) and 25 (32 MB)"
<< std::endl;
- return EXIT_FAILURE;
+ return EINVAL;
}
r = do_clone(rbd, io_ctx, imgname, snapname, dest_io_ctx, destname,
features, &order);
if (r < 0) {
cerr << "rbd: clone error: " << cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
@@ -2601,7 +2610,7 @@ if (!set_conf_param(v, p1, p2, p3)) { \
r = do_flatten(image);
if (r < 0) {
cerr << "rbd: flatten error: " << cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
@@ -2609,7 +2618,7 @@ if (!set_conf_param(v, p1, p2, p3)) { \
r = do_rename(rbd, io_ctx, imgname, destname);
if (r < 0) {
cerr << "rbd: rename error: " << cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
@@ -2617,7 +2626,7 @@ if (!set_conf_param(v, p1, p2, p3)) { \
r = do_show_info(imgname, image, snapname, formatter.get());
if (r < 0) {
cerr << "rbd: info: " << cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
@@ -2647,18 +2656,18 @@ if (!set_conf_param(v, p1, p2, p3)) { \
r = image.stat(info, sizeof(info));
if (r < 0) {
cerr << "rbd: resize error: " << cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
if (info.size > size && !resize_allow_shrink) {
cerr << "rbd: shrinking an image is only allowed with the --allow-shrink flag" << std::endl;
- return EXIT_FAILURE;
+ return EINVAL;
}
r = do_resize(image, size);
if (r < 0) {
cerr << "rbd: resize error: " << cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
@@ -2671,87 +2680,87 @@ if (!set_conf_param(v, p1, p2, p3)) { \
if (r < 0) {
cerr << "rbd: failed to list snapshots: " << cpp_strerror(-r)
<< std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
case OPT_SNAP_CREATE:
if (!imgname || !snapname) {
cerr << "rbd: snap create requires image and snapname" << std::endl;
- return EXIT_FAILURE;
+ return EINVAL;
}
r = do_add_snap(image, snapname);
if (r < 0) {
cerr << "rbd: failed to create snapshot: " << cpp_strerror(-r)
<< std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
case OPT_SNAP_ROLLBACK:
if (!imgname) {
cerr << "rbd: snap rollback requires image name" << std::endl;
- return EXIT_FAILURE;
+ return EINVAL;
}
r = do_rollback_snap(image, snapname);
if (r < 0) {
cerr << "rbd: rollback failed: " << cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
case OPT_SNAP_REMOVE:
if (!imgname) {
cerr << "rbd: snap remove requires image name" << std::endl;
- return EXIT_FAILURE;
+ return EINVAL;
}
r = do_remove_snap(image, snapname);
- if (r == -EBUSY) {
- cerr << "rbd: snapshot '" << snapname << "' is protected from removal."
- << std::endl;
- return EXIT_FAILURE;
- }
if (r < 0) {
- cerr << "rbd: failed to remove snapshot: " << cpp_strerror(-r)
- << std::endl;
- return EXIT_FAILURE;
+ if (r == -EBUSY) {
+ cerr << "rbd: snapshot '" << snapname << "' is protected from removal."
+ << std::endl;
+ } else {
+ cerr << "rbd: failed to remove snapshot: " << cpp_strerror(-r)
+ << std::endl;
+ }
+ return -r;
}
break;
case OPT_SNAP_PURGE:
if (!imgname) {
cerr << "rbd: snap purge requires image name" << std::endl;
- return EXIT_FAILURE;
+ return EINVAL;
}
r = do_purge_snaps(image);
if (r < 0) {
cerr << "rbd: removing snaps failed: " << cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
case OPT_SNAP_PROTECT:
if (!imgname) {
cerr << "rbd: snap protect requires image name" << std::endl;
- return EXIT_FAILURE;
+ return EINVAL;
}
r = do_protect_snap(image, snapname);
if (r < 0) {
cerr << "rbd: protecting snap failed: " << cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
case OPT_SNAP_UNPROTECT:
if (!imgname) {
cerr << "rbd: snap unprotect requires image name" << std::endl;
- return EXIT_FAILURE;
+ return EINVAL;
}
r = do_unprotect_snap(image, snapname);
if (r < 0) {
cerr << "rbd: unprotecting snap failed: " << cpp_strerror(-r)
<< std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
@@ -2759,19 +2768,19 @@ if (!set_conf_param(v, p1, p2, p3)) { \
r = do_list_children(image, formatter.get());
if (r < 0) {
cerr << "rbd: listing children failed: " << cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
case OPT_EXPORT:
if (!path) {
cerr << "rbd: export requires pathname" << std::endl;
- return EXIT_FAILURE;
+ return EINVAL;
}
r = do_export(image, path);
if (r < 0) {
cerr << "rbd: export error: " << cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
@@ -2779,32 +2788,32 @@ if (!set_conf_param(v, p1, p2, p3)) { \
r = do_diff(image, fromsnapname, formatter.get());
if (r < 0) {
cerr << "rbd: diff error: " << cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
case OPT_EXPORT_DIFF:
if (!path) {
cerr << "rbd: export-diff requires pathname" << std::endl;
- return EXIT_FAILURE;
+ return EINVAL;
}
r = do_export_diff(image, fromsnapname, snapname, path);
if (r < 0) {
cerr << "rbd: export-diff error: " << cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
case OPT_IMPORT:
if (!path) {
cerr << "rbd: import requires pathname" << std::endl;
- return EXIT_FAILURE;
+ return EINVAL;
}
r = do_import(rbd, dest_io_ctx, destname, &order, path,
format, features, size);
if (r < 0) {
cerr << "rbd: import failed: " << cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
@@ -2813,7 +2822,7 @@ if (!set_conf_param(v, p1, p2, p3)) { \
r = do_import_diff(image, path);
if (r < 0) {
cerr << "rbd: import-diff failed: " << cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
@@ -2821,7 +2830,7 @@ if (!set_conf_param(v, p1, p2, p3)) { \
r = do_copy(image, dest_io_ctx, destname);
if (r < 0) {
cerr << "rbd: copy failed: " << cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
@@ -2829,7 +2838,7 @@ if (!set_conf_param(v, p1, p2, p3)) { \
r = do_watch(io_ctx, imgname);
if (r < 0) {
cerr << "rbd: watch failed: " << cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
@@ -2837,7 +2846,7 @@ if (!set_conf_param(v, p1, p2, p3)) { \
r = do_kernel_add(poolname, imgname, snapname);
if (r < 0) {
cerr << "rbd: add failed: " << cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
@@ -2845,7 +2854,7 @@ if (!set_conf_param(v, p1, p2, p3)) { \
r = do_kernel_rm(devpath);
if (r < 0) {
cerr << "rbd: remove failed: " << cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
@@ -2853,7 +2862,7 @@ if (!set_conf_param(v, p1, p2, p3)) { \
r = do_kernel_showmapped(formatter.get());
if (r < 0) {
cerr << "rbd: showmapped failed: " << cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
@@ -2861,7 +2870,7 @@ if (!set_conf_param(v, p1, p2, p3)) { \
r = do_lock_list(image, formatter.get());
if (r < 0) {
cerr << "rbd: listing locks failed: " << cpp_strerror(r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
@@ -2878,7 +2887,7 @@ if (!set_conf_param(v, p1, p2, p3)) { \
} else {
cerr << "rbd: taking lock failed: " << cpp_strerror(r) << std::endl;
}
- return EXIT_FAILURE;
+ return -r;
}
break;
@@ -2886,7 +2895,7 @@ if (!set_conf_param(v, p1, p2, p3)) { \
r = do_lock_remove(image, lock_cookie, lock_client);
if (r < 0) {
cerr << "rbd: releasing lock failed: " << cpp_strerror(r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
@@ -2894,7 +2903,7 @@ if (!set_conf_param(v, p1, p2, p3)) { \
r = do_bench_write(image, bench_io_size, bench_io_threads, bench_bytes, bench_pattern);
if (r < 0) {
cerr << "bench-write failed: " << cpp_strerror(-r) << std::endl;
- return EXIT_FAILURE;
+ return -r;
}
break;
}
diff --git a/src/rbd_fuse/rbd-fuse.c b/src/rbd_fuse/rbd-fuse.c
index 5a4bfe2702c..2a6a8d22e81 100644
--- a/src/rbd_fuse/rbd-fuse.c
+++ b/src/rbd_fuse/rbd-fuse.c
@@ -1,7 +1,9 @@
/*
* rbd-fuse
*/
-#define FUSE_USE_VERSION 26
+#define FUSE_USE_VERSION 30
+
+#include "include/int_types.h"
#include <stdio.h>
#include <stdlib.h>
@@ -15,7 +17,6 @@
#include <sys/types.h>
#include <unistd.h>
#include <getopt.h>
-#include <inttypes.h>
#include "include/rbd/librbd.h"
diff --git a/src/rgw/Makefile.am b/src/rgw/Makefile.am
new file mode 100644
index 00000000000..b92c35e08d6
--- /dev/null
+++ b/src/rgw/Makefile.am
@@ -0,0 +1,152 @@
+if WITH_RADOSGW
+# librgw: the shared RGW code, built as a non-installed libtool library
+# (noinst_LTLIBRARIES). -Woverloaded-virtual is added on top of the
+# project-wide AM_CXXFLAGS for this library only.
+librgw_la_SOURCES = \
+ rgw/librgw.cc \
+ rgw/rgw_acl.cc \
+ rgw/rgw_acl_s3.cc \
+ rgw/rgw_acl_swift.cc \
+ rgw/rgw_client_io.cc \
+ rgw/rgw_fcgi.cc \
+ rgw/rgw_xml.cc \
+ rgw/rgw_usage.cc \
+ rgw/rgw_json_enc.cc \
+ rgw/rgw_user.cc \
+ rgw/rgw_bucket.cc\
+ rgw/rgw_tools.cc \
+ rgw/rgw_rados.cc \
+ rgw/rgw_http_client.cc \
+ rgw/rgw_rest_client.cc \
+ rgw/rgw_rest_conn.cc \
+ rgw/rgw_op.cc \
+ rgw/rgw_common.cc \
+ rgw/rgw_cache.cc \
+ rgw/rgw_formats.cc \
+ rgw/rgw_log.cc \
+ rgw/rgw_multi.cc \
+ rgw/rgw_policy_s3.cc \
+ rgw/rgw_gc.cc \
+ rgw/rgw_multi_del.cc \
+ rgw/rgw_env.cc \
+ rgw/rgw_cors.cc \
+ rgw/rgw_cors_s3.cc \
+ rgw/rgw_auth_s3.cc \
+ rgw/rgw_metadata.cc \
+ rgw/rgw_replica_log.cc \
+ rgw/rgw_keystone.cc \
+ rgw/rgw_quota.cc
+librgw_la_CXXFLAGS = -Woverloaded-virtual ${AM_CXXFLAGS}
+noinst_LTLIBRARIES += librgw.la
+
+# Libraries every librgw consumer links against: librados, the cls_* client
+# libraries, and the system libraries curl, expat, libm and fcgi.
+# NOTE(review): appended with += so the variable is presumably initialised in
+# an including Makefile -- confirm.
+LIBRGW_DEPS += \
+ $(LIBRADOS) \
+ libcls_rgw_client.la \
+ libcls_log_client.a \
+ libcls_statelog_client.a \
+ libcls_replica_log_client.a \
+ libcls_lock_client.la \
+ libcls_refcount_client.la \
+ libcls_version_client.a \
+ -lcurl \
+ -lexpat \
+ -lm \
+ -lfcgi
+
+# radosgw: the gateway daemon. REST front-end sources are compiled directly
+# into the program; everything else comes from $(LIBRGW). -lresolv is linked
+# in addition to the common LIBRGW_DEPS.
+# NOTE(review): rgw/rgw_replica_log.cc and rgw/rgw_http_client.cc are also
+# listed in librgw_la_SOURCES, so they are compiled twice -- confirm whether
+# the duplication is intentional.
+radosgw_SOURCES = \
+ rgw/rgw_resolve.cc \
+ rgw/rgw_rest.cc \
+ rgw/rgw_rest_swift.cc \
+ rgw/rgw_rest_s3.cc \
+ rgw/rgw_rest_usage.cc \
+ rgw/rgw_rest_user.cc \
+ rgw/rgw_rest_bucket.cc \
+ rgw/rgw_rest_metadata.cc \
+ rgw/rgw_replica_log.cc \
+ rgw/rgw_rest_log.cc \
+ rgw/rgw_rest_opstate.cc \
+ rgw/rgw_rest_replica_log.cc \
+ rgw/rgw_rest_config.cc \
+ rgw/rgw_http_client.cc \
+ rgw/rgw_swift.cc \
+ rgw/rgw_swift_auth.cc \
+ rgw/rgw_main.cc
+radosgw_LDADD = $(LIBRGW) $(LIBRGW_DEPS) -lresolv $(CEPH_GLOBAL)
+bin_PROGRAMS += radosgw
+
+# radosgw-admin: the administration CLI, a single source file linked against
+# librgw and its dependencies.
+radosgw_admin_SOURCES = rgw/rgw_admin.cc
+radosgw_admin_LDADD = $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL)
+bin_PROGRAMS += radosgw-admin
+
+# ceph_rgw_multiparser: registered under bin_DEBUGPROGRAMS rather than
+# bin_PROGRAMS.
+ceph_rgw_multiparser_SOURCES = rgw/rgw_multiparser.cc
+ceph_rgw_multiparser_LDADD = $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_rgw_multiparser
+
+# ceph_rgw_jsonparser: registered under bin_DEBUGPROGRAMS. It compiles
+# rgw_common.cc, rgw_env.cc and rgw_json_enc.cc itself in addition to linking
+# $(LIBRGW).
+ceph_rgw_jsonparser_SOURCES = \
+ rgw/rgw_jsonparser.cc \
+ rgw/rgw_common.cc \
+ rgw/rgw_env.cc \
+ rgw/rgw_json_enc.cc
+ceph_rgw_jsonparser_LDADD = $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_rgw_jsonparser
+
+# inject rgw stuff in the decoder testcase
+# (appended only when WITH_RADOSGW is set; DENCODER_SOURCES itself is defined
+# outside this file)
+DENCODER_SOURCES += \
+ rgw/rgw_dencoder.cc \
+ rgw/rgw_acl.cc \
+ rgw/rgw_common.cc \
+ rgw/rgw_env.cc \
+ rgw/rgw_json_enc.cc
+
+
+endif # WITH_RADOSGW
+
+
+# Headers that are tracked but not installed. This list sits after the
+# WITH_RADOSGW endif, so it applies whether or not the gateway is built.
+# NOTE(review): rgw/logrotate.conf is not a header; presumably it is listed
+# here so that it ends up in the distribution tarball -- confirm.
+noinst_HEADERS += \
+ rgw/logrotate.conf \
+ rgw/rgw_acl.h \
+ rgw/rgw_acl_s3.h \
+ rgw/rgw_acl_swift.h \
+ rgw/rgw_client_io.h \
+ rgw/rgw_fcgi.h \
+ rgw/rgw_xml.h \
+ rgw/rgw_cache.h \
+ rgw/rgw_common.h \
+ rgw/rgw_cors.h \
+ rgw/rgw_cors_s3.h \
+ rgw/rgw_cors_swift.h \
+ rgw/rgw_string.h \
+ rgw/rgw_formats.h \
+ rgw/rgw_http_errors.h \
+ rgw/rgw_log.h \
+ rgw/rgw_multi.h \
+ rgw/rgw_policy_s3.h \
+ rgw/rgw_gc.h \
+ rgw/rgw_metadata.h \
+ rgw/rgw_multi_del.h \
+ rgw/rgw_op.h \
+ rgw/rgw_http_client.h \
+ rgw/rgw_swift.h \
+ rgw/rgw_swift_auth.h \
+ rgw/rgw_quota.h \
+ rgw/rgw_rados.h \
+ rgw/rgw_replica_log.h \
+ rgw/rgw_resolve.h \
+ rgw/rgw_rest.h \
+ rgw/rgw_rest_swift.h \
+ rgw/rgw_rest_s3.h \
+ rgw/rgw_auth_s3.h \
+ rgw/rgw_rest_admin.h \
+ rgw/rgw_rest_usage.h \
+ rgw/rgw_rest_user.h \
+ rgw/rgw_rest_bucket.h \
+ rgw/rgw_rest_client.h \
+ rgw/rgw_rest_conn.h \
+ rgw/rgw_tools.h \
+ rgw/rgw_rest_metadata.h \
+ rgw/rgw_rest_log.h \
+ rgw/rgw_rest_opstate.h \
+ rgw/rgw_rest_replica_log.h \
+ rgw/rgw_rest_config.h \
+ rgw/rgw_usage.h \
+ rgw/rgw_user.h \
+ rgw/rgw_bucket.h \
+ rgw/rgw_keystone.h
+
diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc
index 644a1760aaf..b23bf3ba5d4 100644
--- a/src/rgw/rgw_admin.cc
+++ b/src/rgw/rgw_admin.cc
@@ -62,6 +62,9 @@ void _usage()
cerr << " bucket check check bucket index\n";
cerr << " object rm remove object\n";
cerr << " object unlink unlink object from bucket index\n";
+ cerr << " quota set set quota params\n";
+ cerr << " quota enable enable quota\n";
+ cerr << " quota disable disable quota\n";
cerr << " region get show region info\n";
cerr << " regions list list all regions set on this cluster\n";
cerr << " region set set region info (requires infile)\n";
@@ -127,6 +130,7 @@ void _usage()
cerr << " mdlog trim\n";
cerr << " replica mdlog get/delete\n";
cerr << " replica datalog get/delete\n";
+ cerr << " --metadata-key=<key> key to retrieve metadata from with metadata get\n";
cerr << " --rgw-region=<region> region in which radosgw is running\n";
cerr << " --rgw-zone=<zone> zone in which radosgw is running\n";
cerr << " --fix besides checking bucket index, will also fix it\n";
@@ -153,6 +157,11 @@ void _usage()
cerr << " --yes-i-really-mean-it required for certain operations\n";
cerr << "\n";
cerr << "<date> := \"YYYY-MM-DD[ hh:mm:ss]\"\n";
+ cerr << "\nQuota options:\n";
+ cerr << " --bucket specified bucket for quota command\n";
+ cerr << " --max-objects specify max objects\n";
+ cerr << " --max-size specify max size (in bytes)\n";
+ cerr << " --quota-scope scope of quota (bucket, user)\n";
cerr << "\n";
generic_client_usage();
}
@@ -202,6 +211,9 @@ enum {
OPT_OBJECT_RM,
OPT_OBJECT_UNLINK,
OPT_OBJECT_STAT,
+ OPT_QUOTA_SET,
+ OPT_QUOTA_ENABLE,
+ OPT_QUOTA_DISABLE,
OPT_GC_LIST,
OPT_GC_PROCESS,
OPT_REGION_GET,
@@ -252,6 +264,7 @@ static int get_cmd(const char *cmd, const char *prev_cmd, bool *need_more)
strcmp(cmd, "opstate") == 0 ||
strcmp(cmd, "pool") == 0 ||
strcmp(cmd, "pools") == 0 ||
+ strcmp(cmd, "quota") == 0 ||
strcmp(cmd, "region") == 0 ||
strcmp(cmd, "regions") == 0 ||
strcmp(cmd, "region-map") == 0 ||
@@ -361,6 +374,13 @@ static int get_cmd(const char *cmd, const char *prev_cmd, bool *need_more)
return OPT_REGION_SET;
if (strcmp(cmd, "default") == 0)
return OPT_REGION_DEFAULT;
+ } else if (strcmp(prev_cmd, "quota") == 0) {
+ if (strcmp(cmd, "set") == 0)
+ return OPT_QUOTA_SET;
+ if (strcmp(cmd, "enable") == 0)
+ return OPT_QUOTA_ENABLE;
+ if (strcmp(cmd, "disable") == 0)
+ return OPT_QUOTA_DISABLE;
} else if (strcmp(prev_cmd, "regions") == 0) {
if (strcmp(cmd, "list") == 0)
return OPT_REGION_LIST;
@@ -659,6 +679,64 @@ static bool dump_string(const char *field_name, bufferlist& bl, Formatter *f)
return true;
}
+/*
+ * Apply a "quota set" / "quota enable" / "quota disable" command to a quota
+ * record.
+ *
+ *   OPT_QUOTA_DISABLE  clears the enabled flag and leaves the limits alone
+ *   OPT_QUOTA_ENABLE   sets the enabled flag, then updates limits like "set"
+ *   OPT_QUOTA_SET      updates limits only
+ *
+ * A negative max_size / max_objects leaves the corresponding limit untouched.
+ * max_size is given in bytes and stored rounded up to whole kilobytes.
+ * Any other opt_cmd leaves the record unchanged.
+ */
+void set_quota_info(RGWQuotaInfo& quota, int opt_cmd, int64_t max_size, int64_t max_objects)
+{
+  if (opt_cmd == OPT_QUOTA_DISABLE) {
+    quota.enabled = false;
+    return;
+  }
+
+  const bool enabling = (opt_cmd == OPT_QUOTA_ENABLE);
+  if (!enabling && opt_cmd != OPT_QUOTA_SET) {
+    return;
+  }
+
+  if (enabling) {
+    quota.enabled = true;
+  }
+
+  // "enable" accepts new limits exactly like "set" does
+  if (max_objects >= 0) {
+    quota.max_objects = max_objects;
+  }
+  if (max_size >= 0) {
+    quota.max_size_kb = rgw_rounded_kb(max_size);
+  }
+}
+
+/*
+ * Load the bucket's info, apply the quota command to its quota record and
+ * write the bucket instance info back together with the attrs that were read.
+ *
+ * Returns 0 on success, or a positive errno value (suitable as a process
+ * exit status) after printing a message to cerr.
+ */
+int set_bucket_quota(RGWRados *store, int opt_cmd, string& bucket_name, int64_t max_size, int64_t max_objects)
+{
+  RGWBucketInfo info;
+  map<string, bufferlist> bucket_attrs;
+
+  int ret = store->get_bucket_info(NULL, bucket_name, info, NULL, &bucket_attrs);
+  if (ret < 0) {
+    cerr << "could not get bucket info for bucket=" << bucket_name << ": " << cpp_strerror(-ret) << std::endl;
+    return -ret;
+  }
+
+  set_quota_info(info.quota, opt_cmd, max_size, max_objects);
+
+  ret = store->put_bucket_instance_info(info, false, 0, &bucket_attrs);
+  if (ret >= 0) {
+    return 0;
+  }
+
+  cerr << "ERROR: failed writing bucket instance info: " << cpp_strerror(-ret) << std::endl;
+  return -ret;
+}
+
+/*
+ * Apply the quota command to the bucket quota stored in the user info held by
+ * op_state, hand the result back to op_state and persist it via user.modify().
+ *
+ * Returns 0 on success, or a positive errno value after printing a message
+ * to cerr.
+ */
+int set_user_bucket_quota(int opt_cmd, RGWUser& user, RGWUserAdminOpState& op_state, int64_t max_size, int64_t max_objects)
+{
+  RGWUserInfo& info = op_state.get_user_info();
+  set_quota_info(info.bucket_quota, opt_cmd, max_size, max_objects);
+  op_state.set_bucket_quota(info.bucket_quota);
+
+  string err_msg;
+  int ret = user.modify(op_state, &err_msg);
+  if (ret >= 0) {
+    return 0;
+  }
+
+  cerr << "ERROR: failed updating user info: " << cpp_strerror(-ret) << ": " << err_msg << std::endl;
+  return -ret;
+}
int main(int argc, char **argv)
{
@@ -720,6 +798,10 @@ int main(int argc, char **argv)
string replica_log_type_str;
ReplicaLogType replica_log_type = ReplicaLog_Invalid;
string op_mask_str;
+ string quota_scope;
+
+ int64_t max_objects = -1;
+ int64_t max_size = -1;
std::string val;
std::ostringstream errs;
@@ -787,6 +869,10 @@ int main(int argc, char **argv)
max_buckets = atoi(val.c_str());
} else if (ceph_argparse_witharg(args, i, &val, "--max-entries", (char*)NULL)) {
max_entries = atoi(val.c_str());
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-size", (char*)NULL)) {
+ max_size = (int64_t)atoll(val.c_str());
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-objects", (char*)NULL)) {
+ max_objects = (int64_t)atoll(val.c_str());
} else if (ceph_argparse_witharg(args, i, &val, "--date", "--time", (char*)NULL)) {
date = val;
if (end_date.empty())
@@ -847,6 +933,8 @@ int main(int argc, char **argv)
start_marker = val;
} else if (ceph_argparse_witharg(args, i, &val, "--end-marker", (char*)NULL)) {
end_marker = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--quota-scope", (char*)NULL)) {
+ quota_scope = val;
} else if (ceph_argparse_witharg(args, i, &val, "--replica-log-type", (char*)NULL)) {
replica_log_type_str = val;
replica_log_type = get_replicalog_type(replica_log_type_str);
@@ -2227,5 +2315,28 @@ next:
return -ret;
}
}
+
+  /*
+   * Quota commands. With --bucket the quota of that bucket is changed;
+   * otherwise, with --uid, the user's per-bucket quota is changed (this
+   * requires --quota-scope=bucket).
+   */
+  bool quota_op = (opt_cmd == OPT_QUOTA_SET || opt_cmd == OPT_QUOTA_ENABLE || opt_cmd == OPT_QUOTA_DISABLE);
+
+  if (quota_op) {
+    if (bucket_name.empty() && user_id.empty()) {
+      cerr << "ERROR: bucket name or uid is required for quota operation" << std::endl;
+      return EINVAL;
+    }
+
+    int quota_ret = 0;
+    if (!bucket_name.empty()) {
+      if (!quota_scope.empty() && quota_scope != "bucket") {
+        cerr << "ERROR: invalid quota scope specification." << std::endl;
+        return EINVAL;
+      }
+      quota_ret = set_bucket_quota(store, opt_cmd, bucket_name, max_size, max_objects);
+    } else if (!user_id.empty()) {
+      if (quota_scope != "bucket") {
+        cerr << "ERROR: only bucket-level user quota can be handled. Please specify --quota-scope=bucket" << std::endl;
+        return EINVAL;
+      }
+      quota_ret = set_user_bucket_quota(opt_cmd, user, user_op, max_size, max_objects);
+    }
+
+    // The helpers already printed the error and return a positive errno;
+    // pass it on so the process does not exit 0 after a failed update.
+    if (quota_ret != 0) {
+      return quota_ret;
+    }
+  }
return 0;
}
diff --git a/src/rgw/rgw_auth_s3.cc b/src/rgw/rgw_auth_s3.cc
index f3f0c8322f0..b8246b784fa 100644
--- a/src/rgw/rgw_auth_s3.cc
+++ b/src/rgw/rgw_auth_s3.cc
@@ -15,6 +15,12 @@ static const char *signed_subresources[] = {
"partNumber",
"policy",
"requestPayment",
+ "response-cache-control",
+ "response-content-disposition",
+ "response-content-encoding",
+ "response-content-language",
+ "response-content-type",
+ "response-expires",
"torrent",
"uploadId",
"uploads",
@@ -73,9 +79,9 @@ static void get_canon_resource(const char *request_uri, map<string, string>& sub
if (!append_str.empty()) {
s.append(append_str);
}
- dout(10) << "get_canon_resource(): dest=" << dest << dendl;
-
dest = s;
+
+ dout(10) << "get_canon_resource(): dest=" << dest << dendl;
}
/*
diff --git a/src/rgw/rgw_bucket.cc b/src/rgw/rgw_bucket.cc
index 1e523f332cf..3267bc51948 100644
--- a/src/rgw/rgw_bucket.cc
+++ b/src/rgw/rgw_bucket.cc
@@ -901,6 +901,7 @@ static int bucket_stats(RGWRados *store, std::string& bucket_name, Formatter *f
formatter->dump_int("mtime", mtime);
formatter->dump_string("max_marker", max_marker);
dump_bucket_usage(stats, formatter);
+ encode_json("bucket_quota", bucket_info.quota, formatter);
formatter->close_section();
return 0;
@@ -1451,7 +1452,12 @@ public:
if (ret < 0)
return ret;
- ret = rgw_unlink_bucket(store, be.owner, entry);
+ /*
+ * We're unlinking the bucket but we don't want to update the entrypoint here — we're removing
+ * it immediately and don't want to invalidate our cached objv_version or the bucket obj removal
+ * will incorrectly fail.
+ */
+ ret = rgw_unlink_bucket(store, be.owner, entry, false);
if (ret < 0) {
lderr(store->ctx()) << "could not unlink bucket=" << entry << " owner=" << be.owner << dendl;
}
diff --git a/src/rgw/rgw_cache.cc b/src/rgw/rgw_cache.cc
index 5b96eb45b08..d0afdcd389c 100644
--- a/src/rgw/rgw_cache.cc
+++ b/src/rgw/rgw_cache.cc
@@ -107,7 +107,7 @@ void ObjectCache::remove(string& name)
void ObjectCache::touch_lru(string& name, std::list<string>::iterator& lru_iter)
{
- while (lru.size() > (size_t)cct->_conf->rgw_cache_lru_size) {
+ while (lru_size > (size_t)cct->_conf->rgw_cache_lru_size) {
list<string>::iterator iter = lru.begin();
if ((*iter).compare(name) == 0) {
/*
@@ -121,10 +121,12 @@ void ObjectCache::touch_lru(string& name, std::list<string>::iterator& lru_iter)
if (map_iter != cache_map.end())
cache_map.erase(map_iter);
lru.pop_front();
+ lru_size--;
}
if (lru_iter == lru.end()) {
lru.push_back(name);
+ lru_size++;
lru_iter--;
ldout(cct, 10) << "adding " << name << " to cache LRU end" << dendl;
} else {
@@ -142,6 +144,7 @@ void ObjectCache::remove_lru(string& name, std::list<string>::iterator& lru_iter
return;
lru.erase(lru_iter);
+ lru_size--;
lru_iter = lru.end();
}
diff --git a/src/rgw/rgw_cache.h b/src/rgw/rgw_cache.h
index 601fcdfc963..68720d0e6ac 100644
--- a/src/rgw/rgw_cache.h
+++ b/src/rgw/rgw_cache.h
@@ -131,13 +131,14 @@ struct ObjectCacheEntry {
class ObjectCache {
std::map<string, ObjectCacheEntry> cache_map;
std::list<string> lru;
+ unsigned long lru_size;
Mutex lock;
CephContext *cct;
void touch_lru(string& name, std::list<string>::iterator& lru_iter);
void remove_lru(string& name, std::list<string>::iterator& lru_iter);
public:
- ObjectCache() : lock("ObjectCache"), cct(NULL) { }
+ ObjectCache() : lru_size(0), lock("ObjectCache"), cct(NULL) { }
int get(std::string& name, ObjectCacheInfo& bl, uint32_t mask);
void put(std::string& name, ObjectCacheInfo& bl);
void remove(std::string& name);
diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc
index ef0a2604d51..c872314fe4e 100644
--- a/src/rgw/rgw_common.cc
+++ b/src/rgw/rgw_common.cc
@@ -123,8 +123,8 @@ void req_info::rebuild_from(req_info& src)
req_state::req_state(CephContext *_cct, class RGWEnv *e) : cct(_cct), cio(NULL), op(OP_UNKNOWN),
- bucket_cors(NULL), has_acl_header(false),
- os_auth_token(NULL), info(_cct, e)
+ has_acl_header(false),
+ os_auth_token(NULL), info(_cct, e)
{
enable_ops_log = e->conf->enable_ops_log;
enable_usage_log = e->conf->enable_usage_log;
@@ -162,7 +162,6 @@ req_state::req_state(CephContext *_cct, class RGWEnv *e) : cct(_cct), cio(NULL),
req_state::~req_state() {
delete formatter;
delete bucket_acl;
- delete bucket_cors;
delete object_acl;
free((void *)object);
free((void *)bucket_name);
diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h
index 405c3d9b3ba..baf60001a8b 100644
--- a/src/rgw/rgw_common.h
+++ b/src/rgw/rgw_common.h
@@ -29,6 +29,7 @@
#include "include/utime.h"
#include "rgw_acl.h"
#include "rgw_cors.h"
+#include "rgw_quota.h"
#include "cls/version/cls_version_types.h"
#include "include/rados/librados.hpp"
@@ -90,6 +91,7 @@ using ceph::crypto::MD5;
#define RGW_OP_TYPE_WRITE 0x02
#define RGW_OP_TYPE_DELETE 0x04
+#define RGW_OP_TYPE_MODIFY (RGW_OP_TYPE_WRITE | RGW_OP_TYPE_DELETE)
#define RGW_OP_TYPE_ALL (RGW_OP_TYPE_READ | RGW_OP_TYPE_WRITE | RGW_OP_TYPE_DELETE)
#define RGW_DEFAULT_MAX_BUCKETS 1000
@@ -128,6 +130,7 @@ using ceph::crypto::MD5;
#define ERR_NOT_FOUND 2023
#define ERR_PERMANENT_REDIRECT 2024
#define ERR_LOCKED 2025
+#define ERR_QUOTA_EXCEEDED 2026
#define ERR_USER_SUSPENDED 2100
#define ERR_INTERNAL_ERROR 2200
@@ -423,11 +426,12 @@ struct RGWUserInfo
__u8 system;
string default_placement;
list<string> placement_tags;
+ RGWQuotaInfo bucket_quota;
RGWUserInfo() : auid(0), suspended(0), max_buckets(RGW_DEFAULT_MAX_BUCKETS), op_mask(RGW_OP_TYPE_ALL), system(0) {}
void encode(bufferlist& bl) const {
- ENCODE_START(13, 9, bl);
+ ENCODE_START(14, 9, bl);
::encode(auid, bl);
string access_key;
string secret_key;
@@ -462,6 +466,7 @@ struct RGWUserInfo
::encode(system, bl);
::encode(default_placement, bl);
::encode(placement_tags, bl);
+ ::encode(bucket_quota, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator& bl) {
@@ -518,6 +523,9 @@ struct RGWUserInfo
::decode(default_placement, bl);
::decode(placement_tags, bl); /* tags of allowed placement rules */
}
+ if (struct_v >= 14) {
+ ::decode(bucket_quota, bl);
+ }
DECODE_FINISH(bl);
}
void dump(Formatter *f) const;
@@ -599,6 +607,10 @@ struct rgw_bucket {
void dump(Formatter *f) const;
void decode_json(JSONObj *obj);
static void generate_test_instances(list<rgw_bucket*>& o);
+
+ // Orders buckets by name only (no other field takes part in the
+ // comparison).
+ bool operator<(const rgw_bucket& b) const {
+ return name.compare(b.name) < 0;
+ }
};
WRITE_CLASS_ENCODER(rgw_bucket)
@@ -661,9 +673,10 @@ struct RGWBucketInfo
bool has_instance_obj;
RGWObjVersionTracker objv_tracker; /* we don't need to serialize this, for runtime tracking */
obj_version ep_objv; /* entry point object version, for runtime tracking only */
+ RGWQuotaInfo quota;
void encode(bufferlist& bl) const {
- ENCODE_START(8, 4, bl);
+ ENCODE_START(9, 4, bl);
::encode(bucket, bl);
::encode(owner, bl);
::encode(flags, bl);
@@ -672,6 +685,7 @@ struct RGWBucketInfo
::encode(ct, bl);
::encode(placement_rule, bl);
::encode(has_instance_obj, bl);
+ ::encode(quota, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator& bl) {
@@ -692,6 +706,8 @@ struct RGWBucketInfo
::decode(placement_rule, bl);
if (struct_v >= 8)
::decode(has_instance_obj, bl);
+ if (struct_v >= 9)
+ ::decode(quota, bl);
DECODE_FINISH(bl);
}
void dump(Formatter *f) const;
@@ -754,6 +770,8 @@ struct RGWBucketStats
uint64_t num_kb;
uint64_t num_kb_rounded;
uint64_t num_objects;
+
+ RGWBucketStats() : num_kb(0), num_kb_rounded(0), num_objects(0) {}
};
struct req_state;
@@ -821,7 +839,6 @@ struct req_state {
RGWUserInfo user;
RGWAccessControlPolicy *bucket_acl;
RGWAccessControlPolicy *object_acl;
- RGWCORSConfiguration *bucket_cors;
bool system_request;
@@ -1214,6 +1231,11 @@ static inline const char *rgw_obj_category_name(RGWObjCategory category)
return "unknown";
}
+/*
+ * Convert a byte count to kilobytes (1024 bytes), rounding up: 0 -> 0,
+ * 1..1024 -> 1, 1025 -> 2, ...
+ */
+static inline uint64_t rgw_rounded_kb(uint64_t bytes)
+{
+  // Divide before rounding: "bytes + 1023" would wrap around for values
+  // within 1023 of the uint64_t maximum and yield 0.
+  return bytes / 1024 + ((bytes % 1024) ? 1 : 0);
+}
+
extern string rgw_string_unquote(const string& s);
extern void parse_csv_string(const string& ival, vector<string>& ovals);
extern int parse_key_value(string& in_str, string& key, string& val);
diff --git a/src/rgw/rgw_cors.cc b/src/rgw/rgw_cors.cc
index 033bfa2f215..4be83605b50 100644
--- a/src/rgw/rgw_cors.cc
+++ b/src/rgw/rgw_cors.cc
@@ -79,7 +79,10 @@ static bool is_string_in_set(set<string>& s, string h) {
<< ", at offset not less than " << flen << dendl;
if (h.compare((h.size() - sl.size()), sl.size(), sl) != 0)
continue;
+ ssplit.pop_front();
}
+ if (!ssplit.empty())
+ continue;
return true;
}
}
diff --git a/src/rgw/rgw_cors.h b/src/rgw/rgw_cors.h
index 415f3f0b869..1e0ec3bc7ec 100644
--- a/src/rgw/rgw_cors.h
+++ b/src/rgw/rgw_cors.h
@@ -25,11 +25,13 @@
#define RGW_CORS_HEAD 0x4
#define RGW_CORS_POST 0x8
#define RGW_CORS_DELETE 0x10
-#define RGW_CORS_ALL (RGW_CORS_GET | \
- RGW_CORS_PUT | \
- RGW_CORS_HEAD | \
- RGW_CORS_POST | \
- RGW_CORS_DELETE)
+#define RGW_CORS_COPY 0x20
+#define RGW_CORS_ALL (RGW_CORS_GET | \
+ RGW_CORS_PUT | \
+ RGW_CORS_HEAD | \
+ RGW_CORS_POST | \
+ RGW_CORS_DELETE | \
+ RGW_CORS_COPY)
#define CORS_MAX_AGE_INVALID ((uint32_t)-1)
diff --git a/src/rgw/rgw_cors_s3.cc b/src/rgw/rgw_cors_s3.cc
index c1448f26b23..01150a9e65b 100644
--- a/src/rgw/rgw_cors_s3.cc
+++ b/src/rgw/rgw_cors_s3.cc
@@ -44,6 +44,8 @@ void RGWCORSRule_S3::to_xml(XMLFormatter& f) {
f.dump_string("AllowedMethod", "HEAD");
if (allowed_methods & RGW_CORS_POST)
f.dump_string("AllowedMethod", "POST");
+ if (allowed_methods & RGW_CORS_COPY)
+ f.dump_string("AllowedMethod", "COPY");
/*AllowedOrigins*/
for(set<string>::iterator it = allowed_origins.begin();
it != allowed_origins.end();
@@ -87,6 +89,8 @@ bool RGWCORSRule_S3::xml_end(const char *el) {
allowed_methods |= RGW_CORS_HEAD;
} else if (strcasecmp(s, "PUT") == 0) {
allowed_methods |= RGW_CORS_PUT;
+ } else if (strcasecmp(s, "COPY") == 0) {
+ allowed_methods |= RGW_CORS_COPY;
} else {
return false;
}
diff --git a/src/rgw/rgw_http_errors.h b/src/rgw/rgw_http_errors.h
index 6cb9fabf6c0..ba3e522651f 100644
--- a/src/rgw/rgw_http_errors.h
+++ b/src/rgw/rgw_http_errors.h
@@ -36,6 +36,7 @@ const static struct rgw_http_errors RGW_HTTP_ERRORS[] = {
{ EPERM, 403, "AccessDenied" },
{ ERR_USER_SUSPENDED, 403, "UserSuspended" },
{ ERR_REQUEST_TIME_SKEWED, 403, "RequestTimeTooSkewed" },
+ { ERR_QUOTA_EXCEEDED, 403, "QuotaExceeded" },
{ ENOENT, 404, "NoSuchKey" },
{ ERR_NO_SUCH_BUCKET, 404, "NoSuchBucket" },
{ ERR_NO_SUCH_UPLOAD, 404, "NoSuchUpload" },
diff --git a/src/rgw/rgw_json_enc.cc b/src/rgw/rgw_json_enc.cc
index 05d7206ba44..4d6b25374b9 100644
--- a/src/rgw/rgw_json_enc.cc
+++ b/src/rgw/rgw_json_enc.cc
@@ -6,6 +6,7 @@
#include "rgw_acl_s3.h"
#include "rgw_cache.h"
#include "rgw_bucket.h"
+#include "rgw_keystone.h"
#include "common/ceph_json.h"
#include "common/Formatter.h"
@@ -395,6 +396,7 @@ void RGWUserInfo::dump(Formatter *f) const
}
encode_json("default_placement", default_placement, f);
encode_json("placement_tags", placement_tags, f);
+ encode_json("bucket_quota", bucket_quota, f);
}
@@ -445,6 +447,21 @@ void RGWUserInfo::decode_json(JSONObj *obj)
system = (__u8)sys;
JSONDecoder::decode_json("default_placement", default_placement, obj);
JSONDecoder::decode_json("placement_tags", placement_tags, obj);
+ JSONDecoder::decode_json("bucket_quota", bucket_quota, obj);
+}
+
+// Emit the quota record through the formatter as three fields, in this
+// order: "enabled", "max_size_kb", "max_objects".
+void RGWQuotaInfo::dump(Formatter *f) const
+{
+ f->dump_bool("enabled", enabled);
+ f->dump_int("max_size_kb", max_size_kb);
+ f->dump_int("max_objects", max_objects);
+}
+
+// Read the fields written by dump() ("max_size_kb", "max_objects",
+// "enabled") back from a JSON object.
+void RGWQuotaInfo::decode_json(JSONObj *obj)
+{
+ JSONDecoder::decode_json("max_size_kb", max_size_kb, obj);
+ JSONDecoder::decode_json("max_objects", max_objects, obj);
+ JSONDecoder::decode_json("enabled", enabled, obj);
+}
void rgw_bucket::dump(Formatter *f) const
@@ -496,6 +513,7 @@ void RGWBucketInfo::dump(Formatter *f) const
encode_json("region", region, f);
encode_json("placement_rule", placement_rule, f);
encode_json("has_instance_obj", has_instance_obj, f);
+ encode_json("quota", quota, f);
}
void RGWBucketInfo::decode_json(JSONObj *obj) {
@@ -506,6 +524,7 @@ void RGWBucketInfo::decode_json(JSONObj *obj) {
JSONDecoder::decode_json("region", region, obj);
JSONDecoder::decode_json("placement_rule", placement_rule, obj);
JSONDecoder::decode_json("has_instance_obj", has_instance_obj, obj);
+ JSONDecoder::decode_json("quota", quota, obj);
}
void RGWObjEnt::dump(Formatter *f) const
@@ -672,12 +691,14 @@ void RGWRegionMap::dump(Formatter *f) const
{
encode_json("regions", regions, f);
encode_json("master_region", master_region, f);
+ encode_json("bucket_quota", bucket_quota, f);
}
void RGWRegionMap::decode_json(JSONObj *obj)
{
JSONDecoder::decode_json("regions", regions, obj);
JSONDecoder::decode_json("master_region", master_region, obj);
+ JSONDecoder::decode_json("bucket_quota", bucket_quota, obj);
}
void RGWMetadataLogInfo::dump(Formatter *f) const
@@ -704,3 +725,70 @@ void RGWDataChangesLogInfo::decode_json(JSONObj *obj)
JSONDecoder::decode_json("last_update", last_update, obj);
}
+// Decode the "metadata" section of a Keystone token: only "is_admin" is read.
+void KeystoneToken::Metadata::decode_json(JSONObj *obj)
+{
+ JSONDecoder::decode_json("is_admin", is_admin, obj);
+}
+
+// Decode one service endpoint. The JSON keys are camel-case ("adminURL",
+// "publicURL", "internalURL") and map onto the snake_case members.
+void KeystoneToken::Service::Endpoint::decode_json(JSONObj *obj)
+{
+ JSONDecoder::decode_json("id", id, obj);
+ JSONDecoder::decode_json("adminURL", admin_url, obj);
+ JSONDecoder::decode_json("publicURL", public_url, obj);
+ JSONDecoder::decode_json("internalURL", internal_url, obj);
+ JSONDecoder::decode_json("region", region, obj);
+}
+
+// Decode one service catalog entry: its type, name and endpoint list.
+// NOTE(review): the trailing `true` argument presumably marks the field as
+// mandatory -- confirm against JSONDecoder::decode_json.
+void KeystoneToken::Service::decode_json(JSONObj *obj)
+{
+ JSONDecoder::decode_json("type", type, obj, true);
+ JSONDecoder::decode_json("name", name, obj, true);
+ JSONDecoder::decode_json("endpoints", endpoints, obj);
+}
+
+// Decode the tenant a token was issued for.
+// NOTE(review): the trailing `true` argument presumably marks the field as
+// mandatory -- confirm against JSONDecoder::decode_json.
+void KeystoneToken::Token::Tenant::decode_json(JSONObj *obj)
+{
+ JSONDecoder::decode_json("id", id, obj, true);
+ JSONDecoder::decode_json("name", name, obj, true);
+ JSONDecoder::decode_json("description", description, obj);
+ JSONDecoder::decode_json("enabled", enabled, obj);
+}
+
+/*
+ * Decode the "token" section: id, tenant and expiration time. The expiration
+ * arrives as an ISO 8601 string and is stored as a UTC time_t. If the string
+ * cannot be parsed, expires is reset to 0 and a JSONDecoder::err is thrown.
+ */
+void KeystoneToken::Token::decode_json(JSONObj *obj)
+{
+  string expires_iso8601;
+
+  JSONDecoder::decode_json("id", id, obj, true);
+  JSONDecoder::decode_json("tenant", tenant, obj, true);
+  JSONDecoder::decode_json("expires", expires_iso8601, obj, true);
+
+  struct tm expires_tm;
+  if (!parse_iso8601(expires_iso8601.c_str(), &expires_tm)) {
+    expires = 0;
+    throw JSONDecoder::err("Failed to parse ISO8601 expiration date from Keystone response.");
+  }
+
+  expires = timegm(&expires_tm);
+}
+
+// Decode one role entry of a user: its id and name.
+void KeystoneToken::User::Role::decode_json(JSONObj *obj)
+{
+ JSONDecoder::decode_json("id", id, obj);
+ JSONDecoder::decode_json("name", name, obj);
+}
+
+// Decode the "user" section. Note that the JSON key "username" is stored in
+// user_name, while "name" is stored in name.
+// NOTE(review): the trailing `true` argument presumably marks the field as
+// mandatory -- confirm against JSONDecoder::decode_json.
+void KeystoneToken::User::decode_json(JSONObj *obj)
+{
+ JSONDecoder::decode_json("id", id, obj, true);
+ JSONDecoder::decode_json("name", name, obj);
+ JSONDecoder::decode_json("username", user_name, obj, true);
+ JSONDecoder::decode_json("roles", roles, obj);
+}
+
+// Decode the four sections of a Keystone "access" object: "metadata",
+// "token", "user" and "serviceCatalog".
+// NOTE(review): the trailing `true` argument presumably marks the field as
+// mandatory -- confirm against JSONDecoder::decode_json.
+void KeystoneToken::decode_json(JSONObj *access_obj)
+{
+ JSONDecoder::decode_json("metadata", metadata, access_obj);
+ JSONDecoder::decode_json("token", token, access_obj, true);
+ JSONDecoder::decode_json("user", user, access_obj, true);
+ JSONDecoder::decode_json("serviceCatalog", service_catalog, access_obj);
+}
diff --git a/src/rgw/rgw_keystone.cc b/src/rgw/rgw_keystone.cc
new file mode 100644
index 00000000000..7c746654129
--- /dev/null
+++ b/src/rgw/rgw_keystone.cc
@@ -0,0 +1,108 @@
+#include <errno.h>
+#include <fnmatch.h>
+
+#include "common/errno.h"
+#include "common/ceph_json.h"
+#include "include/types.h"
+#include "include/str_list.h"
+
+#include "rgw_common.h"
+#include "rgw_keystone.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+/*
+ * Return true if any of the user's role names matches r. The argument is
+ * used as the fnmatch() pattern, so it may contain shell-style wildcards;
+ * the role names held by the user are the strings being matched.
+ */
+bool KeystoneToken::User::has_role(const string& r) {
+  const char *pattern = r.c_str();
+
+  for (list<Role>::iterator role = roles.begin(); role != roles.end(); ++role) {
+    const bool matches = (fnmatch(pattern, role->name.c_str(), 0) == 0);
+    if (matches) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/*
+ * Parse a Keystone response body and fill this object from its "access"
+ * member.
+ *
+ * Returns 0 on success, -EINVAL if the buffer is not valid JSON or if
+ * decoding throws a JSONDecoder::err. Both failures are logged at level 0.
+ */
+int KeystoneToken::parse(CephContext *cct, bufferlist& bl)
+{
+  JSONParser parser;
+
+  const bool well_formed = parser.parse(bl.c_str(), bl.length());
+  if (!well_formed) {
+    ldout(cct, 0) << "Keystone token parse error: malformed json" << dendl;
+    return -EINVAL;
+  }
+
+  try {
+    JSONDecoder::decode_json("access", *this, &parser);
+  } catch (JSONDecoder::err& e) {
+    ldout(cct, 0) << "Keystone token parse error: " << e.message << dendl;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+/*
+ * Look up token_id in the cache.
+ *
+ * On a hit with an unexpired token, the token is copied into `token`, the
+ * entry is moved to the front of the LRU list and true is returned.
+ * If the id is unknown, or the cached token has expired (in which case the
+ * entry is evicted), false is returned and `token` is left untouched.
+ *
+ * The hit/miss perf counters follow the return value: a lookup that returns
+ * false, including the expired case, is counted as a miss.
+ */
+bool RGWKeystoneTokenCache::find(const string& token_id, KeystoneToken& token)
+{
+  bool found = false;
+
+  {
+    // scoped lock, released before the perf counter is touched
+    Mutex::Locker l(lock);
+
+    map<string, token_entry>::iterator iter = tokens.find(token_id);
+    if (iter != tokens.end()) {
+      token_entry& entry = iter->second;
+
+      // the entry leaves its current LRU slot whether it is evicted or
+      // promoted
+      tokens_lru.erase(entry.lru_iter);
+
+      if (entry.token.expired()) {
+        tokens.erase(iter);
+      } else {
+        token = entry.token;
+        tokens_lru.push_front(token_id);
+        entry.lru_iter = tokens_lru.begin();
+        found = true;
+      }
+    }
+  }
+
+  if (perfcounter) {
+    perfcounter->inc(found ? l_rgw_keystone_token_cache_hit
+                           : l_rgw_keystone_token_cache_miss);
+  }
+
+  return found;
+}
+
+/*
+ * Insert or replace the cached token for token_id and make it the most
+ * recently used entry. Afterwards the least recently used entries are
+ * evicted until the LRU list holds at most `max` ids.
+ */
+void RGWKeystoneTokenCache::add(const string& token_id, KeystoneToken& token)
+{
+  Mutex::Locker l(lock);
+
+  // an existing entry gives up its old LRU slot before being re-inserted
+  map<string, token_entry>::iterator existing = tokens.find(token_id);
+  if (existing != tokens.end()) {
+    tokens_lru.erase(existing->second.lru_iter);
+  }
+
+  tokens_lru.push_front(token_id);
+  token_entry& entry = tokens[token_id];
+  entry.token = token;
+  entry.lru_iter = tokens_lru.begin();
+
+  while (tokens_lru.size() > max) {
+    map<string, token_entry>::iterator victim = tokens.find(tokens_lru.back());
+    assert(victim != tokens.end());
+    tokens.erase(victim);
+    tokens_lru.pop_back();
+  }
+}
+
+/*
+ * Drop token_id from the cache (both the map and the LRU list) if present;
+ * unknown ids are ignored. A level-20 log line is written for every token
+ * that is actually removed.
+ */
+void RGWKeystoneTokenCache::invalidate(const string& token_id)
+{
+  Mutex::Locker l(lock);
+
+  map<string, token_entry>::iterator found = tokens.find(token_id);
+  if (found != tokens.end()) {
+    ldout(cct, 20) << "invalidating revoked token id=" << token_id << dendl;
+    tokens_lru.erase(found->second.lru_iter);
+    tokens.erase(found);
+  }
+}
diff --git a/src/rgw/rgw_keystone.h b/src/rgw/rgw_keystone.h
new file mode 100644
index 00000000000..05199eef89d
--- /dev/null
+++ b/src/rgw/rgw_keystone.h
@@ -0,0 +1,106 @@
+#ifndef CEPH_RGW_KEYSTONE_H
+#define CEPH_RGW_KEYSTONE_H
+
+#include "rgw_common.h"
+
+// In-memory form of a Keystone "access" object: metadata, service catalog,
+// token and user. Filled by parse() / decode_json().
+class KeystoneToken {
+public:
+ // "metadata" section; only the is_admin flag is kept.
+ class Metadata {
+ public:
+ Metadata() : is_admin(false) { };
+ bool is_admin;
+ void decode_json(JSONObj *obj);
+ };
+
+ // One entry of the "serviceCatalog" list.
+ class Service {
+ public:
+ // One endpoint of a service (decoded from the adminURL / publicURL /
+ // internalURL JSON keys).
+ class Endpoint {
+ public:
+ string id;
+ string admin_url;
+ string public_url;
+ string internal_url;
+ string region;
+ void decode_json(JSONObj *obj);
+ };
+ string type;
+ string name;
+ list<Endpoint> endpoints;
+ void decode_json(JSONObj *obj);
+ };
+
+ // "token" section: the token id, its expiration time and its tenant.
+ class Token {
+ public:
+ Token() : expires(0) { };
+ class Tenant {
+ public:
+ Tenant() : enabled(false) { };
+ string id;
+ string name;
+ string description;
+ bool enabled;
+ void decode_json(JSONObj *obj);
+ };
+ string id;
+ // expiration time; 0 until decoded (a token with expires == 0 always
+ // reports expired())
+ time_t expires;
+ Tenant tenant;
+ void decode_json(JSONObj *obj);
+ };
+
+ // "user" section, including the roles the user holds.
+ class User {
+ public:
+ class Role {
+ public:
+ string id;
+ string name;
+ void decode_json(JSONObj *obj);
+ };
+ string id;
+ string name;
+ string user_name;
+ list<Role> roles;
+ void decode_json(JSONObj *obj);
+ // true if any role name matches r
+ bool has_role(const string& r);
+ };
+
+ Metadata metadata;
+ list<Service> service_catalog;
+ Token token;
+ User user;
+
+public:
+ // Parse a JSON buffer into this object.
+ int parse(CephContext *cct, bufferlist& bl);
+
+ // True once the current time (in seconds) has reached token.expires.
+ bool expired() {
+ uint64_t now = ceph_clock_now(NULL).sec();
+ return (now >= (uint64_t)token.expires);
+ }
+
+ void decode_json(JSONObj *access_obj);
+};
+
+// Value type of the token cache map: the cached token plus an iterator to
+// the list<string> node that holds this entry's position in the LRU list.
+struct token_entry {
+ KeystoneToken token;
+ list<string>::iterator lru_iter;
+};
+
+// Cache of Keystone tokens keyed by token id, with a list of ids kept in
+// most-recently-used-first order and a maximum entry count.
+class RGWKeystoneTokenCache {
+ CephContext *cct;
+
+ // token id -> cached token and its LRU position
+ map<string, token_entry> tokens;
+ // token ids, most recently used at the front
+ list<string> tokens_lru;
+
+ Mutex lock;
+
+ // maximum number of cached tokens
+ size_t max;
+
+public:
+ // NOTE(review): _max is an int converted to size_t, so a negative value
+ // would become a very large limit -- confirm callers never pass one.
+ RGWKeystoneTokenCache(CephContext *_cct, int _max) : cct(_cct), lock("RGWKeystoneTokenCache"), max(_max) {}
+
+ bool find(const string& token_id, KeystoneToken& token);
+ void add(const string& token_id, KeystoneToken& token);
+ void invalidate(const string& token_id);
+};
+
+
+#endif
diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc
index 12301ba9c5f..acaa5deffee 100644
--- a/src/rgw/rgw_main.cc
+++ b/src/rgw/rgw_main.cc
@@ -323,7 +323,7 @@ void RGWProcess::handle_request(RGWRequest *req)
RGWRESTMgr *mgr;
RGWHandler *handler = rest->get_handler(store, s, &client_io, &mgr, &init_error);
if (init_error != 0) {
- abort_early(s, init_error);
+ abort_early(s, NULL, init_error);
goto done;
}
@@ -332,7 +332,7 @@ void RGWProcess::handle_request(RGWRequest *req)
req->log(s, "getting op");
op = handler->get_op(store);
if (!op) {
- abort_early(s, -ERR_METHOD_NOT_ALLOWED);
+ abort_early(s, NULL, -ERR_METHOD_NOT_ALLOWED);
goto done;
}
req->op = op;
@@ -341,26 +341,33 @@ void RGWProcess::handle_request(RGWRequest *req)
ret = handler->authorize();
if (ret < 0) {
dout(10) << "failed to authorize request" << dendl;
- abort_early(s, ret);
+ abort_early(s, op, ret);
goto done;
}
if (s->user.suspended) {
dout(10) << "user is suspended, uid=" << s->user.user_id << dendl;
- abort_early(s, -ERR_USER_SUSPENDED);
+ abort_early(s, op, -ERR_USER_SUSPENDED);
goto done;
}
req->log(s, "reading permissions");
ret = handler->read_permissions(op);
if (ret < 0) {
- abort_early(s, ret);
+ abort_early(s, op, ret);
+ goto done;
+ }
+
+ req->log(s, "init op");
+ ret = op->init_processing();
+ if (ret < 0) {
+ abort_early(s, op, ret);
goto done;
}
req->log(s, "verifying op mask");
ret = op->verify_op_mask();
if (ret < 0) {
- abort_early(s, ret);
+ abort_early(s, op, ret);
goto done;
}
@@ -370,7 +377,7 @@ void RGWProcess::handle_request(RGWRequest *req)
if (s->system_request) {
dout(2) << "overriding permissions due to system operation" << dendl;
} else {
- abort_early(s, ret);
+ abort_early(s, op, ret);
goto done;
}
}
@@ -378,7 +385,7 @@ void RGWProcess::handle_request(RGWRequest *req)
req->log(s, "verifying op params");
ret = op->verify_params();
if (ret < 0) {
- abort_early(s, ret);
+ abort_early(s, op, ret);
goto done;
}
diff --git a/src/rgw/rgw_metadata.cc b/src/rgw/rgw_metadata.cc
index 6da1ff5ab24..23f73e26531 100644
--- a/src/rgw/rgw_metadata.cc
+++ b/src/rgw/rgw_metadata.cc
@@ -1,7 +1,7 @@
-#include "rgw_metadata.h"
#include "common/ceph_json.h"
+#include "rgw_metadata.h"
#include "cls/version/cls_version_types.h"
#include "rgw_rados.h"
@@ -388,6 +388,8 @@ int RGWMetadataManager::remove(string& metadata_key)
objv_tracker.read_version = obj->get_version();
+ delete obj;
+
return handler->remove(store, entry, objv_tracker);
}
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index d34e18bc4ba..2e07e3fcde6 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -421,6 +421,152 @@ int RGWOp::verify_op_mask()
return 0;
}
+/*
+ * Select the quota that applies to this request and store it in bucket_quota.
+ *
+ * bucket_quota is left untouched for system requests, for users whose op mask
+ * lacks RGW_OP_TYPE_MODIFY, and for requests that do not name an object.
+ * Otherwise the first enabled quota wins, in this order: the bucket's own
+ * quota, then the bucket owner's per-user bucket quota; if neither is enabled
+ * the region map's bucket quota is used as-is.
+ *
+ * Returns 0 on success, or the negative error from looking up the bucket
+ * owner when the requester is not the owner.
+ */
+int RGWOp::init_quota()
+{
+  const bool can_modify = (s->user.op_mask & RGW_OP_TYPE_MODIFY) != 0;
+  if (s->system_request || !can_modify || s->object_str.empty()) {
+    return 0;
+  }
+
+  /* an explicit per-bucket quota takes precedence over everything else */
+  if (s->bucket_info.quota.enabled) {
+    bucket_quota = s->bucket_info.quota;
+    return 0;
+  }
+
+  if (!(s->user.user_id == s->bucket_owner.get_id())) {
+    /* requester is not the owner: the owner's settings have to be fetched */
+    RGWUserInfo owner_info;
+    int r = rgw_get_user_info_by_uid(store, s->bucket_info.owner, owner_info);
+    if (r < 0) {
+      return r;
+    }
+    if (owner_info.bucket_quota.enabled) {
+      bucket_quota = owner_info.bucket_quota;
+      return 0;
+    }
+  } else if (s->user.bucket_quota.enabled) {
+    /* requester owns the bucket: its user info is already loaded */
+    bucket_quota = s->user.bucket_quota;
+    return 0;
+  }
+
+  /* fall back to the region-wide default */
+  bucket_quota = store->region_map.bucket_quota;
+  return 0;
+}
+
+/*
+ * Check whether the HTTP method named by req_meth is permitted by a CORS rule.
+ *
+ * rule:     the CORS rule to test against; must not be NULL.
+ * req_meth: method name ("GET", "POST", "PUT", "DELETE" or "HEAD"); may be NULL.
+ *
+ * Returns true only when req_meth is one of the recognized methods and the
+ * rule's allowed-methods mask contains it. A NULL or unrecognized method is
+ * rejected: previously a NULL method was handed to strcmp(), and an
+ * unrecognized one left the flag mask at 0, which made the mask test
+ * trivially succeed.
+ */
+static bool validate_cors_rule_method(RGWCORSRule *rule, const char *req_meth) {
+  if (!req_meth) {
+    dout(5) << "req_meth is null" << dendl;
+    return false;
+  }
+
+  uint8_t flags = 0;
+  if (strcmp(req_meth, "GET") == 0) flags = RGW_CORS_GET;
+  else if (strcmp(req_meth, "POST") == 0) flags = RGW_CORS_POST;
+  else if (strcmp(req_meth, "PUT") == 0) flags = RGW_CORS_PUT;
+  else if (strcmp(req_meth, "DELETE") == 0) flags = RGW_CORS_DELETE;
+  else if (strcmp(req_meth, "HEAD") == 0) flags = RGW_CORS_HEAD;
+
+  /* flags == 0 means the method has no CORS flag at all, so no rule can allow it */
+  if (flags == 0 || (rule->get_allowed_methods() & flags) != flags) {
+    dout(5) << "Method " << req_meth << " is not supported" << dendl;
+    return false;
+  }
+
+  dout(10) << "Method " << req_meth << " is supported" << dendl;
+  return true;
+}
+
+/*
+ * Load the bucket's CORS configuration from the request's bucket attributes
+ * into bucket_cors.
+ *
+ * Sets cors_exist to whether the RGW_ATTR_CORS attribute is present.
+ * Returns 0 when the attribute is absent or decoded successfully, and -EIO
+ * when the stored attribute cannot be decoded (cors_exist stays true in that
+ * case, so callers must check the return value first).
+ */
+int RGWOp::read_bucket_cors()
+{
+  map<string, bufferlist>::iterator aiter = s->bucket_attrs.find(RGW_ATTR_CORS);
+  if (aiter == s->bucket_attrs.end()) {
+    ldout(s->cct, 20) << "no CORS configuration attr found" << dendl;
+    cors_exist = false;
+    return 0; /* no CORS configuration found */
+  }
+
+  cors_exist = true;
+
+  /* decode straight from the stored attribute; no private copy is needed */
+  bufferlist::iterator iter = aiter->second.begin();
+  try {
+    bucket_cors.decode(iter);
+  } catch (buffer::error& err) {
+    ldout(s->cct, 0) << "ERROR: could not decode CORS configuration, caught buffer::error" << dendl;
+    return -EIO;
+  }
+
+  /* dumping the configuration as XML is only done when level-15 logging is gathered */
+  if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) {
+    RGWCORSConfiguration_S3 *s3cors = static_cast<RGWCORSConfiguration_S3 *>(&bucket_cors);
+    ldout(s->cct, 15) << "Read RGWCORSConfiguration";
+    s3cors->to_xml(*_dout);
+    *_dout << dendl;
+  }
+  return 0;
+}
+
+/*
+ * Fill in the values for the CORS response headers from a matched rule.
+ *
+ * req_hdrs: list of header names the client asked for, or NULL. Each name the
+ *           rule allows is appended to hdrs, comma separated; the others are
+ *           only logged.
+ * exp_hdrs: receives the rule's exposed headers (via format_exp_headers()).
+ * max_age:  receives the rule's max age.
+ */
+static void get_cors_response_headers(RGWCORSRule *rule, const char *req_hdrs, string& hdrs, string& exp_hdrs, unsigned *max_age) {
+  if (req_hdrs) {
+    list<string> requested;
+    get_str_list(req_hdrs, requested);
+
+    list<string>::iterator cur;
+    for (cur = requested.begin(); cur != requested.end(); ++cur) {
+      const string& name = *cur;
+      if (!rule->is_header_allowed(name.c_str(), name.length())) {
+        dout(5) << "Header " << name << " is not registered in this rule" << dendl;
+        continue;
+      }
+      if (!hdrs.empty()) {
+        hdrs.append(",");
+      }
+      hdrs.append(name);
+    }
+  }
+
+  rule->format_exp_headers(exp_hdrs);
+  *max_age = rule->get_max_age();
+}
+
+/*
+ * Work out the CORS response header values for the current request.
+ *
+ * origin:      receives the request's Origin header value.
+ * method:      receives the method that was validated (the
+ *              Access-Control-Request-Method header when present, otherwise
+ *              the request's own method).
+ * headers:     receives the requested headers that the matching rule allows.
+ * exp_headers: receives the rule's exposed headers.
+ * max_age:     receives the rule's max age.
+ *
+ * Returns false (no CORS headers should be sent) when the request has no
+ * Origin, the bucket's CORS configuration is missing or unreadable, no rule
+ * matches the origin, no method can be determined, or the rule does not allow
+ * the method. `origin` may already have been assigned when false is returned.
+ */
+bool RGWOp::generate_cors_headers(string& origin, string& method, string& headers, string& exp_headers, unsigned *max_age)
+{
+  const char *orig = s->info.env->get("HTTP_ORIGIN");
+  if (!orig) {
+    return false;
+  }
+  origin = orig;
+  int ret = read_bucket_cors();
+  if (ret < 0) {
+    return false;
+  }
+
+  if (!cors_exist) {
+    dout(2) << "No CORS configuration set yet for this bucket" << dendl;
+    return false;
+  }
+
+  RGWCORSRule *rule = bucket_cors.host_name_rule(orig);
+  if (!rule)
+    return false;
+
+  const char *req_meth = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_METHOD");
+  if (!req_meth) {
+    req_meth = s->info.method;
+  }
+
+  /* without any method there is nothing to validate; never pass NULL on to strcmp() */
+  if (!req_meth) {
+    return false;
+  }
+  method = req_meth;
+
+  if (!validate_cors_rule_method(rule, req_meth)) {
+    return false;
+  }
+
+  /*
+   * The headers a client asks permission for arrive in the
+   * Access-Control-Request-Headers request header; Access-Control-Allow-Headers
+   * is what the server sends back and is never present in the request.
+   */
+  const char *req_hdrs = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_HEADERS");
+
+  get_cors_response_headers(rule, req_hdrs, headers, exp_headers, max_age);
+
+  return true;
+}
int RGWGetObj::read_user_manifest_part(rgw_bucket& bucket, RGWObjEnt& ent, RGWAccessControlPolicy *bucket_policy, off_t start_ofs, off_t end_ofs)
{
@@ -1166,7 +1312,7 @@ int RGWPutObjProcessor_Multipart::prepare(RGWRados *store, void *obj_ctx)
int RGWPutObjProcessor_Multipart::do_complete(string& etag, time_t *mtime, time_t set_mtime, map<string, bufferlist>& attrs)
{
- complete_parts();
+ complete_writing_data();
RGWRados::PutObjMetaExtraParams params;
params.set_mtime = set_mtime;
@@ -1258,6 +1404,14 @@ void RGWPutObj::execute()
ldout(s->cct, 15) << "supplied_md5=" << supplied_md5 << dendl;
}
+ if (!chunked_upload) { /* with chunked upload we don't know how big is the upload.
+ we also check sizes at the end anyway */
+ ret = store->check_quota(s->bucket, bucket_quota, s->content_length);
+ if (ret < 0) {
+ goto done;
+ }
+ }
+
if (supplied_etag) {
strncpy(supplied_md5, supplied_etag, sizeof(supplied_md5) - 1);
supplied_md5[sizeof(supplied_md5) - 1] = '\0';
@@ -1302,6 +1456,11 @@ void RGWPutObj::execute()
s->obj_size = ofs;
perfcounter->inc(l_rgw_put_b, s->obj_size);
+ ret = store->check_quota(s->bucket, bucket_quota, s->obj_size);
+ if (ret < 0) {
+ goto done;
+ }
+
hash.Final(m);
buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5);
@@ -1499,6 +1658,13 @@ void RGWPutMetadata::execute()
}
}
+ map<string, string>::iterator giter;
+ for (giter = s->generic_attrs.begin(); giter != s->generic_attrs.end(); ++giter) {
+ bufferlist& attrbl = attrs[giter->first];
+ const string& val = giter->second;
+ attrbl.append(val.c_str(), val.size() + 1);
+ }
+
if (has_policy) {
policy.encode(bl);
attrs[RGW_ATTR_ACL] = bl;
@@ -1835,37 +2001,6 @@ void RGWPutACLs::execute()
}
}
-static int read_bucket_cors(RGWRados *store, struct req_state *s, RGWCORSConfiguration *bucket_cors, bool *exist)
-{
- bufferlist bl;
-
- map<string, bufferlist>::iterator aiter = s->bucket_attrs.find(RGW_ATTR_CORS);
- if (aiter == s->bucket_attrs.end()) {
- ldout(s->cct, 20) << "no CORS configuration attr found" << dendl;
- *exist = false;
- return 0; /* no CORS configuration found */
- }
-
- *exist = true;
-
- bl = aiter->second;
-
- bufferlist::iterator iter = bl.begin();
- try {
- bucket_cors->decode(iter);
- } catch (buffer::error& err) {
- ldout(s->cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
- return -EIO;
- }
- if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) {
- RGWCORSConfiguration_S3 *s3cors = static_cast<RGWCORSConfiguration_S3 *>(bucket_cors);
- ldout(s->cct, 15) << "Read RGWCORSConfiguration";
- s3cors->to_xml(*_dout);
- *_dout << dendl;
- }
- return 0;
-}
-
int RGWGetCORS::verify_permission()
{
if (s->user.user_id.compare(s->bucket_owner.get_id()) != 0)
@@ -1876,9 +2011,7 @@ int RGWGetCORS::verify_permission()
void RGWGetCORS::execute()
{
- bool cors_exist;
-
- ret = read_bucket_cors(store, s, &bucket_cors, &cors_exist);
+ ret = read_bucket_cors();
if (ret < 0)
return ;
@@ -1922,9 +2055,7 @@ int RGWDeleteCORS::verify_permission()
void RGWDeleteCORS::execute()
{
- bool cors_exist;
- RGWCORSConfiguration bucket_cors;
- ret = read_bucket_cors(store, s, &bucket_cors, &cors_exist);
+ ret = read_bucket_cors();
if (ret < 0)
return;
@@ -1961,52 +2092,34 @@ void RGWDeleteCORS::execute()
}
void RGWOptionsCORS::get_response_params(string& hdrs, string& exp_hdrs, unsigned *max_age) {
- if (req_hdrs) {
- list<string> hl;
- get_str_list(req_hdrs, hl);
- for(list<string>::iterator it = hl.begin(); it != hl.end(); ++it) {
- if (!rule->is_header_allowed((*it).c_str(), (*it).length())) {
- dout(5) << "Header " << (*it) << " is not registered in this rule" << dendl;
- } else {
- if (hdrs.length() > 0)hdrs.append(",");
- hdrs.append((*it));
- }
- }
- }
- rule->format_exp_headers(exp_hdrs);
- *max_age = rule->get_max_age();
+ get_cors_response_headers(rule, req_hdrs, hdrs, exp_hdrs, max_age);
}
-int RGWOptionsCORS::validate_cors_request() {
- RGWCORSConfiguration *cc = s->bucket_cors;
+int RGWOptionsCORS::validate_cors_request(RGWCORSConfiguration *cc) {
rule = cc->host_name_rule(origin);
if (!rule) {
- dout(10) << "There is no corsrule present for " << origin << dendl;
+ dout(10) << "There is no cors rule present for " << origin << dendl;
return -ENOENT;
}
- uint8_t flags = 0;
- if (strcmp(req_meth, "GET") == 0) flags = RGW_CORS_GET;
- else if (strcmp(req_meth, "POST") == 0) flags = RGW_CORS_POST;
- else if (strcmp(req_meth, "PUT") == 0) flags = RGW_CORS_PUT;
- else if (strcmp(req_meth, "DELETE") == 0) flags = RGW_CORS_DELETE;
- else if (strcmp(req_meth, "HEAD") == 0) flags = RGW_CORS_HEAD;
-
- if ((rule->get_allowed_methods() & flags) == flags) {
- dout(10) << "Method " << req_meth << " is supported" << dendl;
- } else {
- dout(5) << "Method " << req_meth << " is not supported" << dendl;
- req_meth = NULL;
- return -ENOTSUP;
+ if (!validate_cors_rule_method(rule, req_meth)) {
+ return -ENOENT;
}
return 0;
}
void RGWOptionsCORS::execute()
{
- if (!s->bucket_cors) {
- dout(2) << "No CORS configuration set yet for this bucket" << dendl;
- ret = -EACCES;
+ ret = read_bucket_cors();
+ if (ret < 0)
+ return;
+
+ origin = s->info.env->get("HTTP_ORIGIN");
+ if (!origin) {
+ dout(0) <<
+ "Preflight request without mandatory Origin header"
+ << dendl;
+ ret = -EINVAL;
return;
}
req_meth = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_METHOD");
@@ -2014,19 +2127,16 @@ void RGWOptionsCORS::execute()
dout(0) <<
"Preflight request without mandatory Access-control-request-method header"
<< dendl;
- ret = -EACCES;
+ ret = -EINVAL;
return;
}
- origin = s->info.env->get("HTTP_ORIGIN");
- if (!origin) {
- dout(0) <<
- "Preflight request without mandatory Origin header"
- << dendl;
- ret = -EACCES;
+ if (!cors_exist) {
+ dout(2) << "No CORS configuration set yet for this bucket" << dendl;
+ ret = -ENOENT;
return;
}
req_hdrs = s->info.env->get("HTTP_ACCESS_CONTROL_ALLOW_HEADERS");
- ret = validate_cors_request();
+ ret = validate_cors_request(&bucket_cors);
if (!rule) {
origin = req_meth = NULL;
return;
diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h
index d158f831cc7..eee5ea99065 100644
--- a/src/rgw/rgw_op.h
+++ b/src/rgw/rgw_op.h
@@ -20,6 +20,7 @@
#include "rgw_bucket.h"
#include "rgw_acl.h"
#include "rgw_cors.h"
+#include "rgw_quota.h"
using namespace std;
@@ -34,15 +35,31 @@ protected:
struct req_state *s;
RGWHandler *dialect_handler;
RGWRados *store;
+ RGWCORSConfiguration bucket_cors;
+ bool cors_exist;
+ RGWQuotaInfo bucket_quota;
+
+ virtual int init_quota();
public:
- RGWOp() : s(NULL), dialect_handler(NULL), store(NULL) {}
+ RGWOp() : s(NULL), dialect_handler(NULL), store(NULL), cors_exist(false) {}
virtual ~RGWOp() {}
+ virtual int init_processing() {
+ int ret = init_quota();
+ if (ret < 0)
+ return ret;
+
+ return 0;
+ }
+
virtual void init(RGWRados *store, struct req_state *s, RGWHandler *dialect_handler) {
this->store = store;
this->s = s;
this->dialect_handler = dialect_handler;
}
+ int read_bucket_cors();
+ bool generate_cors_headers(string& origin, string& method, string& headers, string& exp_headers, unsigned *max_age);
+
virtual int verify_params() { return 0; }
virtual bool prefetch_data() { return false; }
virtual int verify_permission() = 0;
@@ -526,7 +543,6 @@ public:
class RGWGetCORS : public RGWOp {
protected:
int ret;
- RGWCORSConfiguration bucket_cors;
public:
RGWGetCORS() : ret(0) {}
@@ -586,7 +602,7 @@ public:
}
int verify_permission() {return 0;}
- int validate_cors_request();
+ int validate_cors_request(RGWCORSConfiguration *cc);
void execute();
void get_response_params(string& allowed_hdrs, string& exp_hdrs, unsigned *max_age);
virtual void send_response() = 0;
diff --git a/src/rgw/rgw_quota.cc b/src/rgw/rgw_quota.cc
new file mode 100644
index 00000000000..66609ca723c
--- /dev/null
+++ b/src/rgw/rgw_quota.cc
@@ -0,0 +1,332 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "include/utime.h"
+#include "common/lru_map.h"
+#include "common/RefCountedObj.h"
+
+#include "rgw_common.h"
+#include "rgw_rados.h"
+#include "rgw_quota.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+
+/*
+ * One entry of the bucket stats cache: the cached totals plus two deadlines.
+ * RGWBucketStatsCache::set_stats() sets expiration to now + rgw_bucket_quota_ttl
+ * and async_refresh_time to now + rgw_bucket_quota_ttl / 2.
+ * RGWBucketStatsAsyncTestSet resets async_refresh_time to 0 when a background
+ * refresh is started, and a value of 0 suppresses further refresh attempts.
+ */
+struct RGWQuotaBucketStats {
+ RGWBucketStats stats;
+ utime_t expiration;
+ utime_t async_refresh_time;
+};
+
+/*
+ * Cache of per-bucket usage totals used for quota checks. Entries live in an
+ * lru_map sized by the rgw_bucket_quota_cache_size config option.
+ */
+class RGWBucketStatsCache {
+ RGWRados *store;
+ lru_map<rgw_bucket, RGWQuotaBucketStats> stats_map;
+ /* counts in-flight async refreshes: get() in async_refresh(), put() when one finishes or fails to start */
+ RefCountedWaitObject *async_refcount;
+
+ /* synchronously read the bucket's stats from the store and sum them over all categories */
+ int fetch_bucket_totals(rgw_bucket& bucket, RGWBucketStats& stats);
+
+public:
+ /* NOTE: stats_map's initializer reads the member `store`, which is declared (and so initialized) before it. */
+ RGWBucketStatsCache(RGWRados *_store) : store(_store), stats_map(store->ctx()->_conf->rgw_bucket_quota_cache_size) {
+ async_refcount = new RefCountedWaitObject;
+ }
+ ~RGWBucketStatsCache() {
+ async_refcount->put_wait(); /* wait for all pending async requests to complete */
+ }
+
+ /* return totals for `bucket`, from the cache when allowed, otherwise freshly fetched (and then cached) */
+ int get_bucket_stats(rgw_bucket& bucket, RGWBucketStats& stats, RGWQuotaInfo& quota);
+ /* apply an object/byte delta to the cached entry for `bucket` */
+ void adjust_bucket_stats(rgw_bucket& bucket, int objs_delta, uint64_t added_bytes, uint64_t removed_bytes);
+
+ /* false once the cached totals have reached the quota's soft thresholds */
+ bool can_use_cached_stats(RGWQuotaInfo& quota, RGWBucketStats& stats);
+
+ /* store `stats` in `qs`, stamp its deadlines and add it to the map */
+ void set_stats(rgw_bucket& bucket, RGWQuotaBucketStats& qs, RGWBucketStats& stats);
+ /* start a background refresh for `bucket`; the `qs` argument is not used by the implementation */
+ int async_refresh(rgw_bucket& bucket, RGWQuotaBucketStats& qs);
+ /* completion hook of a background refresh: caches the new totals */
+ void async_refresh_response(rgw_bucket& bucket, RGWBucketStats& stats);
+};
+
+/*
+ * Decide whether cached totals are still trustworthy for this quota.
+ *
+ * For each limit that is set (>= 0), the matching soft threshold is computed
+ * on first use as limit * rgw_bucket_quota_soft_threshold and stored back into
+ * `quota`. Cached stats may not be used once a total has reached its soft
+ * threshold.
+ */
+bool RGWBucketStatsCache::can_use_cached_stats(RGWQuotaInfo& quota, RGWBucketStats& cached_stats)
+{
+  if (quota.max_size_kb >= 0) {
+    int64_t& size_threshold = quota.max_size_soft_threshold;
+    if (size_threshold < 0) {
+      size_threshold = quota.max_size_kb * store->ctx()->_conf->rgw_bucket_quota_soft_threshold;
+    }
+
+    const bool size_over = cached_stats.num_kb_rounded >= (uint64_t)size_threshold;
+    if (size_over) {
+      ldout(store->ctx(), 20) << "quota: can't use cached stats, exceeded soft threshold (size): "
+        << cached_stats.num_kb_rounded << " >= " << size_threshold << dendl;
+      return false;
+    }
+  }
+
+  if (quota.max_objects >= 0) {
+    int64_t& objs_threshold = quota.max_objs_soft_threshold;
+    if (objs_threshold < 0) {
+      objs_threshold = quota.max_objects * store->ctx()->_conf->rgw_bucket_quota_soft_threshold;
+    }
+
+    const bool objs_over = cached_stats.num_objects >= (uint64_t)objs_threshold;
+    if (objs_over) {
+      ldout(store->ctx(), 20) << "quota: can't use cached stats, exceeded soft threshold (num objs): "
+        << cached_stats.num_objects << " >= " << objs_threshold << dendl;
+      return false;
+    }
+  }
+
+  return true;
+}
+
+/*
+ * Synchronously read the bucket's per-category stats from the store and sum
+ * them into `stats`.
+ *
+ * Returns 0 on success. On failure the store's negative error is returned and
+ * `stats` is left unmodified.
+ */
+int RGWBucketStatsCache::fetch_bucket_totals(rgw_bucket& bucket, RGWBucketStats& stats)
+{
+  /* versions are required out-parameters of get_bucket_stats(); their values are not used here */
+  uint64_t bucket_ver;
+  uint64_t master_ver;
+
+  map<RGWObjCategory, RGWBucketStats> bucket_stats;
+  int r = store->get_bucket_stats(bucket, &bucket_ver, &master_ver, bucket_stats, NULL);
+  if (r < 0) {
+    ldout(store->ctx(), 0) << "could not get bucket info for bucket=" << bucket.name << dendl;
+    return r;
+  }
+
+  /* start from a default-constructed total before accumulating */
+  stats = RGWBucketStats();
+
+  map<RGWObjCategory, RGWBucketStats>::iterator iter;
+  for (iter = bucket_stats.begin(); iter != bucket_stats.end(); ++iter) {
+    RGWBucketStats& s = iter->second;
+    stats.num_kb += s.num_kb;
+    stats.num_kb_rounded += s.num_kb_rounded;
+    stats.num_objects += s.num_objects;
+  }
+
+  return 0;
+}
+
+/*
+ * Callback object for one background stats refresh of a single bucket.
+ * init_fetch() issues the async request; handle_response() sums the returned
+ * per-category stats and hands them to the owning cache.
+ */
+class AsyncRefreshHandler : public RGWGetBucketStats_CB {
+ RGWRados *store;
+ RGWBucketStatsCache *cache;
+public:
+ AsyncRefreshHandler(RGWRados *_store, RGWBucketStatsCache *_cache, rgw_bucket& _bucket) : RGWGetBucketStats_CB(_bucket), store(_store), cache(_cache) {}
+
+ /* start the async stats request; returns 0 or the negative error from the store */
+ int init_fetch();
+
+ /* completion callback; r < 0 means the request failed and nothing is cached */
+ void handle_response(int r);
+};
+
+
+/*
+ * Issue the asynchronous bucket stats request with this object as callback.
+ *
+ * Returns 0 when the request was queued, otherwise the negative error from
+ * RGWRados::get_bucket_stats_async().
+ */
+int AsyncRefreshHandler::init_fetch()
+{
+  ldout(store->ctx(), 20) << "initiating async quota refresh for bucket=" << bucket << dendl;
+  int r = store->get_bucket_stats_async(bucket, this);
+  if (r < 0) {
+    ldout(store->ctx(), 0) << "could not get bucket info for bucket=" << bucket.name << dendl;
+
+    /* get_bucket_stats_async() dropped our reference already */
+    return r;
+  }
+
+  return 0;
+}
+
+/*
+ * Completion callback of the async stats request. On failure (r < 0) only a
+ * log line is written; otherwise the per-category stats are summed and passed
+ * to the cache.
+ */
+void AsyncRefreshHandler::handle_response(int r)
+{
+  if (r < 0) {
+    ldout(store->ctx(), 20) << "AsyncRefreshHandler::handle_response() r=" << r << dendl;
+    return; /* nothing to do here */
+  }
+
+  RGWBucketStats totals;
+
+  for (map<RGWObjCategory, RGWBucketStats>::iterator cat = stats->begin();
+       cat != stats->end();
+       ++cat) {
+    RGWBucketStats& per_category = cat->second;
+    totals.num_kb += per_category.num_kb;
+    totals.num_kb_rounded += per_category.num_kb_rounded;
+    totals.num_objects += per_category.num_objects;
+  }
+
+  cache->async_refresh_response(bucket, totals);
+}
+
+/*
+ * lru_map update context used by RGWBucketStatsCache::async_refresh() to claim
+ * the right to refresh an entry: update() reports false when the entry's
+ * async_refresh_time is already 0 and otherwise resets it to 0 and reports
+ * true, so that only one caller starts a refresh.
+ *
+ * The class carries no state; the unused, never-initialized delta members that
+ * were copied from RGWBucketStatsUpdate have been dropped.
+ */
+class RGWBucketStatsAsyncTestSet : public lru_map<rgw_bucket, RGWQuotaBucketStats>::UpdateContext {
+public:
+  RGWBucketStatsAsyncTestSet() {}
+  bool update(RGWQuotaBucketStats *entry) {
+    if (entry->async_refresh_time.sec() == 0)
+      return false;
+
+    entry->async_refresh_time = utime_t(0, 0);
+
+    return true;
+  }
+};
+
+/*
+ * Start a background refresh of the cached stats for `bucket`.
+ * Returns 0 when a refresh was started or when another caller already claimed
+ * it, otherwise the negative error from AsyncRefreshHandler::init_fetch().
+ * The `qs` argument is not used.
+ */
+int RGWBucketStatsCache::async_refresh(rgw_bucket& bucket, RGWQuotaBucketStats& qs)
+{
+ /* protect against multiple updates */
+ RGWBucketStatsAsyncTestSet test_update;
+ if (!stats_map.find_and_update(bucket, NULL, &test_update)) {
+ /* most likely we just raced with another update */
+ return 0;
+ }
+
+ /* accounted for by the put() in async_refresh_response(), or below on failure */
+ async_refcount->get();
+
+ AsyncRefreshHandler *handler = new AsyncRefreshHandler(store, this, bucket);
+
+ int ret = handler->init_fetch();
+ if (ret < 0) {
+ async_refcount->put();
+ /*
+ * NOTE(review): init_fetch()'s failure path states that
+ * get_bucket_stats_async() already dropped a reference on the handler;
+ * confirm that this second put() cannot release it twice.
+ */
+ handler->put();
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * Called when a background refresh delivered new totals: cache them for
+ * `bucket` and release the in-flight count taken in async_refresh().
+ */
+void RGWBucketStatsCache::async_refresh_response(rgw_bucket& bucket, RGWBucketStats& stats)
+{
+  ldout(store->ctx(), 20) << "async stats refresh response for bucket=" << bucket << dendl;
+
+  /* start from the existing entry when there is one; the lookup result itself is not needed */
+  RGWQuotaBucketStats entry;
+  stats_map.find(bucket, entry);
+
+  set_stats(bucket, entry, stats);
+
+  async_refcount->put();
+}
+
+/*
+ * Store `stats` in `qs`, stamp it with fresh deadlines (expiry after
+ * rgw_bucket_quota_ttl, async refresh after half of that) and add it to the
+ * cache under `bucket`.
+ */
+void RGWBucketStatsCache::set_stats(rgw_bucket& bucket, RGWQuotaBucketStats& qs, RGWBucketStats& stats)
+{
+  qs.stats = stats;
+
+  const utime_t now = ceph_clock_now(store->ctx());
+
+  qs.expiration = now;
+  qs.expiration += store->ctx()->_conf->rgw_bucket_quota_ttl;
+
+  qs.async_refresh_time = now;
+  qs.async_refresh_time += store->ctx()->_conf->rgw_bucket_quota_ttl / 2;
+
+  stats_map.add(bucket, qs);
+}
+
+/*
+ * Return usage totals for `bucket` in `stats`.
+ *
+ * A cached entry is used when it has not expired and the totals are still
+ * below the quota's soft thresholds; an entry past its async refresh time
+ * additionally triggers a background refresh. Otherwise the totals are fetched
+ * synchronously and cached. A fetch result of -ENOENT is not treated as an
+ * error. Returns 0 on success or the negative fetch error.
+ */
+int RGWBucketStatsCache::get_bucket_stats(rgw_bucket& bucket, RGWBucketStats& stats, RGWQuotaInfo& quota) {
+  RGWQuotaBucketStats qs;
+  utime_t now = ceph_clock_now(store->ctx());
+
+  const bool have_cached = stats_map.find(bucket, qs);
+  if (have_cached) {
+    const bool refresh_due = qs.async_refresh_time.sec() > 0 && now >= qs.async_refresh_time;
+    if (refresh_due) {
+      int r = async_refresh(bucket, qs);
+      if (r < 0) {
+        ldout(store->ctx(), 0) << "ERROR: quota async refresh returned ret=" << r << dendl;
+
+        /* continue processing, might be a transient error, async refresh is just optimization */
+      }
+    }
+
+    if (can_use_cached_stats(quota, qs.stats) && qs.expiration > ceph_clock_now(store->ctx())) {
+      stats = qs.stats;
+      return 0;
+    }
+  }
+
+  /* cache miss, expired entry or soft threshold reached: read the real totals */
+  int ret = fetch_bucket_totals(bucket, stats);
+  const bool fatal = (ret < 0 && ret != -ENOENT);
+  if (fatal) {
+    return ret;
+  }
+
+  set_stats(bucket, qs, stats);
+
+  return 0;
+}
+
+
+/*
+ * lru_map update context that applies the effect of one object write or
+ * removal to a cached stats entry.
+ *
+ * objs_delta:    change in object count (may be negative).
+ * added_bytes:   size of the object after the operation.
+ * removed_bytes: size of the object before the operation.
+ */
+class RGWBucketStatsUpdate : public lru_map<rgw_bucket, RGWQuotaBucketStats>::UpdateContext {
+  int objs_delta;
+  uint64_t added_bytes;
+  uint64_t removed_bytes;
+public:
+  RGWBucketStatsUpdate(int _objs_delta, uint64_t _added_bytes, uint64_t _removed_bytes) :
+    objs_delta(_objs_delta), added_bytes(_added_bytes), removed_bytes(_removed_bytes) {}
+  bool update(RGWQuotaBucketStats *entry) {
+    uint64_t rounded_kb_added = rgw_rounded_kb(added_bytes);
+    uint64_t rounded_kb_removed = rgw_rounded_kb(removed_bytes);
+
+    entry->stats.num_kb_rounded += (rounded_kb_added - rounded_kb_removed);
+
+    /*
+     * The byte difference has to be made signed BEFORE dividing: with unsigned
+     * operands, removed_bytes > added_bytes wraps to a huge value and the
+     * division by 1024 then yields a large positive number instead of a
+     * negative delta, which would inflate num_kb.
+     */
+    int64_t kb_delta = ((int64_t)added_bytes - (int64_t)removed_bytes) / 1024;
+    entry->stats.num_kb += kb_delta;
+    entry->stats.num_objects += objs_delta;
+
+    return true;
+  }
+};
+
+
+/*
+ * Apply an object-count and byte delta to the cached entry for `bucket` via
+ * stats_map.find_and_update(); the lookup's result is not checked.
+ */
+void RGWBucketStatsCache::adjust_bucket_stats(rgw_bucket& bucket, int objs_delta, uint64_t added_bytes, uint64_t removed_bytes)
+{
+  RGWBucketStatsUpdate delta_ctx(objs_delta, added_bytes, removed_bytes);
+
+  stats_map.find_and_update(bucket, NULL, &delta_ctx);
+}
+
+
+/*
+ * RGWQuotaHandler implementation that enforces bucket quotas against totals
+ * served by an RGWBucketStatsCache.
+ */
+class RGWQuotaHandlerImpl : public RGWQuotaHandler {
+  RGWRados *store;
+  RGWBucketStatsCache stats_cache;
+public:
+  RGWQuotaHandlerImpl(RGWRados *_store) : store(_store), stats_cache(_store) {}
+
+  /*
+   * Check whether adding num_objs objects totalling `size` bytes would exceed
+   * bucket_quota. Returns 0 when the quota is disabled or not exceeded,
+   * -ERR_QUOTA_EXCEEDED when a limit would be passed, or the negative error
+   * from reading the bucket stats.
+   */
+  virtual int check_quota(rgw_bucket& bucket, RGWQuotaInfo& bucket_quota,
+                          uint64_t num_objs, uint64_t size) {
+    if (!bucket_quota.enabled) {
+      return 0;
+    }
+
+    const uint64_t size_kb = rgw_rounded_kb(size);
+
+    RGWBucketStats stats;
+    int ret = stats_cache.get_bucket_stats(bucket, stats, bucket_quota);
+    if (ret < 0) {
+      return ret;
+    }
+
+    ldout(store->ctx(), 20) << "bucket quota: max_objects=" << bucket_quota.max_objects
+                            << " max_size_kb=" << bucket_quota.max_size_kb << dendl;
+
+    /* a negative limit means that limit is not set */
+    const bool objs_limited = bucket_quota.max_objects >= 0;
+    if (objs_limited &&
+        stats.num_objects + num_objs > (uint64_t)bucket_quota.max_objects) {
+      ldout(store->ctx(), 10) << "quota exceeded: stats.num_objects=" << stats.num_objects
+                              << " bucket_quota.max_objects=" << bucket_quota.max_objects << dendl;
+      return -ERR_QUOTA_EXCEEDED;
+    }
+
+    const bool size_limited = bucket_quota.max_size_kb >= 0;
+    if (size_limited &&
+        stats.num_kb_rounded + size_kb > (uint64_t)bucket_quota.max_size_kb) {
+      ldout(store->ctx(), 10) << "quota exceeded: stats.num_kb_rounded=" << stats.num_kb_rounded << " size_kb=" << size_kb
+                              << " bucket_quota.max_size_kb=" << bucket_quota.max_size_kb << dendl;
+      return -ERR_QUOTA_EXCEEDED;
+    }
+
+    return 0;
+  }
+
+  /* forward a usage change for `bucket` to the stats cache */
+  virtual void update_stats(rgw_bucket& bucket, int obj_delta, uint64_t added_bytes, uint64_t removed_bytes) {
+    stats_cache.adjust_bucket_stats(bucket, obj_delta, added_bytes, removed_bytes);
+  }
+};
+
+
+/* Factory: allocate the default quota handler for `store`; release it with free_handler(). */
+RGWQuotaHandler *RGWQuotaHandler::generate_handler(RGWRados *store)
+{
+  RGWQuotaHandler *created = new RGWQuotaHandlerImpl(store);
+  return created;
+}
+
+/* Destroy a handler obtained from generate_handler(); a plain delete, so NULL is accepted. */
+void RGWQuotaHandler::free_handler(RGWQuotaHandler *handler)
+{
+ delete handler;
+}
diff --git a/src/rgw/rgw_quota.h b/src/rgw/rgw_quota.h
new file mode 100644
index 00000000000..2f8f28e85a2
--- /dev/null
+++ b/src/rgw/rgw_quota.h
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RGW_QUOTA_H
+#define CEPH_RGW_QUOTA_H
+
+
+#include "include/utime.h"
+#include "include/atomic.h"
+#include "common/lru_map.h"
+
+class RGWRados;
+class JSONObj;
+
+/*
+ * Quota settings. The limits default to -1, and the code that consumes them
+ * (RGWBucketStatsCache::can_use_cached_stats, RGWQuotaHandlerImpl::check_quota)
+ * only applies a limit when it is >= 0.
+ */
+struct RGWQuotaInfo {
+ int64_t max_size_kb;
+ int64_t max_objects;
+ bool enabled;
+ /* soft thresholds are computed lazily by RGWBucketStatsCache::can_use_cached_stats(); -1 means not computed yet */
+ int64_t max_size_soft_threshold;
+ int64_t max_objs_soft_threshold;
+
+ RGWQuotaInfo() : max_size_kb(-1), max_objects(-1), enabled(false),
+ max_size_soft_threshold(-1), max_objs_soft_threshold(-1) {}
+
+ /* only max_size_kb, max_objects and enabled are serialized; the soft thresholds are not */
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ ::encode(max_size_kb, bl);
+ ::encode(max_objects, bl);
+ ::encode(enabled, bl);
+ ENCODE_FINISH(bl);
+ }
+ /* reads the fields in the same order encode() writes them */
+ void decode(bufferlist::iterator& bl) {
+ DECODE_START(1, bl);
+ ::decode(max_size_kb, bl);
+ ::decode(max_objects, bl);
+ ::decode(enabled, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+
+ void decode_json(JSONObj *obj);
+
+};
+WRITE_CLASS_ENCODER(RGWQuotaInfo)
+
+class rgw_bucket;
+
+/*
+ * Abstract interface for quota enforcement. Instances are created with
+ * generate_handler() and destroyed with free_handler().
+ */
+class RGWQuotaHandler {
+public:
+ RGWQuotaHandler() {}
+ virtual ~RGWQuotaHandler() {
+ }
+ /* check whether adding num_objs objects / size bytes to `bucket` fits within bucket_quota */
+ virtual int check_quota(rgw_bucket& bucket, RGWQuotaInfo& bucket_quota,
+ uint64_t num_objs, uint64_t size) = 0;
+
+ /* report a usage change for `bucket`: object-count delta plus bytes added and removed */
+ virtual void update_stats(rgw_bucket& bucket, int obj_delta, uint64_t added_bytes, uint64_t removed_bytes) = 0;
+
+ static RGWQuotaHandler *generate_handler(RGWRados *store);
+ static void free_handler(RGWQuotaHandler *handler);
+};
+
+#endif
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index 03cc1ebfdb3..9f0a900f3d3 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -357,16 +357,20 @@ int RGWZoneParams::store_info(CephContext *cct, RGWRados *store, RGWRegion& regi
}
void RGWRegionMap::encode(bufferlist& bl) const {
- ENCODE_START(1, 1, bl);
+ ENCODE_START(2, 1, bl);
::encode(regions, bl);
::encode(master_region, bl);
+ ::encode(bucket_quota, bl);
ENCODE_FINISH(bl);
}
void RGWRegionMap::decode(bufferlist::iterator& bl) {
- DECODE_START(1, bl);
+ DECODE_START(2, bl);
::decode(regions, bl);
::decode(master_region, bl);
+
+ if (struct_v >= 2)
+ ::decode(bucket_quota, bl);
DECODE_FINISH(bl);
regions_by_api.clear();
@@ -741,7 +745,7 @@ void RGWPutObjProcessor_Atomic::complete_parts()
prepare_next_part(obj_len);
}
-int RGWPutObjProcessor_Atomic::do_complete(string& etag, time_t *mtime, time_t set_mtime, map<string, bufferlist>& attrs)
+int RGWPutObjProcessor_Atomic::complete_writing_data()
{
if (!data_ofs && !immutable_head()) {
first_chunk.claim(pending_data_bl);
@@ -762,6 +766,18 @@ int RGWPutObjProcessor_Atomic::do_complete(string& etag, time_t *mtime, time_t s
}
complete_parts();
+ int r = drain_pending();
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+int RGWPutObjProcessor_Atomic::do_complete(string& etag, time_t *mtime, time_t set_mtime, map<string, bufferlist>& attrs) {
+ int r = complete_writing_data();
+ if (r < 0)
+ return r;
+
store->set_atomic(obj_ctx, head_obj);
RGWRados::PutObjMetaExtraParams extra_params;
@@ -772,9 +788,9 @@ int RGWPutObjProcessor_Atomic::do_complete(string& etag, time_t *mtime, time_t s
extra_params.mtime = mtime;
extra_params.set_mtime = set_mtime;
- int r = store->put_obj_meta(obj_ctx, head_obj, obj_len, attrs,
- RGW_OBJ_CATEGORY_MAIN, PUT_OBJ_CREATE,
- extra_params);
+ r = store->put_obj_meta(obj_ctx, head_obj, obj_len, attrs,
+ RGW_OBJ_CATEGORY_MAIN, PUT_OBJ_CREATE,
+ extra_params);
return r;
}
@@ -839,6 +855,7 @@ void RGWRados::finalize()
RGWRESTConn *conn = iter->second;
delete conn;
}
+ RGWQuotaHandler::free_handler(quota_handler);
}
/**
@@ -875,14 +892,6 @@ int RGWRados::init_complete()
{
int ret;
- if (need_watch_notify()) {
- ret = init_watch();
- if (ret < 0) {
- lderr(cct) << "ERROR: failed to initialize watch" << dendl;
- return ret;
- }
- }
-
ret = region.init(cct, this);
if (ret < 0)
return ret;
@@ -893,7 +902,9 @@ int RGWRados::init_complete()
ret = region_map.read(cct, this);
if (ret < 0) {
- ldout(cct, 0) << "WARNING: cannot read region map" << dendl;
+ if (ret != -ENOENT) {
+ ldout(cct, 0) << "WARNING: cannot read region map" << dendl;
+ }
ret = region_map.update(region);
if (ret < 0) {
ldout(cct, 0) << "ERROR: failed to update regionmap with local region info" << dendl;
@@ -920,6 +931,14 @@ int RGWRados::init_complete()
}
}
+ if (need_watch_notify()) {
+ ret = init_watch();
+ if (ret < 0) {
+ lderr(cct) << "ERROR: failed to initialize watch" << dendl;
+ return ret;
+ }
+ }
+
map<string, RGWZone>::iterator ziter;
for (ziter = region.zones.begin(); ziter != region.zones.end(); ++ziter) {
const string& name = ziter->first;
@@ -948,6 +967,8 @@ int RGWRados::init_complete()
if (use_gc_thread)
gc->start_processor();
+ quota_handler = RGWQuotaHandler::generate_handler(this);
+
return ret;
}
@@ -1109,6 +1130,8 @@ int RGWRados::init_watch()
return r;
}
+ watch_initialized = true;
+
return 0;
}
@@ -2168,8 +2191,8 @@ int RGWRados::create_pools(vector<string>& names, vector<int>& retcodes)
if (r < 0) {
ldout(cct, 0) << "WARNING: async pool_create returned " << r << dendl;
}
- c->release();
}
+ c->release();
retcodes.push_back(r);
}
return 0;
@@ -2326,6 +2349,11 @@ int RGWRados::put_obj_meta_impl(void *ctx, rgw_obj& obj, uint64_t size,
*mtime = set_mtime;
}
+ if (state) {
+ /* update quota cache */
+ quota_handler->update_stats(bucket, (state->exists ? 0 : 1), size, state->size);
+ }
+
return 0;
done_cancel:
@@ -2479,6 +2507,22 @@ static void set_copy_attrs(map<string, bufferlist>& src_attrs, map<string, buffe
}
}
+/*
+ * Scope guard for a get_obj handle: once set_handle() has been called, the
+ * destructor passes the handle to RGWRados::finish_get_obj(). Until then the
+ * destructor does nothing.
+ */
+class GetObjHandleDestructor {
+  RGWRados *store;
+  void **handle;
+
+public:
+  GetObjHandleDestructor(RGWRados *_store) : store(_store), handle(NULL) {}
+
+  ~GetObjHandleDestructor() {
+    if (!handle)
+      return;
+    store->finish_get_obj(handle);
+  }
+
+  /* arm the guard with the address of the handle to finish */
+  void set_handle(void **_h) { handle = _h; }
+};
+
/**
* Copy an object.
* dest_obj: the object to copy into
@@ -2533,6 +2577,7 @@ int RGWRados::copy_obj(void *ctx,
ldout(cct, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.object << " => " << dest_obj.bucket << ":" << dest_obj.object << dendl;
void *handle = NULL;
+ GetObjHandleDestructor handle_destructor(this);
map<string, bufferlist> src_attrs;
off_t ofs = 0;
@@ -2542,6 +2587,8 @@ int RGWRados::copy_obj(void *ctx,
mod_ptr, unmod_ptr, &lastmod, if_match, if_nomatch, &total_len, &obj_size, NULL, &handle, err);
if (ret < 0)
return ret;
+
+ handle_destructor.set_handle(&handle);
} else {
/* source is in a different region, copy it there */
@@ -2684,7 +2731,7 @@ set_err_state:
return 0;
} else if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
- return copy_obj_data(ctx, handle, end, dest_obj, src_obj, mtime, src_attrs, category, ptag, err);
+ return copy_obj_data(ctx, &handle, end, dest_obj, src_obj, mtime, src_attrs, category, ptag, err);
}
map<uint64_t, RGWObjManifestPart>::iterator miter = astate->manifest.objs.begin();
@@ -2789,7 +2836,7 @@ done_ret:
int RGWRados::copy_obj_data(void *ctx,
- void *handle, off_t end,
+ void **handle, off_t end,
rgw_obj& dest_obj,
rgw_obj& src_obj,
time_t *mtime,
@@ -2815,7 +2862,7 @@ int RGWRados::copy_obj_data(void *ctx,
do {
bufferlist bl;
- ret = get_obj(ctx, NULL, &handle, src_obj, bl, ofs, end);
+ ret = get_obj(ctx, NULL, handle, src_obj, bl, ofs, end);
if (ret < 0)
return ret;
@@ -2862,12 +2909,9 @@ int RGWRados::copy_obj_data(void *ctx,
if (mtime)
obj_stat(ctx, dest_obj, NULL, mtime, NULL, NULL, NULL, NULL);
- finish_get_obj(&handle);
-
return ret;
done_err:
delete_obj(ctx, shadow_obj);
- finish_get_obj(&handle);
return r;
}
@@ -3179,6 +3223,11 @@ int RGWRados::delete_obj_impl(void *ctx, rgw_obj& obj, RGWObjVersionTracker *obj
if (ret_not_existed)
return -ENOENT;
+ if (state) {
+ /* update quota cache */
+ quota_handler->update_stats(bucket, -1, 0, state->size);
+ }
+
return 0;
}
@@ -4566,6 +4615,38 @@ int RGWRados::get_bucket_stats(rgw_bucket& bucket, uint64_t *bucket_ver, uint64_
return 0;
}
+/*
+ * Adapter between the bucket index dir-header callback and a
+ * RGWGetBucketStats_CB: translates the raw header into per-category
+ * stats and forwards the result code to the wrapped callback.
+ */
+class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
+  RGWGetBucketStats_CB *cb;
+
+public:
+  RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb) : cb(_cb) {}
+
+  /*
+   * r:      result of the dir header read; negative means failure
+   * header: raw dir header, only consulted when r >= 0
+   */
+  void handle_response(int r, rgw_bucket_dir_header& header) {
+    /* local to this call; cb only gets a pointer to it, valid until we return */
+    map<RGWObjCategory, RGWBucketStats> translated;
+
+    const bool succeeded = (r >= 0);
+    if (succeeded) {
+      translate_raw_stats(header, translated);
+      cb->set_response(header.ver, header.master_ver, &translated, header.max_marker);
+    }
+
+    /* the callback is always notified, success or not, and then put() */
+    cb->handle_response(r);
+    cb->put();
+  }
+};
+
+/*
+ * Start an asynchronous read of the bucket's stats.
+ * bucket: bucket whose index header is read
+ * ctx:    callback wrapped in a RGWGetBucketStatsContext; on failure to
+ *         issue the request ctx->put() is called here
+ * Returns: 0 if the request was issued, the negative error otherwise.
+ */
+int RGWRados::get_bucket_stats_async(rgw_bucket& bucket, RGWGetBucketStats_CB *ctx)
+{
+  RGWGetBucketStatsContext *header_cb = new RGWGetBucketStatsContext(ctx);
+  int ret = cls_bucket_head_async(bucket, header_cb);
+  if (ret >= 0)
+    return 0;
+
+  /* the request was not issued: release the callback and the adapter ourselves */
+  ctx->put();
+  delete header_cb;
+  return ret;
+}
+
void RGWRados::get_bucket_instance_entry(rgw_bucket& bucket, string& entry)
{
entry = bucket.name + ":" + bucket.bucket_id;
@@ -4903,6 +4984,14 @@ int RGWRados::append_async(rgw_obj& obj, size_t size, bufferlist& bl)
int RGWRados::distribute(const string& key, bufferlist& bl)
{
+ /*
+ * we were called before watch was initialized. This can only happen if we're updating some system
+ * config object (e.g., zone info) during init. Don't try to distribute the cache info for these
+ * objects, they're currently only read on startup anyway.
+ */
+ if (!watch_initialized)
+ return 0;
+
string notify_oid;
pick_control_oid(key, notify_oid);
@@ -5440,6 +5529,25 @@ int RGWRados::cls_bucket_head(rgw_bucket& bucket, struct rgw_bucket_dir_header&
return 0;
}
+/*
+ * Asynchronous variant of cls_bucket_head(): opens the bucket index and
+ * issues a dir header read that reports to ctx.
+ * Returns: 0 on success, negative error if the index could not be opened
+ * or the request could not be issued.
+ */
+int RGWRados::cls_bucket_head_async(rgw_bucket& bucket, RGWGetDirHeader_CB *ctx)
+{
+  librados::IoCtx index_ctx;
+  string oid;
+
+  int ret = open_bucket_index(bucket, index_ctx, oid);
+  if (ret < 0)
+    return ret;
+
+  ret = cls_rgw_get_dir_header_async(index_ctx, oid, ctx);
+  /* non-negative results are normalised to 0 */
+  return (ret < 0) ? ret : 0;
+}
+
+/*
+ * Ask the quota handler whether one more object of obj_size bytes may be
+ * added to the bucket under quota_info. The handler's result is returned
+ * unchanged.
+ */
+int RGWRados::check_quota(rgw_bucket& bucket, RGWQuotaInfo& quota_info, uint64_t obj_size)
+{
+  /* the object count passed to the handler is always 1 */
+  const int ret = quota_handler->check_quota(bucket, quota_info, 1, obj_size);
+  return ret;
+}
class IntentLogNameFilter : public RGWAccessListFilter
{
diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h
index e6ab244afa9..52b898123d4 100644
--- a/src/rgw/rgw_rados.h
+++ b/src/rgw/rgw_rados.h
@@ -238,11 +238,11 @@ class RGWPutObjProcessor_Aio : public RGWPutObjProcessor
struct put_obj_aio_info pop_pending();
int wait_pending_front();
bool pending_has_completed();
- int drain_pending();
protected:
uint64_t obj_len;
+ int drain_pending();
int handle_obj_data(rgw_obj& obj, bufferlist& bl, off_t ofs, off_t abs_ofs, void **phandle);
public:
@@ -284,6 +284,7 @@ protected:
void prepare_next_part(off_t ofs);
void complete_parts();
+ int complete_writing_data();
public:
~RGWPutObjProcessor_Atomic() {}
@@ -635,6 +636,8 @@ struct RGWRegionMap {
string master_region;
+ RGWQuotaInfo bucket_quota;
+
RGWRegionMap() : lock("RGWRegionMap") {}
void encode(bufferlist& bl) const;
@@ -758,6 +761,29 @@ public:
int renew_state();
};
+/*
+ * Completion callback for RGWRados::get_bucket_stats_async().
+ *
+ * On success set_response() is called first and then handle_response(r)
+ * with r >= 0; on failure only handle_response(r) with r < 0 is called.
+ * Subclasses implement handle_response() and read the protected fields.
+ */
+class RGWGetBucketStats_CB : public RefCountedObject {
+protected:
+  rgw_bucket bucket;
+  uint64_t bucket_ver;
+  uint64_t master_ver;
+  /*
+   * Borrowed pointer, NULL until set_response() is called. The async stats
+   * path passes the address of a map local to its own callback, so it must
+   * not be dereferenced after handle_response() returns.
+   */
+  map<RGWObjCategory, RGWBucketStats> *stats;
+  string max_marker;
+public:
+  /*
+   * The version fields start at 0 so that a handle_response() invoked on
+   * the error path (no set_response() call) never reads indeterminate values.
+   */
+  RGWGetBucketStats_CB(rgw_bucket& _bucket) : bucket(_bucket), bucket_ver(0), master_ver(0), stats(NULL) {}
+  virtual ~RGWGetBucketStats_CB() {}
+  /* r: result of the stats read; negative on failure */
+  virtual void handle_response(int r) = 0;
+  /* record the values of a successful read; _stats is stored, not copied */
+  virtual void set_response(uint64_t _bucket_ver, uint64_t _master_ver,
+                            map<RGWObjCategory, RGWBucketStats> *_stats,
+                            const string &_max_marker) {
+    bucket_ver = _bucket_ver;
+    master_ver = _master_ver;
+    stats = _stats;
+    max_marker = _max_marker;
+  }
+};
+
+class RGWGetDirHeader_CB;
+
class RGWRados
{
@@ -801,6 +827,7 @@ class RGWRados
uint64_t *watch_handles;
librados::IoCtx root_pool_ctx; // .rgw
librados::IoCtx control_pool_ctx; // .rgw.control
+ bool watch_initialized;
Mutex bucket_id_lock;
uint64_t max_bucket_id;
@@ -860,13 +887,17 @@ protected:
string region_name;
string zone_name;
+ RGWQuotaHandler *quota_handler;
+
public:
RGWRados() : lock("rados_timer_lock"), timer(NULL),
gc(NULL), use_gc_thread(false),
num_watchers(0), watchers(NULL), watch_handles(NULL),
+ watch_initialized(false),
bucket_id_lock("rados_bucket_id"), max_bucket_id(0),
cct(NULL), rados(NULL),
pools_initialized(false),
+ quota_handler(NULL),
rest_master_conn(NULL),
meta_mgr(NULL), data_log(NULL) {}
@@ -1127,7 +1158,7 @@ public:
void *progress_data);
int copy_obj_data(void *ctx,
- void *handle, off_t end,
+ void **handle, off_t end,
rgw_obj& dest_obj,
rgw_obj& src_obj,
time_t *mtime,
@@ -1287,6 +1318,7 @@ public:
int decode_policy(bufferlist& bl, ACLOwner *owner);
int get_bucket_stats(rgw_bucket& bucket, uint64_t *bucket_ver, uint64_t *master_ver, map<RGWObjCategory, RGWBucketStats>& stats,
string *max_marker);
+ int get_bucket_stats_async(rgw_bucket& bucket, RGWGetBucketStats_CB *cb);
void get_bucket_instance_obj(rgw_bucket& bucket, rgw_obj& obj);
void get_bucket_instance_entry(rgw_bucket& bucket, string& entry);
void get_bucket_meta_oid(rgw_bucket& bucket, string& oid);
@@ -1318,6 +1350,7 @@ public:
map<string, RGWObjEnt>& m, bool *is_truncated,
string *last_entry, bool (*force_check_filter)(const string& name) = NULL);
int cls_bucket_head(rgw_bucket& bucket, struct rgw_bucket_dir_header& header);
+ int cls_bucket_head_async(rgw_bucket& bucket, RGWGetDirHeader_CB *ctx);
int prepare_update_index(RGWObjState *state, rgw_bucket& bucket,
RGWModifyOp op, rgw_obj& oid, string& tag);
int complete_update_index(rgw_bucket& bucket, string& oid, string& tag, int64_t poolid, uint64_t epoch, uint64_t size,
@@ -1373,6 +1406,8 @@ public:
int bucket_rebuild_index(rgw_bucket& bucket);
int remove_objs_from_index(rgw_bucket& bucket, list<string>& oid_list);
+ int check_quota(rgw_bucket& bucket, RGWQuotaInfo& quota_info, uint64_t obj_size);
+
string unique_id(uint64_t unique_num) {
char buf[32];
snprintf(buf, sizeof(buf), ".%llu.%llu", (unsigned long long)instance_id(), (unsigned long long)unique_num);
diff --git a/src/rgw/rgw_replica_log.cc b/src/rgw/rgw_replica_log.cc
index 483d256377b..f80ebf88525 100644
--- a/src/rgw/rgw_replica_log.cc
+++ b/src/rgw/rgw_replica_log.cc
@@ -34,6 +34,15 @@ RGWReplicaLogger::RGWReplicaLogger(RGWRados *_store) :
int RGWReplicaLogger::open_ioctx(librados::IoCtx& ctx, const string& pool)
{
int r = store->rados->ioctx_create(pool.c_str(), ctx);
+ if (r == -ENOENT) {
+ rgw_bucket p(pool.c_str());
+ r = store->create_pool(p);
+ if (r < 0)
+ return r;
+
+ // retry
+ r = store->rados->ioctx_create(pool.c_str(), ctx);
+ }
if (r < 0) {
lderr(cct) << "ERROR: could not open rados pool " << pool << dendl;
}
diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc
index 571e4869642..4aa1d401211 100644
--- a/src/rgw/rgw_rest.cc
+++ b/src/rgw/rgw_rest.cc
@@ -377,6 +377,20 @@ void dump_access_control(struct req_state *s, const char *origin, const char *me
}
}
+/*
+ * Emit the CORS response headers for op, if it produces any.
+ * Nothing is written when op->generate_cors_headers() reports false.
+ */
+void dump_access_control(req_state *s, RGWOp *op)
+{
+  string origin, method, header, exp_header;
+  unsigned max_age = CORS_MAX_AGE_INVALID;
+
+  const bool have_cors = op->generate_cors_headers(origin, method, header, exp_header, &max_age);
+  if (have_cors) {
+    dump_access_control(s, origin.c_str(), method.c_str(), header.c_str(), exp_header.c_str(), max_age);
+  }
+}
+
void dump_start(struct req_state *s)
{
if (!s->content_started) {
@@ -386,10 +400,14 @@ void dump_start(struct req_state *s)
}
}
-void end_header(struct req_state *s, const char *content_type)
+void end_header(struct req_state *s, RGWOp *op, const char *content_type)
{
string ctype;
+ if (op) {
+ dump_access_control(s, op);
+ }
+
if (!content_type || s->err.is_err()) {
switch (s->format) {
case RGW_FORMAT_XML:
@@ -424,7 +442,7 @@ void end_header(struct req_state *s, const char *content_type)
rgw_flush_formatter_and_reset(s, s->formatter);
}
-void abort_early(struct req_state *s, int err_no)
+void abort_early(struct req_state *s, RGWOp *op, int err_no)
{
if (!s->formatter) {
s->formatter = new JSONFormatter;
@@ -432,7 +450,7 @@ void abort_early(struct req_state *s, int err_no)
}
set_req_state_err(s, err_no);
dump_errno(s);
- end_header(s);
+ end_header(s, op);
rgw_flush_formatter_and_reset(s, s->formatter);
perfcounter->inc(l_rgw_failed_req);
}
@@ -644,7 +662,7 @@ void RGWRESTFlusher::do_start(int ret)
set_req_state_err(s, ret); /* no going back from here */
dump_errno(s);
dump_start(s);
- end_header(s);
+ end_header(s, op);
rgw_flush_formatter_and_reset(s, s->formatter);
}
@@ -691,7 +709,7 @@ int RGWPutObj_ObjStore::get_data(bufferlist& bl)
int r = s->cio->read(bp.c_str(), cl, &read_len);
len = read_len;
if (r < 0)
- return ret;
+ return r;
bl.append(bp, 0, len);
}
@@ -927,7 +945,7 @@ void RGWRESTOp::send_response()
if (!flusher.did_start()) {
set_req_state_err(s, http_ret);
dump_errno(s);
- end_header(s);
+ end_header(s, this);
}
flusher.flush();
}
@@ -1062,7 +1080,7 @@ int RGWHandler_ObjStore::read_permissions(RGWOp *op_obj)
case OP_COPY: // op itself will read and verify the permissions
return 0;
case OP_OPTIONS:
- only_bucket = false;
+ only_bucket = true;
break;
default:
return -EINVAL;
diff --git a/src/rgw/rgw_rest.h b/src/rgw/rgw_rest.h
index b65efb3de3e..15ac863aa52 100644
--- a/src/rgw/rgw_rest.h
+++ b/src/rgw/rgw_rest.h
@@ -33,15 +33,17 @@ public:
class RGWRESTFlusher : public RGWFormatterFlusher {
struct req_state *s;
+ RGWOp *op;
protected:
virtual void do_flush();
virtual void do_start(int ret);
public:
- RGWRESTFlusher(struct req_state *_s) : RGWFormatterFlusher(_s->formatter), s(_s) {}
- RGWRESTFlusher() : RGWFormatterFlusher(NULL), s(NULL) {}
+ RGWRESTFlusher(struct req_state *_s, RGWOp *_op) : RGWFormatterFlusher(_s->formatter), s(_s), op(_op) {}
+ RGWRESTFlusher() : RGWFormatterFlusher(NULL), s(NULL), op(NULL) {}
- void init(struct req_state *_s) {
+ void init(struct req_state *_s, RGWOp *_op) {
s = _s;
+ op = _op;
set_formatter(s->formatter);
}
};
@@ -228,7 +230,7 @@ public:
RGWRESTOp() : http_ret(0) {}
virtual void init(RGWRados *store, struct req_state *s, RGWHandler *dialect_handler) {
RGWOp::init(store, s, dialect_handler);
- flusher.init(s);
+ flusher.init(s, this);
}
virtual void send_response();
virtual int check_caps(RGWUserCaps& caps) { return -EPERM; } /* should to be implemented! */
@@ -310,7 +312,7 @@ public:
extern void set_req_state_err(struct req_state *s, int err_no);
extern void dump_errno(struct req_state *s);
extern void dump_errno(struct req_state *s, int ret);
-extern void end_header(struct req_state *s, const char *content_type = NULL);
+extern void end_header(struct req_state *s, RGWOp *op = NULL, const char *content_type = NULL);
extern void dump_start(struct req_state *s);
extern void list_all_buckets_start(struct req_state *s);
extern void dump_owner(struct req_state *s, string& id, string& name, const char *section = NULL);
@@ -318,7 +320,7 @@ extern void dump_content_length(struct req_state *s, uint64_t len);
extern void dump_etag(struct req_state *s, const char *etag);
extern void dump_epoch_header(struct req_state *s, const char *name, time_t t);
extern void dump_last_modified(struct req_state *s, time_t t);
-extern void abort_early(struct req_state *s, int err);
+extern void abort_early(struct req_state *s, RGWOp *op, int err);
extern void dump_range(struct req_state *s, uint64_t ofs, uint64_t end, uint64_t total_size);
extern void dump_continue(struct req_state *s);
extern void list_all_buckets_end(struct req_state *s);
@@ -331,6 +333,7 @@ extern void dump_pair(struct req_state *s, const char *key, const char *value);
extern bool is_valid_url(const char *url);
extern void dump_access_control(struct req_state *s, const char *origin, const char *meth,
const char *hdr, const char *exp_hdr, uint32_t max_age);
+extern void dump_access_control(req_state *s, RGWOp *op);
#endif
diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc
index 8690dd8fdbe..83874dd42c4 100644
--- a/src/rgw/rgw_rest_s3.cc
+++ b/src/rgw/rgw_rest_s3.cc
@@ -166,7 +166,8 @@ done:
if (!content_type)
content_type = "binary/octet-stream";
- end_header(s, content_type);
+
+ end_header(s, this, content_type);
if (metadata_bl.length()) {
s->cio->write(metadata_bl.c_str(), metadata_bl.length());
@@ -189,7 +190,7 @@ void RGWListBuckets_ObjStore_S3::send_response_begin(bool has_buckets)
set_req_state_err(s, ret);
dump_errno(s);
dump_start(s);
- end_header(s, "application/xml");
+ end_header(s, NULL, "application/xml");
if (!ret) {
list_all_buckets_start(s);
@@ -242,7 +243,7 @@ void RGWListBucket_ObjStore_S3::send_response()
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s, "application/xml");
+ end_header(s, this, "application/xml");
dump_start(s);
if (ret < 0)
return;
@@ -288,7 +289,7 @@ void RGWListBucket_ObjStore_S3::send_response()
void RGWGetBucketLogging_ObjStore_S3::send_response()
{
dump_errno(s);
- end_header(s, "application/xml");
+ end_header(s, this, "application/xml");
dump_start(s);
s->formatter->open_object_section_in_ns("BucketLoggingStatus",
@@ -315,7 +316,7 @@ void RGWStatBucket_ObjStore_S3::send_response()
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s);
+ end_header(s, this);
dump_start(s);
}
@@ -467,7 +468,7 @@ void RGWDeleteBucket_ObjStore_S3::send_response()
set_req_state_err(s, r);
dump_errno(s);
- end_header(s);
+ end_header(s, this);
if (s->system_request) {
JSONFormatter f; /* use json formatter for system requests output */
@@ -521,7 +522,7 @@ void RGWPutObj_ObjStore_S3::send_response()
dump_epoch_header(s, "Rgwx-Mtime", mtime);
}
dump_errno(s);
- end_header(s);
+ end_header(s, this);
}
/*
@@ -893,6 +894,11 @@ int RGWPostObj_ObjStore_S3::get_params()
rebuild_key(s->object_str);
+ if (s->object_str.empty()) {
+ err_msg = "Empty object name";
+ return -EINVAL;
+ }
+
env.add_var("key", s->object_str);
part_str("Content-Type", &content_type);
@@ -1206,7 +1212,7 @@ done:
set_req_state_err(s, ret);
dump_errno(s);
dump_content_length(s, s->formatter->get_len());
- end_header(s);
+ end_header(s, this);
if (ret != STATUS_CREATED)
return;
@@ -1224,7 +1230,7 @@ void RGWDeleteObj_ObjStore_S3::send_response()
set_req_state_err(s, r);
dump_errno(s);
- end_header(s);
+ end_header(s, this);
}
int RGWCopyObj_ObjStore_S3::init_dest_policy()
@@ -1308,7 +1314,7 @@ void RGWCopyObj_ObjStore_S3::send_partial_response(off_t ofs)
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s, "binary/octet-stream");
+ end_header(s, this, "binary/octet-stream");
if (ret == 0) {
s->formatter->open_object_section("CopyObjectResult");
}
@@ -1347,7 +1353,7 @@ void RGWGetACLs_ObjStore_S3::send_response()
if (ret)
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s, "application/xml");
+ end_header(s, this, "application/xml");
dump_start(s);
s->cio->write(acls.c_str(), acls.size());
}
@@ -1376,7 +1382,7 @@ void RGWPutACLs_ObjStore_S3::send_response()
if (ret)
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s, "application/xml");
+ end_header(s, this, "application/xml");
dump_start(s);
}
@@ -1389,7 +1395,7 @@ void RGWGetCORS_ObjStore_S3::send_response()
set_req_state_err(s, ret);
}
dump_errno(s);
- end_header(s, "application/xml");
+ end_header(s, NULL, "application/xml");
dump_start(s);
if (!ret) {
string cors;
@@ -1464,7 +1470,7 @@ void RGWPutCORS_ObjStore_S3::send_response()
if (ret)
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s, "application/xml");
+ end_header(s, NULL, "application/xml");
dump_start(s);
}
@@ -1476,7 +1482,7 @@ void RGWDeleteCORS_ObjStore_S3::send_response()
set_req_state_err(s, r);
dump_errno(s);
- end_header(s);
+ end_header(s, NULL);
}
void RGWOptionsCORS_ObjStore_S3::send_response()
@@ -1485,22 +1491,20 @@ void RGWOptionsCORS_ObjStore_S3::send_response()
uint32_t max_age = CORS_MAX_AGE_INVALID;
/*EACCES means, there is no CORS registered yet for the bucket
*ENOENT means, there is no match of the Origin in the list of CORSRule
- *ENOTSUPP means, the HTTP_METHOD is not supported
*/
if (ret == -ENOENT)
ret = -EACCES;
- if (ret != -EACCES) {
- get_response_params(hdrs, exp_hdrs, &max_age);
- } else {
+ if (ret < 0) {
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s);
+ end_header(s, NULL);
return;
}
+ get_response_params(hdrs, exp_hdrs, &max_age);
dump_errno(s);
dump_access_control(s, origin, req_meth, hdrs.c_str(), exp_hdrs.c_str(), max_age);
- end_header(s);
+ end_header(s, NULL);
}
int RGWInitMultipart_ObjStore_S3::get_params()
@@ -1520,7 +1524,7 @@ void RGWInitMultipart_ObjStore_S3::send_response()
if (ret)
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s, "application/xml");
+ end_header(s, this, "application/xml");
if (ret == 0) {
dump_start(s);
s->formatter->open_object_section_in_ns("InitiateMultipartUploadResult",
@@ -1538,7 +1542,7 @@ void RGWCompleteMultipart_ObjStore_S3::send_response()
if (ret)
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s, "application/xml");
+ end_header(s, this, "application/xml");
if (ret == 0) {
dump_start(s);
s->formatter->open_object_section_in_ns("CompleteMultipartUploadResult",
@@ -1561,7 +1565,7 @@ void RGWAbortMultipart_ObjStore_S3::send_response()
set_req_state_err(s, r);
dump_errno(s);
- end_header(s);
+ end_header(s, this);
}
void RGWListMultipart_ObjStore_S3::send_response()
@@ -1569,7 +1573,7 @@ void RGWListMultipart_ObjStore_S3::send_response()
if (ret)
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s, "application/xml");
+ end_header(s, this, "application/xml");
if (ret == 0) {
dump_start(s);
@@ -1624,7 +1628,7 @@ void RGWListBucketMultiparts_ObjStore_S3::send_response()
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s, "application/xml");
+ end_header(s, this, "application/xml");
dump_start(s);
if (ret < 0)
return;
@@ -1695,7 +1699,7 @@ void RGWDeleteMultiObj_ObjStore_S3::begin_response()
}
dump_start(s);
- end_header(s, "application/xml");
+ end_header(s, this, "application/xml");
s->formatter->open_object_section_in_ns("DeleteResult",
"http://s3.amazonaws.com/doc/2006-03-01/");
@@ -2020,6 +2024,72 @@ int RGWHandler_ObjStore_S3::init(RGWRados *store, struct req_state *s, RGWClient
/*
+ * Try to validate S3 auth against keystone s3token interface
+ */
+int RGW_Auth_S3_Keystone_ValidateToken::validate_s3token(const string& auth_id, const string& auth_token, const string& auth_sign) {
+  /*
+   * auth_id:    access key presented by the client
+   * auth_token: string the client signed; sent base64-encoded
+   * auth_sign:  signature presented by the client
+   *
+   * Returns 0 when the request succeeded, the reply parsed and the user
+   * holds one of the roles in roles_list (the parsed reply is then left in
+   * `response`); -EINVAL when no keystone url is configured; -EPERM otherwise.
+   */
+
+  /* prepare keystone url */
+  string keystone_url = cct->_conf->rgw_keystone_url;
+  if (keystone_url.empty()) {
+    /* indexing size() - 1 below would be out of bounds on an empty string */
+    ldout(cct, 0) << "s3 keystone: rgw_keystone_url is not set" << dendl;
+    return -EINVAL;
+  }
+  if (keystone_url[keystone_url.size() - 1] != '/')
+    keystone_url.append("/");
+  keystone_url.append("v2.0/s3tokens");
+
+  /* set required headers for keystone request */
+  append_header("X-Auth-Token", cct->_conf->rgw_keystone_admin_token);
+  append_header("Content-Type", "application/json");
+
+  /* encode token */
+  bufferlist token_buff;
+  bufferlist token_encoded;
+  token_buff.append(auth_token);
+  token_buff.encode_base64(token_encoded);
+  /* terminate explicitly: c_str() is handed to dump_string() as a C string below */
+  token_encoded.append((char)0);
+
+  /* create json credentials request body */
+  JSONFormatter credentials(false);
+  credentials.open_object_section("");
+  credentials.open_object_section("credentials");
+  credentials.dump_string("access", auth_id);
+  credentials.dump_string("token", token_encoded.c_str());
+  credentials.dump_string("signature", auth_sign);
+  credentials.close_section();
+  credentials.close_section();
+
+  std::stringstream os;
+  credentials.flush(os);
+  set_tx_buffer(os.str());
+
+  /* send request */
+  int ret = process("POST", keystone_url.c_str());
+  if (ret < 0) {
+    /*
+     * Bound the logged reply by its length instead of streaming c_str() as a
+     * C string: nothing here terminates rx_buffer (NOTE(review): assumes
+     * bufferlist::c_str() does not add a terminator, as the explicit append
+     * above suggests).
+     */
+    ldout(cct, 2) << "s3 keystone: token validation ERROR: "
+                  << string(rx_buffer.c_str(), rx_buffer.length()) << dendl;
+    return -EPERM;
+  }
+
+  /* now parse response */
+  if (response.parse(cct, rx_buffer) < 0) {
+    ldout(cct, 2) << "s3 keystone: token parsing failed" << dendl;
+    return -EPERM;
+  }
+
+  /* check if we have a valid role */
+  bool found = false;
+  for (list<string>::iterator iter = roles_list.begin(); iter != roles_list.end(); ++iter) {
+    if (response.user.has_role(*iter)) {
+      found = true;
+      break;
+    }
+  }
+
+  if (!found) {
+    ldout(cct, 5) << "s3 keystone: user does not hold a matching role; required roles: " << cct->_conf->rgw_keystone_accepted_roles << dendl;
+    return -EPERM;
+  }
+
+  /* everything seems fine, continue with this user */
+  ldout(cct, 5) << "s3 keystone: validated token: " << response.token.tenant.name << ":" << response.user.name << " expires: " << response.token.expires << dendl;
+  return 0;
+}
+
+/*
* verify that a signed request comes from the keyholder
* by checking the signature against our locally-computed version
*/
@@ -2032,6 +2102,13 @@ int RGW_Auth_S3::authorize(RGWRados *store, struct req_state *s)
time_t now;
time(&now);
+ /* neither keystone nor rados enabled; warn and exit! */
+ if (!store->ctx()->_conf->rgw_s3_auth_use_rados
+ && !store->ctx()->_conf->rgw_s3_auth_use_keystone) {
+ dout(0) << "WARNING: no authorization backend enabled! Users will never authenticate." << dendl;
+ return -EPERM;
+ }
+
if (!s->http_auth || !(*s->http_auth)) {
auth_id = s->info.args.get("AWSAccessKeyId");
if (auth_id.size()) {
@@ -2061,75 +2138,113 @@ int RGW_Auth_S3::authorize(RGWRados *store, struct req_state *s)
auth_sign = auth_str.substr(pos + 1);
}
- /* first get the user info */
- if (rgw_get_user_info_by_access_key(store, auth_id, s->user) < 0) {
- dout(5) << "error reading user info, uid=" << auth_id << " can't authenticate" << dendl;
- return -EPERM;
- }
+ /* try keystone auth first */
+ int keystone_result = -EINVAL;
+ if (store->ctx()->_conf->rgw_s3_auth_use_keystone
+ && !store->ctx()->_conf->rgw_keystone_url.empty()) {
+ dout(20) << "s3 keystone: trying keystone auth" << dendl;
- /* now verify signature */
-
- string auth_hdr;
- if (!rgw_create_s3_canonical_header(s->info, &s->header_time, auth_hdr, qsr)) {
- dout(10) << "failed to create auth header\n" << auth_hdr << dendl;
- return -EPERM;
- }
- dout(10) << "auth_hdr:\n" << auth_hdr << dendl;
+ RGW_Auth_S3_Keystone_ValidateToken keystone_validator(store->ctx());
+ string token;
- time_t req_sec = s->header_time.sec();
- if ((req_sec < now - RGW_AUTH_GRACE_MINS * 60 ||
- req_sec > now + RGW_AUTH_GRACE_MINS * 60) && !qsr) {
- dout(10) << "req_sec=" << req_sec << " now=" << now << "; now - RGW_AUTH_GRACE_MINS=" << now - RGW_AUTH_GRACE_MINS * 60 << "; now + RGW_AUTH_GRACE_MINS=" << now + RGW_AUTH_GRACE_MINS * 60 << dendl;
- dout(0) << "NOTICE: request time skew too big now=" << utime_t(now, 0) << " req_time=" << s->header_time << dendl;
- return -ERR_REQUEST_TIME_SKEWED;
- }
+ if (!rgw_create_s3_canonical_header(s->info, &s->header_time, token, qsr)) {
+ dout(10) << "failed to create auth header\n" << token << dendl;
+ } else {
+ keystone_result = keystone_validator.validate_s3token(auth_id, token, auth_sign);
+ if (keystone_result == 0) {
+ s->user.user_id = keystone_validator.response.token.tenant.id;
+ s->user.display_name = keystone_validator.response.token.tenant.name; // wow.
+
+ /* try to store the user if it does not already exist */
+ if (rgw_get_user_info_by_uid(store, keystone_validator.response.token.tenant.id, s->user) < 0) {
+ int ret = rgw_store_user_info(store, s->user, NULL, NULL, 0, true);
+ if (ret < 0)
+ dout(10) << "NOTICE: failed to store new user's info: ret=" << ret << dendl;
+ }
- map<string, RGWAccessKey>::iterator iter = s->user.access_keys.find(auth_id);
- if (iter == s->user.access_keys.end()) {
- dout(0) << "ERROR: access key not encoded in user info" << dendl;
- return -EPERM;
+ s->perm_mask = RGW_PERM_FULL_CONTROL;
+ }
+ }
}
- RGWAccessKey& k = iter->second;
- if (!k.subuser.empty()) {
- map<string, RGWSubUser>::iterator uiter = s->user.subusers.find(k.subuser);
- if (uiter == s->user.subusers.end()) {
- dout(0) << "NOTICE: could not find subuser: " << k.subuser << dendl;
+ /* keystone failed (or not enabled); check if we want to use rados backend */
+ if (!store->ctx()->_conf->rgw_s3_auth_use_rados
+ && keystone_result < 0)
+ return keystone_result;
+
+ /* now try rados backend, but only if keystone did not succeed */
+ if (keystone_result < 0) {
+ /* get the user info */
+ if (rgw_get_user_info_by_access_key(store, auth_id, s->user) < 0) {
+ dout(5) << "error reading user info, uid=" << auth_id << " can't authenticate" << dendl;
return -EPERM;
}
- RGWSubUser& subuser = uiter->second;
- s->perm_mask = subuser.perm_mask;
- } else
- s->perm_mask = RGW_PERM_FULL_CONTROL;
- string digest;
- int ret = rgw_get_s3_header_digest(auth_hdr, k.key, digest);
- if (ret < 0) {
- return -EPERM;
- }
+ /* now verify signature */
- dout(15) << "calculated digest=" << digest << dendl;
- dout(15) << "auth_sign=" << auth_sign << dendl;
- dout(15) << "compare=" << auth_sign.compare(digest) << dendl;
+ string auth_hdr;
+ if (!rgw_create_s3_canonical_header(s->info, &s->header_time, auth_hdr, qsr)) {
+ dout(10) << "failed to create auth header\n" << auth_hdr << dendl;
+ return -EPERM;
+ }
+ dout(10) << "auth_hdr:\n" << auth_hdr << dendl;
+
+ time_t req_sec = s->header_time.sec();
+ if ((req_sec < now - RGW_AUTH_GRACE_MINS * 60 ||
+ req_sec > now + RGW_AUTH_GRACE_MINS * 60) && !qsr) {
+ dout(10) << "req_sec=" << req_sec << " now=" << now << "; now - RGW_AUTH_GRACE_MINS=" << now - RGW_AUTH_GRACE_MINS * 60 << "; now + RGW_AUTH_GRACE_MINS=" << now + RGW_AUTH_GRACE_MINS * 60 << dendl;
+ dout(0) << "NOTICE: request time skew too big now=" << utime_t(now, 0) << " req_time=" << s->header_time << dendl;
+ return -ERR_REQUEST_TIME_SKEWED;
+ }
- if (auth_sign != digest)
- return -EPERM;
+ map<string, RGWAccessKey>::iterator iter = s->user.access_keys.find(auth_id);
+ if (iter == s->user.access_keys.end()) {
+ dout(0) << "ERROR: access key not encoded in user info" << dendl;
+ return -EPERM;
+ }
+ RGWAccessKey& k = iter->second;
- if (s->user.system) {
- s->system_request = true;
- dout(20) << "system request" << dendl;
- s->info.args.set_system();
- string effective_uid = s->info.args.get(RGW_SYS_PARAM_PREFIX "uid");
- RGWUserInfo effective_user;
- if (!effective_uid.empty()) {
- ret = rgw_get_user_info_by_uid(store, effective_uid, effective_user);
- if (ret < 0) {
- ldout(s->cct, 0) << "User lookup failed!" << dendl;
- return -ENOENT;
+ if (!k.subuser.empty()) {
+ map<string, RGWSubUser>::iterator uiter = s->user.subusers.find(k.subuser);
+ if (uiter == s->user.subusers.end()) {
+ dout(0) << "NOTICE: could not find subuser: " << k.subuser << dendl;
+ return -EPERM;
}
- s->user = effective_user;
+ RGWSubUser& subuser = uiter->second;
+ s->perm_mask = subuser.perm_mask;
+ } else
+ s->perm_mask = RGW_PERM_FULL_CONTROL;
+
+ string digest;
+ int ret = rgw_get_s3_header_digest(auth_hdr, k.key, digest);
+ if (ret < 0) {
+ return -EPERM;
}
- }
+
+ dout(15) << "calculated digest=" << digest << dendl;
+ dout(15) << "auth_sign=" << auth_sign << dendl;
+ dout(15) << "compare=" << auth_sign.compare(digest) << dendl;
+
+ if (auth_sign != digest)
+ return -EPERM;
+
+ if (s->user.system) {
+ s->system_request = true;
+ dout(20) << "system request" << dendl;
+ s->info.args.set_system();
+ string effective_uid = s->info.args.get(RGW_SYS_PARAM_PREFIX "uid");
+ RGWUserInfo effective_user;
+ if (!effective_uid.empty()) {
+ ret = rgw_get_user_info_by_uid(store, effective_uid, effective_user);
+ if (ret < 0) {
+ ldout(s->cct, 0) << "User lookup failed!" << dendl;
+ return -ENOENT;
+ }
+ s->user = effective_user;
+ }
+ }
+
+ } /* if keystone_result < 0 */
// populate the owner info
s->owner.set_id(s->user.user_id);
diff --git a/src/rgw/rgw_rest_s3.h b/src/rgw/rgw_rest_s3.h
index b0d3c30384a..e62334b9585 100644
--- a/src/rgw/rgw_rest_s3.h
+++ b/src/rgw/rgw_rest_s3.h
@@ -6,6 +6,7 @@
#include "rgw_http_errors.h"
#include "rgw_acl_s3.h"
#include "rgw_policy_s3.h"
+#include "rgw_keystone.h"
#define RGW_AUTH_GRACE_MINS 15
@@ -258,6 +259,57 @@ public:
void end_response();
};
+/*
+ * HTTP client used to validate S3 credentials through keystone.
+ * The request body is staged in tx_buffer and handed out through
+ * send_data(); the reply body is accumulated in rx_buffer and the parsed
+ * result is exposed through `response`.
+ */
+class RGW_Auth_S3_Keystone_ValidateToken : public RGWHTTPClient {
+private:
+  bufferlist rx_buffer;                 /* reply body received so far */
+  bufferlist tx_buffer;                 /* request body to send */
+  bufferlist::iterator tx_buffer_it;    /* read position inside tx_buffer */
+  list<string> roles_list;              /* parsed rgw_keystone_accepted_roles */
+
+public:
+  KeystoneToken response;
+
+private:
+  /* replace the request body with d and rewind the send position */
+  void set_tx_buffer(const string& d) {
+    tx_buffer.clear();
+    tx_buffer.append(d);
+    tx_buffer_it = tx_buffer.begin();
+    set_send_length(tx_buffer.length());
+  }
+
+public:
+  RGW_Auth_S3_Keystone_ValidateToken(CephContext *_cct)
+      : RGWHTTPClient(_cct) {
+    get_str_list(cct->_conf->rgw_keystone_accepted_roles, roles_list);
+  }
+
+  /* reply headers are not inspected; always returns 0 */
+  int receive_header(void *ptr, size_t len) {
+    return 0;
+  }
+
+  /* append a chunk of the reply body to rx_buffer; always returns 0 */
+  int receive_data(void *ptr, size_t len) {
+    rx_buffer.append((char *)ptr, len);
+    return 0;
+  }
+
+  /*
+   * Copy up to len bytes of the pending request body into ptr.
+   * Returns the number of bytes copied, 0 once everything has been sent.
+   */
+  int send_data(void *ptr, size_t len) {
+    if (tx_buffer_it.get_remaining() == 0)
+      return 0; // nothing left to send
+
+    int chunk = MIN(tx_buffer_it.get_remaining(), len);
+    memcpy(ptr, tx_buffer_it.get_current_ptr().c_str(), chunk);
+    try {
+      tx_buffer_it.advance(chunk);
+    } catch (buffer::end_of_buffer &e) {
+      /* chunk never exceeds get_remaining(), so this is not expected */
+      assert(0);
+    }
+
+    return chunk;
+  }
+
+  int validate_s3token(const string& auth_id, const string& auth_token, const string& auth_sign);
+
+};
+
class RGW_Auth_S3 {
public:
static int authorize(RGWRados *store, struct req_state *s);
diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc
index b4f830830f9..651c4635d37 100644
--- a/src/rgw/rgw_rest_swift.cc
+++ b/src/rgw/rgw_rest_swift.cc
@@ -52,7 +52,7 @@ void RGWListBuckets_ObjStore_SWIFT::send_response_begin(bool has_buckets)
set_req_state_err(s, ret);
}
dump_errno(s);
- end_header(s);
+ end_header(s, NULL);
if (!ret) {
dump_start(s);
@@ -211,7 +211,7 @@ next:
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s);
+ end_header(s, this);
if (ret < 0) {
return;
}
@@ -266,7 +266,7 @@ void RGWStatAccount_ObjStore_SWIFT::send_response()
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s);
+ end_header(s, NULL);
dump_start(s);
}
@@ -280,7 +280,7 @@ void RGWStatBucket_ObjStore_SWIFT::send_response()
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s);
+ end_header(s, this);
dump_start(s);
}
@@ -301,7 +301,7 @@ void RGWCreateBucket_ObjStore_SWIFT::send_response()
ret = STATUS_ACCEPTED;
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s);
+ end_header(s, NULL);
rgw_flush_formatter_and_reset(s, s->formatter);
}
@@ -313,7 +313,7 @@ void RGWDeleteBucket_ObjStore_SWIFT::send_response()
set_req_state_err(s, r);
dump_errno(s);
- end_header(s);
+ end_header(s, this);
rgw_flush_formatter_and_reset(s, s->formatter);
}
@@ -361,7 +361,7 @@ void RGWPutObj_ObjStore_SWIFT::send_response()
dump_etag(s, etag.c_str());
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s);
+ end_header(s, this);
rgw_flush_formatter_and_reset(s, s->formatter);
}
@@ -421,7 +421,7 @@ void RGWPutMetadata_ObjStore_SWIFT::send_response()
ret = STATUS_ACCEPTED;
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s);
+ end_header(s, this);
rgw_flush_formatter_and_reset(s, s->formatter);
}
@@ -433,7 +433,7 @@ void RGWDeleteObj_ObjStore_SWIFT::send_response()
set_req_state_err(s, r);
dump_errno(s);
- end_header(s);
+ end_header(s, this);
rgw_flush_formatter_and_reset(s, s->formatter);
}
@@ -484,7 +484,7 @@ void RGWCopyObj_ObjStore_SWIFT::send_partial_response(off_t ofs)
ret = STATUS_CREATED;
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s);
+ end_header(s, this);
/* Send progress information. Note that this diverge from the original swift
* spec. We do this in order to keep connection alive.
@@ -506,7 +506,7 @@ void RGWCopyObj_ObjStore_SWIFT::send_response()
ret = STATUS_CREATED;
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s);
+ end_header(s, this);
} else {
s->formatter->close_section();
rgw_flush_formatter(s, s->formatter);
@@ -570,7 +570,7 @@ int RGWGetObj_ObjStore_SWIFT::send_response_data(bufferlist& bl, off_t bl_ofs, o
if (!content_type)
content_type = "binary/octet-stream";
- end_header(s, content_type);
+ end_header(s, this, content_type);
sent_header = true;
@@ -600,12 +600,12 @@ void RGWOptionsCORS_ObjStore_SWIFT::send_response()
} else {
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s);
+ end_header(s, NULL);
return;
}
dump_errno(s);
dump_access_control(s, origin, req_meth, hdrs.c_str(), exp_hdrs.c_str(), max_age);
- end_header(s);
+ end_header(s, NULL);
}
RGWOp *RGWHandler_ObjStore_Service_SWIFT::op_get()
diff --git a/src/rgw/rgw_swift.cc b/src/rgw/rgw_swift.cc
index b62033b2764..24e09051320 100644
--- a/src/rgw/rgw_swift.cc
+++ b/src/rgw/rgw_swift.cc
@@ -8,6 +8,7 @@
#include "rgw_swift_auth.h"
#include "rgw_user.h"
#include "rgw_http_client.h"
+#include "rgw_keystone.h"
#include "include/str_list.h"
@@ -18,8 +19,6 @@
static list<string> roles_list;
-class RGWKeystoneTokenCache;
-
class RGWValidateSwiftToken : public RGWHTTPClient {
struct rgw_swift_auth_info *info;
@@ -105,192 +104,7 @@ int RGWSwift::validate_token(const char *token, struct rgw_swift_auth_info *info
return 0;
}
-int KeystoneToken::parse(CephContext *cct, bufferlist& bl)
-{
- JSONParser parser;
-
- if (!parser.parse(bl.c_str(), bl.length())) {
- ldout(cct, 0) << "malformed json" << dendl;
- return -EINVAL;
- }
-
- JSONObjIter iter = parser.find_first("access");
- if (iter.end()) {
- ldout(cct, 0) << "token response is missing access section" << dendl;
- return -EINVAL;
- }
-
- JSONObj *access_obj = *iter;
- JSONObj *user = access_obj->find_obj("user");
- if (!user) {
- ldout(cct, 0) << "token response is missing user section" << dendl;
- return -EINVAL;
- }
-
- if (!user->get_data("username", &user_name)) {
- ldout(cct, 0) << "token response is missing user username field" << dendl;
- return -EINVAL;
- }
-
- JSONObj *roles_obj = user->find_obj("roles");
- if (!roles_obj) {
- ldout(cct, 0) << "token response is missing roles section, or section empty" << dendl;
- return -EINVAL;
- }
-
- JSONObjIter riter = roles_obj->find_first();
- if (riter.end()) {
- ldout(cct, 0) << "token response has an empty roles list" << dendl;
- return -EINVAL;
- }
- for (; !riter.end(); ++riter) {
- JSONObj *role_obj = *riter;
- if (!role_obj) {
- ldout(cct, 0) << "ERROR: role object is NULL" << dendl;
- return -EINVAL;
- }
-
- JSONObj *role_name = role_obj->find_obj("name");
- if (!role_name) {
- ldout(cct, 0) << "token response is missing role name section" << dendl;
- return -EINVAL;
- }
- string role = role_name->get_data();
- roles[role] = true;
- }
-
- JSONObj *token = access_obj->find_obj("token");
- if (!token) {
- ldout(cct, 0) << "missing token section in response" << dendl;
- return -EINVAL;
- }
-
- string expires;
-
- if (!token->get_data("expires", &expires)) {
- ldout(cct, 0) << "token response is missing expiration field" << dendl;
- return -EINVAL;
- }
-
- struct tm t;
- if (!parse_iso8601(expires.c_str(), &t)) {
- ldout(cct, 0) << "failed to parse token expiration (" << expires << ")" << dendl;
- return -EINVAL;
- }
-
- expiration = timegm(&t);
-
- JSONObj *tenant = token->find_obj("tenant");
- if (!tenant) {
- ldout(cct, 0) << "token response is missing tenant section" << dendl;
- return -EINVAL;
- }
-
- if (!tenant->get_data("id", &tenant_id)) {
- ldout(cct, 0) << "tenant is missing id field" << dendl;
- return -EINVAL;
- }
-
-
- if (!tenant->get_data("name", &tenant_name)) {
- ldout(cct, 0) << "tenant is missing name field" << dendl;
- return -EINVAL;
- }
-
- return 0;
-}
-
-struct token_entry {
- KeystoneToken token;
- list<string>::iterator lru_iter;
-};
-
-class RGWKeystoneTokenCache {
- CephContext *cct;
-
- map<string, token_entry> tokens;
- list<string> tokens_lru;
-
- Mutex lock;
-
- size_t max;
-
-public:
- RGWKeystoneTokenCache(CephContext *_cct, int _max) : cct(_cct), lock("RGWKeystoneTokenCache"), max(_max) {}
-
- bool find(const string& token_id, KeystoneToken& token);
- void add(const string& token_id, KeystoneToken& token);
- void invalidate(const string& token_id);
-};
-
-bool RGWKeystoneTokenCache::find(const string& token_id, KeystoneToken& token)
-{
- lock.Lock();
- map<string, token_entry>::iterator iter = tokens.find(token_id);
- if (iter == tokens.end()) {
- lock.Unlock();
- if (perfcounter) perfcounter->inc(l_rgw_keystone_token_cache_miss);
- return false;
- }
-
- token_entry& entry = iter->second;
- tokens_lru.erase(entry.lru_iter);
-
- if (entry.token.expired()) {
- tokens.erase(iter);
- lock.Unlock();
- if (perfcounter) perfcounter->inc(l_rgw_keystone_token_cache_hit);
- return false;
- }
- token = entry.token;
-
- tokens_lru.push_front(token_id);
- entry.lru_iter = tokens_lru.begin();
-
- lock.Unlock();
- if (perfcounter) perfcounter->inc(l_rgw_keystone_token_cache_hit);
-
- return true;
-}
-
-void RGWKeystoneTokenCache::add(const string& token_id, KeystoneToken& token)
-{
- lock.Lock();
- map<string, token_entry>::iterator iter = tokens.find(token_id);
- if (iter != tokens.end()) {
- token_entry& e = iter->second;
- tokens_lru.erase(e.lru_iter);
- }
-
- tokens_lru.push_front(token_id);
- token_entry& entry = tokens[token_id];
- entry.token = token;
- entry.lru_iter = tokens_lru.begin();
-
- while (tokens_lru.size() > max) {
- list<string>::reverse_iterator riter = tokens_lru.rbegin();
- iter = tokens.find(*riter);
- assert(iter != tokens.end());
- tokens.erase(iter);
- tokens_lru.pop_back();
- }
-
- lock.Unlock();
-}
-
-void RGWKeystoneTokenCache::invalidate(const string& token_id)
-{
- Mutex::Locker l(lock);
- map<string, token_entry>::iterator iter = tokens.find(token_id);
- if (iter == tokens.end())
- return;
-
- ldout(cct, 20) << "invalidating revoked token id=" << token_id << dendl;
- token_entry& e = iter->second;
- tokens_lru.erase(e.lru_iter);
- tokens.erase(iter);
-}
class RGWValidateKeystoneToken : public RGWHTTPClient {
bufferlist *bl;
@@ -489,8 +303,8 @@ int RGWSwift::check_revoked()
static void rgw_set_keystone_token_auth_info(KeystoneToken& token, struct rgw_swift_auth_info *info)
{
- info->user = token.tenant_id;
- info->display_name = token.tenant_name;
+ info->user = token.token.tenant.id;
+ info->display_name = token.token.tenant.name;
info->status = 200;
}
@@ -504,10 +318,8 @@ int RGWSwift::parse_keystone_token_response(const string& token, bufferlist& bl,
list<string>::iterator iter;
for (iter = roles_list.begin(); iter != roles_list.end(); ++iter) {
const string& role = *iter;
- if (t.roles.find(role) != t.roles.end()) {
- found = true;
+ if ((found=t.user.has_role(role))==true)
break;
- }
}
if (!found) {
@@ -515,7 +327,7 @@ int RGWSwift::parse_keystone_token_response(const string& token, bufferlist& bl,
return -EPERM;
}
- ldout(cct, 0) << "validated token: " << t.tenant_name << ":" << t.user_name << " expires: " << t.expiration << dendl;
+ ldout(cct, 0) << "validated token: " << t.token.tenant.name << ":" << t.user.name << " expires: " << t.token.expires << dendl;
rgw_set_keystone_token_auth_info(t, info);
@@ -592,7 +404,7 @@ int RGWSwift::validate_keystone_token(RGWRados *store, const string& token, stru
if (keystone_token_cache->find(token_id, t)) {
rgw_set_keystone_token_auth_info(t, info);
- ldout(cct, 20) << "cached token.tenant_id=" << t.tenant_id << dendl;
+ ldout(cct, 20) << "cached token.tenant.id=" << t.token.tenant.id << dendl;
int ret = update_user_info(store, info, rgw_user);
if (ret < 0)
diff --git a/src/rgw/rgw_swift.h b/src/rgw/rgw_swift.h
index febc2675c27..3f0bd161946 100644
--- a/src/rgw/rgw_swift.h
+++ b/src/rgw/rgw_swift.h
@@ -6,6 +6,7 @@
#include "common/Cond.h"
class RGWRados;
+class KeystoneToken;
struct rgw_swift_auth_info {
int status;
@@ -17,25 +18,6 @@ struct rgw_swift_auth_info {
rgw_swift_auth_info() : status(0), ttl(0) {}
};
-class KeystoneToken {
-public:
- string tenant_name;
- string tenant_id;
- string user_name;
- time_t expiration;
-
- map<string, bool> roles;
-
- KeystoneToken() : expiration(0) {}
-
- int parse(CephContext *cct, bufferlist& bl);
-
- bool expired() {
- uint64_t now = ceph_clock_now(NULL).sec();
- return (now < (uint64_t)expiration);
- }
-};
-
class RGWSwift {
CephContext *cct;
atomic_t down_flag;
diff --git a/src/rgw/rgw_user.cc b/src/rgw/rgw_user.cc
index 5e5b5c564bb..dc529e3d48d 100644
--- a/src/rgw/rgw_user.cc
+++ b/src/rgw/rgw_user.cc
@@ -1682,6 +1682,9 @@ int RGWUser::execute_add(RGWUserAdminOpState& op_state, std::string *err_msg)
if (op_state.op_mask_specified)
user_info.op_mask = op_state.get_op_mask();
+ if (op_state.has_bucket_quota())
+ user_info.bucket_quota = op_state.get_bucket_quota();
+
// update the request
op_state.set_user_info(user_info);
op_state.set_populated();
@@ -1884,6 +1887,9 @@ int RGWUser::execute_modify(RGWUserAdminOpState& op_state, std::string *err_msg)
if (op_state.op_mask_specified)
user_info.op_mask = op_state.get_op_mask();
+ if (op_state.has_bucket_quota())
+ user_info.bucket_quota = op_state.get_bucket_quota();
+
if (op_state.has_suspension_op()) {
__u8 suspended = op_state.get_suspension_status();
user_info.suspended = suspended;
diff --git a/src/rgw/rgw_user.h b/src/rgw/rgw_user.h
index 32bcf199001..e71b8f81778 100644
--- a/src/rgw/rgw_user.h
+++ b/src/rgw/rgw_user.h
@@ -172,6 +172,10 @@ struct RGWUserAdminOpState {
bool subuser_params_checked;
bool user_params_checked;
+ bool bucket_quota_specified;
+
+ RGWQuotaInfo bucket_quota;
+
void set_access_key(std::string& access_key) {
if (access_key.empty())
return;
@@ -285,6 +289,12 @@ struct RGWUserAdminOpState {
key_op = true;
}
+ void set_bucket_quota(RGWQuotaInfo& quota)
+ {
+ bucket_quota = quota;
+ bucket_quota_specified = true;
+ }
+
bool is_populated() { return populated; };
bool is_initialized() { return initialized; };
bool has_existing_user() { return existing_user; };
@@ -303,6 +313,7 @@ struct RGWUserAdminOpState {
bool will_purge_keys() { return purge_keys; };
bool will_purge_data() { return purge_data; };
bool will_generate_subuser() { return gen_subuser; };
+ bool has_bucket_quota() { return bucket_quota_specified; }
void set_populated() { populated = true; };
void clear_populated() { populated = false; };
void set_initialized() { initialized = true; };
@@ -317,6 +328,7 @@ struct RGWUserAdminOpState {
uint32_t get_subuser_perm() { return perm_mask; };
uint32_t get_max_buckets() { return max_buckets; };
uint32_t get_op_mask() { return op_mask; };
+ RGWQuotaInfo& get_bucket_quota() { return bucket_quota; }
std::string get_user_id() { return user_id; };
std::string get_subuser() { return subuser; };
@@ -403,6 +415,7 @@ struct RGWUserAdminOpState {
key_params_checked = false;
subuser_params_checked = false;
user_params_checked = false;
+ bucket_quota_specified = false;
}
};
diff --git a/src/script/perf-watch.py b/src/script/perf-watch.py
index 8c18c3ec766..826d4a499d7 100755
--- a/src/script/perf-watch.py
+++ b/src/script/perf-watch.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
import json
import argparse
diff --git a/src/test/Makefile.am b/src/test/Makefile.am
new file mode 100644
index 00000000000..59b4d89e930
--- /dev/null
+++ b/src/test/Makefile.am
@@ -0,0 +1,906 @@
+## Unknown/other tests
+
+ceph_test_timers_SOURCES = test/TestTimers.cc
+ceph_test_timers_LDADD = $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_test_timers
+
+ceph_test_signal_handlers_SOURCES = test/TestSignalHandlers.cc
+ceph_test_signal_handlers_LDADD = $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_test_signal_handlers
+
+ceph_test_rados_SOURCES = \
+ test/osd/TestRados.cc \
+ test/osd/TestOpStat.cc \
+ test/osd/Object.cc \
+ test/osd/RadosModel.cc
+ceph_test_rados_LDADD = $(LIBRADOS) $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_test_rados
+
+ceph_test_mutate_SOURCES = test/test_mutate.cc
+ceph_test_mutate_LDADD = $(LIBRADOS) $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_test_mutate
+
+ceph_test_rewrite_latency_SOURCES = test/test_rewrite_latency.cc
+ceph_test_rewrite_latency_LDADD = $(LIBCOMMON) $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS)
+bin_DEBUGPROGRAMS += ceph_test_rewrite_latency
+
+ceph_test_msgr_SOURCES = test/testmsgr.cc
+ceph_test_msgr_LDADD = $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_test_msgr
+
+ceph_streamtest_SOURCES = test/streamtest.cc
+ceph_streamtest_LDADD = $(LIBOS) $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_streamtest
+
+ceph_test_trans_SOURCES = test/test_trans.cc
+ceph_test_trans_LDADD = $(LIBOS) $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_test_trans
+
+ceph_test_crypto_SOURCES = test/testcrypto.cc
+ceph_test_crypto_LDADD = $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_test_crypto
+
+ceph_test_keys_SOURCES = test/testkeys.cc
+ceph_test_keys_LDADD = $(LIBMON) $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_test_keys
+
+
+## Dencoder test
+
+ceph_dencoder_SOURCES = \
+ test/encoding/ceph_dencoder.cc \
+ $(DENCODER_SOURCES)
+ceph_dencoder_LDADD = \
+ $(LIBOSD) $(LIBMDS) $(LIBMON) \
+ $(DENCODER_DEPS) $(CEPH_GLOBAL)
+
+# These should always use explicit _CFLAGS/_CXXFLAGS to avoid basename conflicts
+ceph_dencoder_CFLAGS = ${AM_CFLAGS}
+ceph_dencoder_CXXFLAGS = ${AM_CXXFLAGS}
+
+if COMPILER_HAS_VTA
+ceph_dencoder_CFLAGS += -fno-var-tracking-assignments
+ceph_dencoder_CXXFLAGS += -fno-var-tracking-assignments
+endif
+
+bin_PROGRAMS += ceph-dencoder
+
+get_command_descriptions_SOURCES = test/common/get_command_descriptions.cc
+get_command_descriptions_LDADD = $(LIBMON) $(LIBCOMMON) $(CEPH_GLOBAL)
+noinst_PROGRAMS += get_command_descriptions
+
+
+## Build tests
+# These should all use explicit _CXXFLAGS to avoid basename conflicts
+
+if WITH_BUILD_TESTS
+test_build_libcommon_SOURCES = \
+ test/buildtest_skeleton.cc \
+ $(libcommon_la_SOURCES)
+test_build_libcommon_LDADD = \
+ $(LIBCOMMON_DEPS) \
+ $(PTHREAD_LIBS) $(CRYPTO_LIBS) $(EXTRALIBS)
+test_build_libcommon_LDFLAGS = -static-libtool-libs
+test_build_libcommon_CFLAGS = $(AM_CFLAGS)
+test_build_libcommon_CXXFLAGS = $(AM_CXXFLAGS)
+bin_DEBUGPROGRAMS += test_build_libcommon
+
+test_build_librados_SOURCES = \
+ test/buildtest_skeleton.cc \
+ $(librados_la_SOURCES)
+test_build_librados_LDADD = \
+ $(LIBRADOS_DEPS) \
+ $(PTHREAD_LIBS) $(CRYPTO_LIBS) $(EXTRALIBS)
+test_build_librados_LDFLAGS = -static-libtool-libs
+test_build_librados_CFLAGS = $(AM_CFLAGS)
+test_build_librados_CXXFLAGS = $(AM_CXXFLAGS)
+bin_DEBUGPROGRAMS += test_build_librados
+
+test_build_librgw_SOURCES = \
+ test/buildtest_skeleton.cc \
+ $(librgw_la_SOURCES)
+test_build_librgw_LDADD = \
+ $(LIBRGW_DEPS) \
+ $(PTHREAD_LIBS) $(CRYPTO_LIBS) $(EXTRALIBS) \
+ $(CEPH_GLOBAL)
+test_build_librgw_LDFLAGS = -static-libtool-libs
+test_build_librgw_CFLAGS = $(AM_CFLAGS)
+test_build_librgw_CXXFLAGS = $(AM_CXXFLAGS)
+bin_DEBUGPROGRAMS += test_build_librgw
+
+# I don't get this one... testing the osdc build but linking in libcephfs?
+test_build_libcephfs_SOURCES = \
+ test/buildtest_skeleton.cc \
+ $(libosdc_la_SOURCES)
+test_build_libcephfs_LDADD = \
+ $(LIBCEPHFS) -lexpat \
+ $(PTHREAD_LIBS) $(CRYPTO_LIBS) $(EXTRALIBS)
+test_build_libcephfs_LDFLAGS = -static-libtool-libs
+test_build_libcephfs_CFLAGS = $(AM_CFLAGS)
+test_build_libcephfs_CXXFLAGS = $(AM_CXXFLAGS)
+bin_DEBUGPROGRAMS += test_build_libcephfs
+
+endif # WITH_BUILD_TESTS
+
+
+## Benchmarks
+
+ceph_smalliobench_SOURCES = \
+ test/bench/small_io_bench.cc \
+ test/bench/rados_backend.cc \
+ test/bench/detailed_stat_collector.cc \
+ test/bench/bencher.cc
+ceph_smalliobench_LDADD = $(LIBRADOS) -lboost_program_options $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_smalliobench
+
+ceph_smalliobenchfs_SOURCES = \
+ test/bench/small_io_bench_fs.cc \
+ test/bench/testfilestore_backend.cc \
+ test/bench/detailed_stat_collector.cc \
+ test/bench/bencher.cc
+ceph_smalliobenchfs_LDADD = $(LIBRADOS) -lboost_program_options $(LIBOS) $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_smalliobenchfs
+
+ceph_smalliobenchdumb_SOURCES = \
+ test/bench/small_io_bench_dumb.cc \
+ test/bench/dumb_backend.cc \
+ test/bench/detailed_stat_collector.cc \
+ test/bench/bencher.cc
+ceph_smalliobenchdumb_LDADD = $(LIBRADOS) -lboost_program_options $(LIBOS) $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_smalliobenchdumb
+
+ceph_smalliobenchrbd_SOURCES = \
+ test/bench/small_io_bench_rbd.cc \
+ test/bench/rbd_backend.cc \
+ test/bench/detailed_stat_collector.cc \
+ test/bench/bencher.cc
+ceph_smalliobenchrbd_LDADD = $(LIBRBD) $(LIBRADOS) -lboost_program_options $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_smalliobenchrbd
+
+ceph_tpbench_SOURCES = \
+ test/bench/tp_bench.cc \
+ test/bench/detailed_stat_collector.cc
+ceph_tpbench_LDADD = $(LIBRADOS) -lboost_program_options $(LIBOS) $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_tpbench
+
+ceph_omapbench_SOURCES = test/omap_bench.cc
+ceph_omapbench_LDADD = $(LIBRADOS) $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_omapbench
+
+ceph_kvstorebench_SOURCES = \
+ test/kv_store_bench.cc \
+ key_value_store/kv_flat_btree_async.cc
+ceph_kvstorebench_LDADD = $(LIBRADOS) $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_kvstorebench
+
+ceph_multi_stress_watch_SOURCES = \
+ test/multi_stress_watch.cc \
+ test/librados/test.cc
+ceph_multi_stress_watch_LDADD = $(LIBRADOS) $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_multi_stress_watch
+
+
+
+
+
+
+## System tests
+
+libsystest_la_SOURCES = \
+ test/system/cross_process_sem.cc \
+ test/system/systest_runnable.cc \
+ test/system/systest_settings.cc
+libsystest_la_LIBADD = $(CEPH_GLOBAL)
+noinst_LTLIBRARIES += libsystest.la
+
+ceph_test_rados_list_parallel_SOURCES = \
+ test/system/rados_list_parallel.cc \
+ test/system/st_rados_create_pool.cc \
+ test/system/st_rados_list_objects.cc
+ceph_test_rados_list_parallel_LDADD = $(LIBRADOS) libsystest.la $(PTHREAD_LIBS)
+bin_DEBUGPROGRAMS += ceph_test_rados_list_parallel
+
+ceph_test_rados_open_pools_parallel_SOURCES = \
+ test/system/rados_open_pools_parallel.cc \
+ test/system/st_rados_create_pool.cc
+ceph_test_rados_open_pools_parallel_LDADD = $(LIBRADOS) libsystest.la $(PTHREAD_LIBS)
+bin_DEBUGPROGRAMS += ceph_test_rados_open_pools_parallel
+
+ceph_test_rados_delete_pools_parallel_SOURCES = \
+ test/system/rados_delete_pools_parallel.cc \
+ test/system/st_rados_create_pool.cc \
+ test/system/st_rados_delete_pool.cc \
+ test/system/st_rados_list_objects.cc
+ceph_test_rados_delete_pools_parallel_LDADD = $(LIBRADOS) libsystest.la $(PTHREAD_LIBS)
+bin_DEBUGPROGRAMS += ceph_test_rados_delete_pools_parallel
+
+ceph_test_rados_watch_notify_SOURCES = \
+ test/system/rados_watch_notify.cc \
+ test/system/st_rados_create_pool.cc \
+ test/system/st_rados_delete_pool.cc \
+ test/system/st_rados_delete_objs.cc \
+ test/system/st_rados_watch.cc \
+ test/system/st_rados_notify.cc
+ceph_test_rados_watch_notify_LDADD = $(LIBRADOS) libsystest.la $(PTHREAD_LIBS)
+bin_DEBUGPROGRAMS += ceph_test_rados_watch_notify
+
+ceph_bench_log_SOURCES = test/bench_log.cc
+ceph_bench_log_LDADD = $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_bench_log
+
+
+
+## Unit tests
+
+check_SCRIPTS += \
+ unittest_bufferlist.sh \
+ test/encoding/check-generated.sh
+
+# target to build but not run the unit tests
+unittests:: $(check_PROGRAMS)
+
+UNITTEST_CXXFLAGS = \
+ $(AM_CXXFLAGS) \
+ -I$(top_srcdir)/src/gtest/include \
+ -I$(top_builddir)/src/gtest/include
+UNITTEST_LDADD = \
+ $(top_builddir)/src/gtest/lib/libgtest.a \
+ $(top_builddir)/src/gtest/lib/libgtest_main.a \
+ $(PTHREAD_LIBS)
+
+unittest_encoding_SOURCES = test/encoding.cc
+unittest_encoding_LDADD = $(LIBCEPHFS) $(LIBRADOS) -lm $(UNITTEST_LDADD)
+unittest_encoding_CXXFLAGS = $(UNITTEST_CXXFLAGS) -fno-strict-aliasing
+check_PROGRAMS += unittest_encoding
+
+unittest_addrs_SOURCES = test/test_addrs.cc
+unittest_addrs_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_addrs_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+check_PROGRAMS += unittest_addrs
+
+unittest_bloom_filter_SOURCES = test/common/test_bloom_filter.cc
+unittest_bloom_filter_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_bloom_filter_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+check_PROGRAMS += unittest_bloom_filter
+
+unittest_sharedptr_registry_SOURCES = test/common/test_sharedptr_registry.cc
+unittest_sharedptr_registry_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_sharedptr_registry_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+check_PROGRAMS += unittest_sharedptr_registry
+
+unittest_sloppy_crc_map_SOURCES = test/common/test_sloppy_crc_map.cc
+unittest_sloppy_crc_map_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_sloppy_crc_map_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+check_PROGRAMS += unittest_sloppy_crc_map
+
+unittest_util_SOURCES = test/common/test_util.cc
+unittest_util_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_util_LDADD = $(LIBCOMMON) -lm $(UNITTEST_LDADD) $(CRYPTO_LIBS) $(EXTRALIBS)
+check_PROGRAMS += unittest_util
+
+unittest_workqueue_SOURCES = test/test_workqueue.cc
+unittest_workqueue_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_workqueue_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+check_PROGRAMS += unittest_workqueue
+
+unittest_striper_SOURCES = test/test_striper.cc
+unittest_striper_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_striper_LDADD = $(LIBOSDC) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+check_PROGRAMS += unittest_striper
+
+unittest_prebufferedstreambuf_SOURCES = test/test_prebufferedstreambuf.cc
+unittest_prebufferedstreambuf_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_prebufferedstreambuf_LDADD = $(LIBCOMMON) $(UNITTEST_LDADD) $(EXTRALIBS)
+check_PROGRAMS += unittest_prebufferedstreambuf
+
+unittest_str_list_SOURCES = test/test_str_list.cc
+unittest_str_list_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_str_list_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+check_PROGRAMS += unittest_str_list
+
+unittest_log_SOURCES = log/test.cc
+unittest_log_LDADD = $(LIBCOMMON) $(UNITTEST_LDADD)
+unittest_log_CXXFLAGS = $(UNITTEST_CXXFLAGS) -O2
+check_PROGRAMS += unittest_log
+
+unittest_throttle_SOURCES = test/common/Throttle.cc
+unittest_throttle_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_throttle_CXXFLAGS = $(UNITTEST_CXXFLAGS) -O2
+check_PROGRAMS += unittest_throttle
+
+unittest_base64_SOURCES = test/base64.cc
+unittest_base64_LDADD = $(LIBCEPHFS) -lm $(UNITTEST_LDADD)
+unittest_base64_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_base64
+
+unittest_ceph_argparse_SOURCES = test/ceph_argparse.cc
+unittest_ceph_argparse_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_ceph_argparse_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_ceph_argparse
+
+unittest_ceph_compatset_SOURCES = test/ceph_compatset.cc
+unittest_ceph_compatset_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_ceph_compatset_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_ceph_compatset
+
+libec_example_la_SOURCES = test/osd/ErasureCodePluginExample.cc
+libec_example_la_CFLAGS = ${AM_CFLAGS}
+libec_example_la_CXXFLAGS= ${AM_CXXFLAGS}
+libec_example_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libec_example_la_LDFLAGS = ${AM_LDFLAGS} -export-symbols-regex '.*__erasure_code_.*'
+erasure_codelib_LTLIBRARIES += libec_example.la
+
+libec_missing_entry_point_la_SOURCES = test/osd/ErasureCodePluginMissingEntryPoint.cc
+libec_missing_entry_point_la_CFLAGS = ${AM_CFLAGS}
+libec_missing_entry_point_la_CXXFLAGS= ${AM_CXXFLAGS}
+libec_missing_entry_point_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libec_missing_entry_point_la_LDFLAGS = ${AM_LDFLAGS} -export-symbols-regex '.*__erasure_code_.*'
+erasure_codelib_LTLIBRARIES += libec_missing_entry_point.la
+
+libec_hangs_la_SOURCES = test/osd/ErasureCodePluginHangs.cc
+libec_hangs_la_CFLAGS = ${AM_CFLAGS}
+libec_hangs_la_CXXFLAGS= ${AM_CXXFLAGS}
+libec_hangs_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libec_hangs_la_LDFLAGS = ${AM_LDFLAGS} -export-symbols-regex '.*__erasure_code_.*'
+erasure_codelib_LTLIBRARIES += libec_hangs.la
+
+libec_fail_to_initialize_la_SOURCES = test/osd/ErasureCodePluginFailToInitialize.cc
+libec_fail_to_initialize_la_CFLAGS = ${AM_CFLAGS}
+libec_fail_to_initialize_la_CXXFLAGS= ${AM_CXXFLAGS}
+libec_fail_to_initialize_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libec_fail_to_initialize_la_LDFLAGS = ${AM_LDFLAGS} -export-symbols-regex '.*__erasure_code_.*'
+erasure_codelib_LTLIBRARIES += libec_fail_to_initialize.la
+
+libec_fail_to_register_la_SOURCES = test/osd/ErasureCodePluginFailToRegister.cc
+libec_fail_to_register_la_CFLAGS = ${AM_CFLAGS}
+libec_fail_to_register_la_CXXFLAGS= ${AM_CXXFLAGS}
+libec_fail_to_register_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libec_fail_to_register_la_LDFLAGS = ${AM_LDFLAGS} -export-symbols-regex '.*__erasure_code_.*'
+erasure_codelib_LTLIBRARIES += libec_fail_to_register.la
+
+unittest_erasure_code_plugin_SOURCES = test/osd/TestErasureCodePlugin.cc
+unittest_erasure_code_plugin_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_erasure_code_plugin_LDADD = $(LIBOSD) $(LIBCOMMON) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+if LINUX
+unittest_erasure_code_plugin_LDADD += -ldl
+endif
+check_PROGRAMS += unittest_erasure_code_plugin
+
+unittest_erasure_code_jerasure_SOURCES = \
+ test/osd/TestErasureCodeJerasure.cc \
+ $(libec_jerasure_la_SOURCES)
+unittest_erasure_code_jerasure_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_erasure_code_jerasure_LDADD = $(LIBOSD) $(LIBCOMMON) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+if LINUX
+unittest_erasure_code_jerasure_LDADD += -ldl
+endif
+check_PROGRAMS += unittest_erasure_code_jerasure
+
+# UNITTEST_CXXFLAGS already expands $(AM_CXXFLAGS); do not list it a second time.
+unittest_erasure_code_plugin_jerasure_SOURCES = test/osd/TestErasureCodePluginJerasure.cc
+unittest_erasure_code_plugin_jerasure_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_erasure_code_plugin_jerasure_LDADD = $(LIBOSD) $(LIBCOMMON) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+if LINUX
+unittest_erasure_code_plugin_jerasure_LDADD += -ldl
+endif
+check_PROGRAMS += unittest_erasure_code_plugin_jerasure
+
+unittest_erasure_code_example_SOURCES = test/osd/TestErasureCodeExample.cc
+noinst_HEADERS += test/osd/ErasureCodeExample.h
+unittest_erasure_code_example_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_erasure_code_example_LDADD = $(LIBOSD) $(LIBCOMMON) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+check_PROGRAMS += unittest_erasure_code_example
+
+unittest_osd_types_SOURCES = test/test_osd_types.cc
+unittest_osd_types_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_osd_types_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+check_PROGRAMS += unittest_osd_types
+
+unittest_pglog_SOURCES = test/osd/TestPGLog.cc
+unittest_pglog_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_pglog_LDADD = $(LIBOSD) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+check_PROGRAMS += unittest_pglog
+
+if LINUX
+unittest_pglog_LDADD += -ldl
+endif # LINUX
+
+unittest_gather_SOURCES = test/gather.cc
+unittest_gather_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_gather_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_gather
+
+unittest_run_cmd_SOURCES = test/run_cmd.cc
+unittest_run_cmd_LDADD = $(LIBCEPHFS) $(UNITTEST_LDADD)
+unittest_run_cmd_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_run_cmd
+
+unittest_signals_SOURCES = test/signals.cc
+unittest_signals_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_signals_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_signals
+
+unittest_simple_spin_SOURCES = test/simple_spin.cc
+unittest_simple_spin_LDADD = $(LIBCEPHFS) $(UNITTEST_LDADD)
+unittest_simple_spin_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_simple_spin
+
+unittest_librados_SOURCES = test/librados/librados.cc
+unittest_librados_LDADD = $(LIBRADOS) $(UNITTEST_LDADD)
+unittest_librados_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_librados
+
+unittest_bufferlist_SOURCES = test/bufferlist.cc
+unittest_bufferlist_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_bufferlist_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_bufferlist
+
+unittest_crc32c_SOURCES = test/common/test_crc32c.cc
+unittest_crc32c_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_crc32c_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_crc32c
+
+unittest_arch_SOURCES = test/test_arch.c
+unittest_arch_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_arch_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_arch
+
+unittest_crypto_SOURCES = test/crypto.cc
+unittest_crypto_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_crypto_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_crypto
+
+unittest_perf_counters_SOURCES = test/perf_counters.cc
+unittest_perf_counters_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_perf_counters_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_perf_counters
+
+unittest_admin_socket_SOURCES = test/admin_socket.cc
+unittest_admin_socket_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_admin_socket_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_admin_socket
+
+unittest_ceph_crypto_SOURCES = test/ceph_crypto.cc
+unittest_ceph_crypto_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_ceph_crypto_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_ceph_crypto
+
+unittest_utf8_SOURCES = test/utf8.cc
+unittest_utf8_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_utf8_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_utf8
+
+unittest_mime_SOURCES = test/mime.cc
+unittest_mime_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_mime_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_mime
+
+unittest_escape_SOURCES = test/escape.cc
+unittest_escape_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_escape_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_escape
+
+unittest_chain_xattr_SOURCES = test/filestore/chain_xattr.cc
+unittest_chain_xattr_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_chain_xattr_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_chain_xattr
+
+unittest_flatindex_SOURCES = test/os/TestFlatIndex.cc
+unittest_flatindex_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_flatindex_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_flatindex
+
+unittest_strtol_SOURCES = test/strtol.cc
+unittest_strtol_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_strtol_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_strtol
+
+unittest_confutils_SOURCES = test/confutils.cc
+unittest_confutils_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_confutils_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_confutils
+
+unittest_heartbeatmap_SOURCES = test/heartbeat_map.cc
+unittest_heartbeatmap_LDADD = $(LIBCOMMON) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_heartbeatmap_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_heartbeatmap
+
+# why does this include rgw/rgw_formats.cc...?
+unittest_formatter_SOURCES = \
+ test/formatter.cc \
+ rgw/rgw_formats.cc
+unittest_formatter_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_formatter_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_formatter
+
+unittest_libcephfs_config_SOURCES = test/libcephfs_config.cc
+unittest_libcephfs_config_LDADD = $(LIBCEPHFS) $(UNITTEST_LDADD)
+unittest_libcephfs_config_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_libcephfs_config
+
+unittest_lfnindex_SOURCES = test/os/TestLFNIndex.cc
+unittest_lfnindex_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_lfnindex_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_lfnindex
+
+unittest_librados_config_SOURCES = test/librados/librados_config.cc
+unittest_librados_config_LDADD = $(LIBRADOS) $(UNITTEST_LDADD)
+unittest_librados_config_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_librados_config
+
+#unittest_librgw_link_SOURCES = test/librgw_link.cc
+#unittest_librgw_link_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
+#unittest_librgw_link_LDADD = $(LIBRGW) ${UNITTEST_LDADD}
+#unittest_librgw_link_CXXFLAGS = ${CRYPTO_CFLAGS} ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
+#check_PROGRAMS += unittest_librgw_link
+
+unittest_daemon_config_SOURCES = test/daemon_config.cc
+unittest_daemon_config_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_daemon_config_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_daemon_config
+
+unittest_osd_osdcap_SOURCES = test/osd/osdcap.cc
+unittest_osd_osdcap_LDADD = $(LIBOSD) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_osd_osdcap_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_osd_osdcap
+
+unittest_mon_moncap_SOURCES = test/mon/moncap.cc
+unittest_mon_moncap_LDADD = $(LIBMON) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_mon_moncap_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_mon_moncap
+
+#if WITH_RADOSGW
+#unittest_librgw_SOURCES = test/librgw.cc
+#unittest_librgw_LDFLAGS = -lrt $(PTHREAD_CFLAGS) -lcurl ${AM_LDFLAGS}
+#unittest_librgw_LDADD = librgw.la $(LIBRADOS) ${UNITTEST_LDADD} -lexpat $(CEPH_GLOBAL)
+#unittest_librgw_CXXFLAGS = ${CRYPTO_CFLAGS} ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
+#check_PROGRAMS += unittest_librgw
+#endif # WITH_RADOSGW
+
+unittest_ipaddr_SOURCES = test/test_ipaddr.cc
+unittest_ipaddr_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_ipaddr_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_ipaddr
+
+unittest_texttable_SOURCES = test/test_texttable.cc
+unittest_texttable_LDADD = $(LIBCOMMON) $(UNITTEST_LDADD)
+unittest_texttable_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_texttable
+
+check_SCRIPTS += test/pybind/test_ceph_argparse.py
+
+if WITH_RADOSGW
+ceph_test_cors_SOURCES = test/test_cors.cc
+ceph_test_cors_LDADD = \
+ $(LIBRADOS) $(LIBRGW) $(CEPH_GLOBAL) \
+ $(UNITTEST_LDADD) \
+ -lcurl -luuid -lexpat
+ceph_test_cors_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_cors
+
+ceph_test_cls_rgw_meta_SOURCES = test/test_rgw_admin_meta.cc
+ceph_test_cls_rgw_meta_LDADD = \
+ $(LIBRADOS) $(LIBRGW) $(CEPH_GLOBAL) \
+ $(UNITTEST_LDADD) $(CRYPTO_LIBS) \
+ -lcurl -luuid -lexpat \
+ libcls_version_client.a libcls_log_client.a \
+ libcls_statelog_client.a libcls_refcount_client.la \
+ libcls_rgw_client.la libcls_lock_client.la
+ceph_test_cls_rgw_meta_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_cls_rgw_meta
+
+ceph_test_cls_rgw_log_SOURCES = test/test_rgw_admin_log.cc
+ceph_test_cls_rgw_log_LDADD = \
+ $(LIBRADOS) $(LIBRGW) $(CEPH_GLOBAL) \
+ $(UNITTEST_LDADD) $(CRYPTO_LIBS) \
+ -lcurl -luuid -lexpat \
+ libcls_version_client.a libcls_log_client.a \
+ libcls_statelog_client.a libcls_refcount_client.la \
+ libcls_rgw_client.la libcls_lock_client.la
+ceph_test_cls_rgw_log_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_cls_rgw_log
+
+ceph_test_cls_rgw_opstate_SOURCES = test/test_rgw_admin_opstate.cc
+ceph_test_cls_rgw_opstate_LDADD = \
+ $(LIBRADOS) $(LIBRGW) $(CEPH_GLOBAL) \
+ $(UNITTEST_LDADD) $(CRYPTO_LIBS) \
+ -lcurl -luuid -lexpat \
+ libcls_version_client.a libcls_log_client.a \
+ libcls_statelog_client.a libcls_refcount_client.la \
+ libcls_rgw_client.la libcls_lock_client.la
+ceph_test_cls_rgw_opstate_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_cls_rgw_opstate
+endif # WITH_RADOSGW
+
+ceph_test_librbd_SOURCES = \
+ test/librbd/test_librbd.cc \
+ test/librados/test.cc
+ceph_test_librbd_LDADD = $(LIBRBD) $(LIBRADOS) $(UNITTEST_LDADD)
+ceph_test_librbd_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_librbd
+
+ceph_test_librbd_fsx_SOURCES = test/librbd/fsx.c
+ceph_test_librbd_fsx_LDADD = $(LIBRBD) $(LIBRADOS) -lm
+ceph_test_librbd_fsx_CFLAGS = ${AM_CFLAGS} -Wno-format
+bin_DEBUGPROGRAMS += ceph_test_librbd_fsx
+
+ceph_test_cls_rbd_SOURCES = \
+ test/cls_rbd/test_cls_rbd.cc \
+ test/librados/test.cc
+ceph_test_cls_rbd_LDADD = $(LIBRADOS) libcls_rbd_client.la libcls_lock_client.la $(UNITTEST_LDADD)
+ceph_test_cls_rbd_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_cls_rbd
+
+ceph_test_cls_refcount_SOURCES = \
+ test/cls_refcount/test_cls_refcount.cc \
+ test/librados/test.cc
+ceph_test_cls_refcount_LDADD = $(LIBRADOS) libcls_refcount_client.la $(UNITTEST_LDADD)
+ceph_test_cls_refcount_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_cls_refcount
+
+ceph_test_cls_version_SOURCES = \
+ test/cls_version/test_cls_version.cc \
+ test/librados/test.cc
+ceph_test_cls_version_LDADD = $(LIBRADOS) libcls_version_client.a $(UNITTEST_LDADD)
+ceph_test_cls_version_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_cls_version
+
+ceph_test_cls_log_SOURCES = \
+ test/cls_log/test_cls_log.cc \
+ test/librados/test.cc
+ceph_test_cls_log_LDADD = $(LIBRADOS) libcls_log_client.a $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+ceph_test_cls_log_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_cls_log
+
+ceph_test_cls_statelog_SOURCES = \
+ test/cls_statelog/test_cls_statelog.cc \
+ test/librados/test.cc
+ceph_test_cls_statelog_LDADD = $(LIBRADOS) libcls_statelog_client.a $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+ceph_test_cls_statelog_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_cls_statelog
+
+ceph_test_cls_replica_log_SOURCES = \
+ test/cls_replica_log/test_cls_replica_log.cc \
+ test/librados/test.cc
+ceph_test_cls_replica_log_LDADD = \
+ $(LIBRADOS) libcls_replica_log_client.a \
+ $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+ceph_test_cls_replica_log_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_cls_replica_log
+
+ceph_test_cls_lock_SOURCES = \
+ test/cls_lock/test_cls_lock.cc \
+ test/librados/test.cc
+ceph_test_cls_lock_LDADD = $(LIBRADOS) libcls_lock_client.la $(UNITTEST_LDADD)
+ceph_test_cls_lock_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_cls_lock
+
+ceph_test_cls_hello_SOURCES = \
+ test/cls_hello/test_cls_hello.cc \
+ test/librados/test.cc
+ceph_test_cls_hello_LDADD = \
+ $(LIBRADOS) $(CRYPTO_LIBS) \
+ $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+ceph_test_cls_hello_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_cls_hello
+
+if WITH_RADOSGW
+ceph_test_cls_rgw_SOURCES = \
+ test/cls_rgw/test_cls_rgw.cc \
+ test/librados/test.cc
+ceph_test_cls_rgw_LDADD = $(LIBRADOS) libcls_rgw_client.la $(UNITTEST_LDADD)
+ceph_test_cls_rgw_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_cls_rgw
+endif # WITH_RADOSGW
+
+ceph_test_mon_workloadgen_SOURCES = test/mon/test_mon_workloadgen.cc
+ceph_test_mon_workloadgen_LDADD = $(LIBOS) $(LIBOSDC) $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_test_mon_workloadgen
+
+ceph_test_rados_api_cmd_SOURCES = \
+ test/librados/cmd.cc \
+ test/librados/test.cc
+ceph_test_rados_api_cmd_LDADD = $(LIBRADOS) $(UNITTEST_LDADD)
+ceph_test_rados_api_cmd_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_rados_api_cmd
+
+ceph_test_rados_api_io_SOURCES = \
+ test/librados/io.cc \
+ test/librados/test.cc
+ceph_test_rados_api_io_LDADD = $(LIBRADOS) $(UNITTEST_LDADD)
+ceph_test_rados_api_io_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_rados_api_io
+
+ceph_test_rados_api_aio_SOURCES = \
+ test/librados/aio.cc \
+ test/librados/test.cc
+ceph_test_rados_api_aio_LDADD = $(LIBRADOS) $(UNITTEST_LDADD)
+ceph_test_rados_api_aio_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_rados_api_aio
+
+ceph_test_rados_api_list_SOURCES = \
+ test/librados/list.cc \
+ test/librados/test.cc
+ceph_test_rados_api_list_LDADD = $(LIBRADOS) $(UNITTEST_LDADD)
+ceph_test_rados_api_list_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_rados_api_list
+
+ceph_test_rados_api_pool_SOURCES = \
+ test/librados/pool.cc \
+ test/librados/test.cc
+ceph_test_rados_api_pool_LDADD = $(LIBRADOS) $(UNITTEST_LDADD)
+ceph_test_rados_api_pool_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_rados_api_pool
+
+ceph_test_rados_api_stat_SOURCES = \
+ test/librados/stat.cc \
+ test/librados/test.cc
+ceph_test_rados_api_stat_LDADD = $(LIBRADOS) $(UNITTEST_LDADD)
+ceph_test_rados_api_stat_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_rados_api_stat
+
+ceph_test_rados_api_watch_notify_SOURCES = \
+ test/librados/watch_notify.cc \
+ test/librados/test.cc
+ceph_test_rados_api_watch_notify_LDADD = $(LIBRADOS) $(UNITTEST_LDADD)
+ceph_test_rados_api_watch_notify_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_rados_api_watch_notify
+
+ceph_test_rados_api_snapshots_SOURCES = \
+ test/librados/snapshots.cc \
+ test/librados/test.cc
+ceph_test_rados_api_snapshots_LDADD = $(LIBRADOS) $(UNITTEST_LDADD)
+ceph_test_rados_api_snapshots_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_rados_api_snapshots
+
+ceph_test_rados_api_cls_SOURCES = \
+ test/librados/cls.cc \
+ test/librados/test.cc
+ceph_test_rados_api_cls_LDADD = $(LIBRADOS) $(UNITTEST_LDADD)
+ceph_test_rados_api_cls_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_rados_api_cls
+
+ceph_test_rados_api_misc_SOURCES = \
+ test/librados/misc.cc \
+ test/librados/test.cc
+ceph_test_rados_api_misc_LDADD = $(LIBRADOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+ceph_test_rados_api_misc_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_rados_api_misc
+
+ceph_test_rados_api_lock_SOURCES = \
+ test/librados/lock.cc \
+ test/librados/test.cc
+ceph_test_rados_api_lock_LDADD = $(LIBRADOS) $(UNITTEST_LDADD)
+ceph_test_rados_api_lock_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_rados_api_lock
+
+ceph_test_libcephfs_SOURCES = \
+ test/libcephfs/test.cc \
+ test/libcephfs/readdir_r_cb.cc \
+ test/libcephfs/caps.cc \
+ test/libcephfs/multiclient.cc
+ceph_test_libcephfs_LDADD = $(LIBCEPHFS) $(UNITTEST_LDADD)
+ceph_test_libcephfs_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_libcephfs
+
+ceph_test_filestore_SOURCES = test/filestore/store_test.cc
+ceph_test_filestore_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+ceph_test_filestore_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_filestore
+
+ceph_test_filestore_workloadgen_SOURCES = \
+ test/filestore/workload_generator.cc \
+ test/filestore/TestFileStoreState.cc
+ceph_test_filestore_workloadgen_LDADD = $(LIBOS) $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_test_filestore_workloadgen
+
+ceph_test_filestore_idempotent_SOURCES = \
+ test/filestore/test_idempotent.cc \
+ test/filestore/FileStoreTracker.cc \
+ test/common/ObjectContents.cc
+ceph_test_filestore_idempotent_LDADD = $(LIBOS) $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_test_filestore_idempotent
+
+ceph_test_filestore_idempotent_sequence_SOURCES = \
+ test/filestore/test_idempotent_sequence.cc \
+ test/filestore/DeterministicOpSequence.cc \
+ test/filestore/TestFileStoreState.cc \
+ test/filestore/FileStoreDiff.cc
+ceph_test_filestore_idempotent_sequence_LDADD = $(LIBOS) $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_test_filestore_idempotent_sequence
+
+ceph_xattr_bench_SOURCES = test/xattr_bench.cc
+ceph_xattr_bench_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+ceph_xattr_bench_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_xattr_bench
+
+ceph_test_filejournal_SOURCES = test/test_filejournal.cc
+ceph_test_filejournal_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+ceph_test_filejournal_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_filejournal
+
+ceph_test_stress_watch_SOURCES = \
+ test/test_stress_watch.cc \
+ test/librados/test.cc
+ceph_test_stress_watch_LDADD = $(LIBRADOS) $(UNITTEST_LDADD)
+ceph_test_stress_watch_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_stress_watch
+
+ceph_test_objectcacher_stress_SOURCES = \
+ test/osdc/object_cacher_stress.cc \
+ test/osdc/FakeWriteback.cc
+ceph_test_objectcacher_stress_LDADD = $(LIBOSDC) $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_test_objectcacher_stress
+
+ceph_test_snap_mapper_SOURCES = test/test_snap_mapper.cc
+ceph_test_snap_mapper_LDADD = $(LIBOSD) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+ceph_test_snap_mapper_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_snap_mapper
+
+ceph_test_object_map_SOURCES = \
+ test/ObjectMap/test_object_map.cc \
+ test/ObjectMap/KeyValueDBMemory.cc
+ceph_test_object_map_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+ceph_test_object_map_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_object_map
+
+ceph_test_keyvaluedb_atomicity_SOURCES = test/ObjectMap/test_keyvaluedb_atomicity.cc
+ceph_test_keyvaluedb_atomicity_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+ceph_test_keyvaluedb_atomicity_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_keyvaluedb_atomicity
+
+ceph_test_keyvaluedb_iterators_SOURCES = \
+ test/ObjectMap/test_keyvaluedb_iterators.cc \
+ test/ObjectMap/KeyValueDBMemory.cc
+ceph_test_keyvaluedb_iterators_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+ceph_test_keyvaluedb_iterators_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_keyvaluedb_iterators
+
+ceph_test_store_tool_SOURCES = test/ObjectMap/test_store_tool/test_store_tool.cc
+ceph_test_store_tool_LDADD = $(LIBOS) $(CEPH_GLOBAL)
+ceph_test_store_tool_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_store_tool
+
+ceph_test_cfuse_cache_invalidate_SOURCES = test/test_cfuse_cache_invalidate.cc
+bin_DEBUGPROGRAMS += ceph_test_cfuse_cache_invalidate
+
+
+noinst_HEADERS += \
+ test/osd/RadosModel.h \
+ test/osd/Object.h \
+ test/osd/TestOpStat.h \
+ test/bench/distribution.h \
+ test/bench/rados_backend.h \
+ test/bench/rbd_backend.h \
+ test/bench/bencher.h \
+ test/bench/backend.h \
+ test/bench/dumb_backend.h \
+ test/bench/stat_collector.h \
+ test/bench/detailed_stat_collector.h \
+ test/bench/testfilestore_backend.h \
+ test/common/ObjectContents.h \
+ test/encoding/types.h \
+ test/filestore/DeterministicOpSequence.h \
+ test/filestore/FileStoreTracker.h \
+ test/filestore/FileStoreDiff.h \
+ test/filestore/TestFileStoreState.h \
+ test/filestore/workload_generator.h \
+ test/kv_store_bench.h \
+ test/librados/test.h \
+ test/ObjectMap/KeyValueDBMemory.h \
+ test/omap_bench.h \
+ test/osd/Object.h \
+ test/osd/RadosModel.h \
+ test/osd/TestOpStat.h \
+ test/osdc/FakeWriteback.h \
+ test/system/cross_process_sem.h \
+ test/system/st_rados_create_pool.h \
+ test/system/st_rados_list_objects.h \
+ test/system/st_rados_delete_objs.h \
+ test/system/st_rados_delete_pool.h \
+ test/system/st_rados_notify.h \
+ test/system/st_rados_watch.h \
+ test/system/systest_runnable.h \
+ test/system/systest_settings.h \
+ test/unit.h
+
diff --git a/src/test/ObjectMap/KeyValueDBMemory.h b/src/test/ObjectMap/KeyValueDBMemory.h
index 93d0809d491..5cffce3ef04 100644
--- a/src/test/ObjectMap/KeyValueDBMemory.h
+++ b/src/test/ObjectMap/KeyValueDBMemory.h
@@ -126,6 +126,24 @@ public:
return static_cast<TransactionImpl_*>(trans.get())->complete();
}
+ uint64_t get_estimated_size(map<string,uint64_t> &extras) {
+ uint64_t total_size = 0;
+
+ for (map<pair<string,string>,bufferlist>::iterator p = db.begin();
+ p != db.end(); ++p) {
+ string prefix = p->first.first;
+ bufferlist &bl = p->second;
+
+ uint64_t sz = bl.length();
+ total_size += sz;
+ if (extras.count(prefix) == 0)
+ extras[prefix] = 0;
+ extras[prefix] += sz;
+ }
+
+ return total_size;
+ }
+
private:
bool exists_prefix(const string &prefix) {
std::map<std::pair<string,string>,bufferlist>::iterator it;
diff --git a/src/test/ObjectMap/test_object_map.cc b/src/test/ObjectMap/test_object_map.cc
index 1b39c8068fb..23f220daf45 100644
--- a/src/test/ObjectMap/test_object_map.cc
+++ b/src/test/ObjectMap/test_object_map.cc
@@ -55,16 +55,16 @@ public:
}
void set_key(const string &objname, const string &key, const string &value) {
- set_key(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ set_key(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
key, value);
}
void set_xattr(const string &objname, const string &key, const string &value) {
- set_xattr(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ set_xattr(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
key, value);
}
- void set_key(hobject_t hoid,
+ void set_key(ghobject_t hoid,
string key, string value) {
map<string, bufferlist> to_write;
bufferptr bp(value.c_str(), value.size());
@@ -74,7 +74,7 @@ public:
db->set_keys(hoid, to_write);
}
- void set_xattr(hobject_t hoid,
+ void set_xattr(ghobject_t hoid,
string key, string value) {
map<string, bufferlist> to_write;
bufferptr bp(value.c_str(), value.size());
@@ -85,11 +85,11 @@ public:
}
void set_header(const string &objname, const string &value) {
- set_header(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ set_header(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
value);
}
- void set_header(hobject_t hoid,
+ void set_header(ghobject_t hoid,
const string &value) {
bufferlist header;
header.append(bufferptr(value.c_str(), value.size() + 1));
@@ -97,11 +97,11 @@ public:
}
int get_header(const string &objname, string *value) {
- return get_header(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ return get_header(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
value);
}
- int get_header(hobject_t hoid,
+ int get_header(ghobject_t hoid,
string *value) {
bufferlist header;
int r = db->get_header(hoid, &header);
@@ -115,11 +115,11 @@ public:
}
int get_xattr(const string &objname, const string &key, string *value) {
- return get_xattr(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ return get_xattr(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
key, value);
}
- int get_xattr(hobject_t hoid,
+ int get_xattr(ghobject_t hoid,
string key, string *value) {
set<string> to_get;
to_get.insert(key);
@@ -135,11 +135,11 @@ public:
}
int get_key(const string &objname, const string &key, string *value) {
- return get_key(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ return get_key(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
key, value);
}
- int get_key(hobject_t hoid,
+ int get_key(ghobject_t hoid,
string key, string *value) {
set<string> to_get;
to_get.insert(key);
@@ -155,11 +155,11 @@ public:
}
void remove_key(const string &objname, const string &key) {
- remove_key(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ remove_key(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
key);
}
- void remove_key(hobject_t hoid,
+ void remove_key(ghobject_t hoid,
string key) {
set<string> to_remove;
to_remove.insert(key);
@@ -167,11 +167,11 @@ public:
}
void remove_xattr(const string &objname, const string &key) {
- remove_xattr(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ remove_xattr(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
key);
}
- void remove_xattr(hobject_t hoid,
+ void remove_xattr(ghobject_t hoid,
string key) {
set<string> to_remove;
to_remove.insert(key);
@@ -179,20 +179,20 @@ public:
}
void clone(const string &objname, const string &target) {
- clone(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
- hobject_t(sobject_t(target, CEPH_NOSNAP)));
+ clone(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
+ ghobject_t(hobject_t(sobject_t(target, CEPH_NOSNAP))));
}
- void clone(hobject_t hoid,
- hobject_t hoid2) {
+ void clone(ghobject_t hoid,
+ ghobject_t hoid2) {
db->clone(hoid, hoid2);
}
void clear(const string &objname) {
- clear(hobject_t(sobject_t(objname, CEPH_NOSNAP)));
+ clear(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))));
}
- void clear(hobject_t hoid) {
+ void clear(ghobject_t hoid) {
db->clear(hoid);
}
@@ -543,7 +543,7 @@ int main(int argc, char **argv) {
}
TEST_F(ObjectMapTest, CreateOneObject) {
- hobject_t hoid(sobject_t("foo", CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP)), 100, 0);
map<string, bufferlist> to_set;
string key("test");
string val("test_val");
@@ -579,8 +579,8 @@ TEST_F(ObjectMapTest, CreateOneObject) {
}
TEST_F(ObjectMapTest, CloneOneObject) {
- hobject_t hoid(sobject_t("foo", CEPH_NOSNAP));
- hobject_t hoid2(sobject_t("foo2", CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP)), 200, 0);
+ ghobject_t hoid2(hobject_t(sobject_t("foo2", CEPH_NOSNAP)), 201, 1);
tester.set_key(hoid, "foo", "bar");
tester.set_key(hoid, "foo2", "bar2");
@@ -640,8 +640,8 @@ TEST_F(ObjectMapTest, CloneOneObject) {
}
TEST_F(ObjectMapTest, OddEvenClone) {
- hobject_t hoid(sobject_t("foo", CEPH_NOSNAP));
- hobject_t hoid2(sobject_t("foo2", CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP)));
+ ghobject_t hoid2(hobject_t(sobject_t("foo2", CEPH_NOSNAP)));
for (unsigned i = 0; i < 1000; ++i) {
tester.set_key(hoid, "foo" + num_str(i), "bar" + num_str(i));
diff --git a/src/test/ObjectMap/test_store_tool/test_store_tool.cc b/src/test/ObjectMap/test_store_tool/test_store_tool.cc
index ace91220df6..8fcf3f30e82 100644
--- a/src/test/ObjectMap/test_store_tool/test_store_tool.cc
+++ b/src/test/ObjectMap/test_store_tool/test_store_tool.cc
@@ -24,6 +24,7 @@
#include "common/errno.h"
#include "common/safe_io.h"
#include "common/config.h"
+#include "common/strtol.h"
using namespace std;
@@ -38,7 +39,7 @@ class StoreTool
db.reset(db_ptr);
}
- void list(const string &prefix) {
+ void list(const string &prefix, const bool do_crc) {
KeyValueDB::WholeSpaceIterator iter = db->get_iterator();
if (prefix.empty())
@@ -51,7 +52,11 @@ class StoreTool
if (!prefix.empty() && (rk.first != prefix))
break;
- std::cout << rk.first << ":" << rk.second << std::endl;
+ std::cout << rk.first << ":" << rk.second;
+ if (do_crc) {
+ std::cout << " (" << iter->value().crc32c(0) << ")";
+ }
+ std::cout << std::endl;
iter->next();
}
}
@@ -79,7 +84,7 @@ class StoreTool
assert(!prefix.empty() && !key.empty());
map<string,bufferlist> result;
- set<string> keys;
+ std::set<std::string> keys;
keys.insert(key);
db->get(prefix, keys, &result);
@@ -90,6 +95,29 @@ class StoreTool
exists = false;
return bufferlist();
}
+
+ uint64_t get_size() {
+ map<string,uint64_t> extras;
+ uint64_t s = db->get_estimated_size(extras);
+ for (map<string,uint64_t>::iterator p = extras.begin();
+ p != extras.end(); ++p) {
+ std::cout << p->first << " - " << p->second << std::endl;
+ }
+ std::cout << "total: " << s << std::endl;
+ return s;
+ }
+
+ bool set(const string &prefix, const string &key, bufferlist &val) {
+ assert(!prefix.empty());
+ assert(!key.empty());
+ assert(val.length() > 0);
+
+ KeyValueDB::Transaction tx = db->get_transaction();
+ tx->set(prefix, key, val);
+ int ret = db->submit_transaction_sync(tx);
+
+ return (ret == 0);
+ }
};
void usage(const char *pname)
@@ -98,9 +126,12 @@ void usage(const char *pname)
<< "\n"
<< "Commands:\n"
<< " list [prefix]\n"
+ << " list-crc [prefix]\n"
<< " exists <prefix> [key]\n"
<< " get <prefix> <key>\n"
- << " verify <store path>\n"
+ << " crc <prefix> <key>\n"
+ << " get-size\n"
+ << " set <prefix> <key> [ver <N>|in <file>]\n"
<< std::endl;
}
@@ -128,12 +159,14 @@ int main(int argc, const char *argv[])
StoreTool st(path);
- if (cmd == "list") {
+ if (cmd == "list" || cmd == "list-crc") {
string prefix;
if (argc > 3)
prefix = argv[3];
- st.list(prefix);
+ bool do_crc = (cmd == "list-crc");
+
+ st.list(prefix, do_crc);
} else if (cmd == "exists") {
string key;
@@ -171,8 +204,63 @@ int main(int argc, const char *argv[])
bl.hexdump(os);
std::cout << os.str() << std::endl;
- } else if (cmd == "verify") {
- assert(0);
+ } else if (cmd == "crc") {
+ if (argc < 5) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(argv[3]);
+ string key(argv[4]);
+
+ bool exists = false;
+ bufferlist bl = st.get(prefix, key, exists);
+ std::cout << "(" << prefix << ", " << key << ") ";
+ if (!exists) {
+ std::cout << " does not exist" << std::endl;
+ return 1;
+ }
+ std::cout << " crc " << bl.crc32c(0) << std::endl;
+
+ } else if (cmd == "get-size") {
+ std::cout << "estimated store size: " << st.get_size() << std::endl;
+
+ } else if (cmd == "set") {
+ if (argc < 7) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(argv[3]);
+ string key(argv[4]);
+ string subcmd(argv[5]);
+
+ bufferlist val;
+ string errstr;
+ if (subcmd == "ver") {
+ version_t v = (version_t) strict_strtoll(argv[6], 10, &errstr);
+ if (!errstr.empty()) {
+ std::cerr << "error reading version: " << errstr << std::endl;
+ return 1;
+ }
+ ::encode(v, val);
+ } else if (subcmd == "in") {
+ int ret = val.read_file(argv[6], &errstr);
+ if (ret < 0 || !errstr.empty()) {
+ std::cerr << "error reading file: " << errstr << std::endl;
+ return 1;
+ }
+ } else {
+ std::cerr << "unrecognized subcommand '" << subcmd << "'" << std::endl;
+ usage(argv[0]);
+ return 1;
+ }
+
+ bool ret = st.set(prefix, key, val);
+ if (!ret) {
+ std::cerr << "error setting ("
+ << prefix << "," << key << ")" << std::endl;
+ return 1;
+ }
+
} else {
std::cerr << "Unrecognized command: " << cmd << std::endl;
return 1;
diff --git a/src/barclass.cc b/src/test/barclass.cc
index f5354f1e0f3..f5354f1e0f3 100644
--- a/src/barclass.cc
+++ b/src/test/barclass.cc
diff --git a/src/test/test_libcommon_build.cc b/src/test/buildtest_skeleton.cc
index 8215a05dbda..8215a05dbda 100644
--- a/src/test/test_libcommon_build.cc
+++ b/src/test/buildtest_skeleton.cc
diff --git a/src/test/ceph_compatset.cc b/src/test/ceph_compatset.cc
new file mode 100644
index 00000000000..2b57db01ab9
--- /dev/null
+++ b/src/test/ceph_compatset.cc
@@ -0,0 +1,164 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <fstream>
+#include <iostream>
+#include <errno.h>
+#include <sys/stat.h>
+#include <signal.h>
+#include <ctype.h>
+#include <boost/scoped_ptr.hpp>
+#include <string>
+
+#include "include/types.h"
+#include "include/compat.h"
+
+//#undef assert
+//#define assert(foo) if (!(foo)) abort();
+
+#include "include/CompatSet.h"
+
+#include "gtest/gtest.h"
+#include <vector>
+
+TEST(CephCompatSet, AllSet) {
+ CompatSet::FeatureSet compat;
+ CompatSet::FeatureSet ro;
+ CompatSet::FeatureSet incompat;
+
+ EXPECT_THROW(compat.insert(CompatSet::Feature(0, "test")), FailedAssertion);
+ EXPECT_THROW(compat.insert(CompatSet::Feature(64, "test")), FailedAssertion);
+
+ for (int i = 1; i < 64; i++) {
+ stringstream cname;
+ cname << string("c") << i;
+ compat.insert(CompatSet::Feature(i,cname.str().c_str()));
+ stringstream roname;
+ roname << string("r") << i;
+ ro.insert(CompatSet::Feature(i,roname.str().c_str()));
+ stringstream iname;
+ iname << string("i") << i;
+ incompat.insert(CompatSet::Feature(i,iname.str().c_str()));
+ }
+ CompatSet tcs(compat, ro, incompat);
+
+ //cout << tcs << std::endl;
+
+ //Due to a workaround for a bug bit 0 is always set even though it is
+ //not a legal feature.
+ EXPECT_EQ(tcs.compat.mask, (uint64_t)0xffffffffffffffff);
+ EXPECT_EQ(tcs.ro_compat.mask, (uint64_t)0xffffffffffffffff);
+ EXPECT_EQ(tcs.incompat.mask, (uint64_t)0xffffffffffffffff);
+
+ for (int i = 1; i < 64; i++) {
+ EXPECT_TRUE(tcs.compat.contains(i));
+ stringstream cname;
+ cname << string("c") << i;
+ EXPECT_TRUE(tcs.compat.contains(CompatSet::Feature(i,cname.str().c_str())));
+ tcs.compat.remove(i);
+
+ EXPECT_TRUE(tcs.ro_compat.contains(i));
+ stringstream roname;
+ roname << string("r") << i;
+ EXPECT_TRUE(tcs.ro_compat.contains(CompatSet::Feature(i,roname.str().c_str())));
+ tcs.ro_compat.remove(i);
+
+ EXPECT_TRUE(tcs.incompat.contains(i));
+ stringstream iname;
+ iname << string("i") << i;
+ EXPECT_TRUE(tcs.incompat.contains(CompatSet::Feature(i,iname.str().c_str())));
+ tcs.incompat.remove(i);
+ }
+ //Due to a workaround for a bug bit 0 is always set even though it is
+ //not a legal feature.
+ EXPECT_EQ(tcs.compat.mask, (uint64_t)1);
+ EXPECT_TRUE(tcs.compat.names.empty());
+ EXPECT_EQ(tcs.ro_compat.mask, (uint64_t)1);
+ EXPECT_TRUE(tcs.ro_compat.names.empty());
+ EXPECT_EQ(tcs.incompat.mask, (uint64_t)1);
+ EXPECT_TRUE(tcs.incompat.names.empty());
+}
+
+TEST(CephCompatSet, other) {
+ CompatSet s1, s2, s1dup;
+
+ s1.compat.insert(CompatSet::Feature(1, "c1"));
+ s1.compat.insert(CompatSet::Feature(2, "c2"));
+ s1.compat.insert(CompatSet::Feature(32, "c32"));
+ s1.ro_compat.insert(CompatSet::Feature(63, "r63"));
+ s1.incompat.insert(CompatSet::Feature(1, "i1"));
+
+ s2.compat.insert(CompatSet::Feature(1, "c1"));
+ s2.compat.insert(CompatSet::Feature(32, "c32"));
+ s2.ro_compat.insert(CompatSet::Feature(63, "r63"));
+ s2.incompat.insert(CompatSet::Feature(1, "i1"));
+
+ s1dup = s1;
+
+ //Check exact match
+ EXPECT_EQ(s1.compare(s1dup), 0);
+
+ //Check superset
+ EXPECT_EQ(s1.compare(s2), 1);
+
+ //Check missing features
+ EXPECT_EQ(s2.compare(s1), -1);
+
+ CompatSet diff = s2.unsupported(s1);
+ EXPECT_EQ(diff.compat.mask, (uint64_t)1<<2 | 1);
+ EXPECT_EQ(diff.ro_compat.mask, (uint64_t)1);
+ EXPECT_EQ(diff.incompat.mask, (uint64_t)1);
+
+ CompatSet s3 = s1;
+ s3.incompat.insert(CompatSet::Feature(4, "i4"));
+
+ diff = s1.unsupported(s3);
+ EXPECT_EQ(diff.compat.mask, (uint64_t)1);
+ EXPECT_EQ(diff.ro_compat.mask, (uint64_t)1);
+ EXPECT_EQ(diff.incompat.mask, (uint64_t)1<<4 | 1);
+}
+
+TEST(CephCompatSet, merge) {
+ CompatSet s1, s2, s1dup, s2dup;
+
+ s1.compat.insert(CompatSet::Feature(1, "c1"));
+ s1.compat.insert(CompatSet::Feature(2, "c2"));
+ s1.compat.insert(CompatSet::Feature(32, "c32"));
+ s1.ro_compat.insert(CompatSet::Feature(63, "r63"));
+ s1.incompat.insert(CompatSet::Feature(1, "i1"));
+
+ s1dup = s1;
+
+ s2.compat.insert(CompatSet::Feature(1, "c1"));
+ s2.compat.insert(CompatSet::Feature(32, "c32"));
+ s2.ro_compat.insert(CompatSet::Feature(1, "r1"));
+ s2.ro_compat.insert(CompatSet::Feature(63, "r63"));
+ s2.incompat.insert(CompatSet::Feature(1, "i1"));
+
+ s2dup = s2;
+
+ //Nothing to merge if they are the same
+ EXPECT_FALSE(s1.merge(s1dup));
+ EXPECT_FALSE(s2.merge(s2dup));
+
+ EXPECT_TRUE(s1.merge(s2));
+ EXPECT_EQ(s1.compat.mask, (uint64_t)1<<1 | (uint64_t)1<<2 | (uint64_t)1<<32 | 1);
+ EXPECT_EQ(s1.ro_compat.mask, (uint64_t)1<<1 | (uint64_t)1<<63 | 1);
+ EXPECT_EQ(s1.incompat.mask, (uint64_t)1<<1 | 1);
+
+ EXPECT_TRUE(s2.merge(s1dup));
+ EXPECT_EQ(s2.compat.mask, (uint64_t)1<<1 | (uint64_t)1<<2 | (uint64_t)1<<32 | 1);
+ EXPECT_EQ(s2.ro_compat.mask, (uint64_t)1<<1 | (uint64_t)1<<63 | 1);
+ EXPECT_EQ(s2.incompat.mask, (uint64_t)1<<1 | 1);
+}
diff --git a/src/test/cli-integration/rbd/formatted-output.t b/src/test/cli-integration/rbd/formatted-output.t
index bece14f11f1..707e0749367 100644
--- a/src/test/cli-integration/rbd/formatted-output.t
+++ b/src/test/cli-integration/rbd/formatted-output.t
@@ -39,7 +39,7 @@ For now, use a more inclusive regex.
$ rbd info foo
rbd image 'foo':
\tsize 1024 MB in 256 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 1 (esc)
$ rbd info foo --format json | python -mjson.tool
@@ -67,7 +67,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info foo@snap
rbd image 'foo':
\tsize 1024 MB in 256 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 1 (esc)
\tprotected: False (esc)
@@ -96,7 +96,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info bar
rbd image 'bar':
\tsize 1024 MB in 256 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -131,7 +131,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info bar@snap
rbd image 'bar':
\tsize 512 MB in 128 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -169,7 +169,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info bar@snap2
rbd image 'bar':
\tsize 1024 MB in 256 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -207,7 +207,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info baz
rbd image 'baz':
\tsize 2048 MB in 512 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -241,8 +241,8 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
</image>
$ rbd info quux
rbd image 'quux':
- \tsize 1024 KB in 1 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \tsize 1024 kB in 1 objects (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 1 (esc)
$ rbd info quux --format json | python -mjson.tool
@@ -268,7 +268,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info data/child
rbd image 'child':
\tsize 512 MB in 128 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -303,7 +303,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info data/child@snap
rbd image 'child':
\tsize 512 MB in 128 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -375,7 +375,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
NAME SIZE PARENT FMT PROT LOCK
foo 1024M 1
foo@snap 1024M 1
- quux 1024K 1 excl
+ quux 1024k 1 excl
bar 1024M 2
bar@snap 512M 2 yes
bar@snap2 1024M 2
diff --git a/src/test/cli/radosgw-admin/help.t b/src/test/cli/radosgw-admin/help.t
index 90f6beca133..4fe30b1cda7 100644
--- a/src/test/cli/radosgw-admin/help.t
+++ b/src/test/cli/radosgw-admin/help.t
@@ -23,6 +23,9 @@
bucket check check bucket index
object rm remove object
object unlink unlink object from bucket index
+ quota set set quota params
+ quota enable enable quota
+ quota disable disable quota
region get show region info
regions list list all regions set on this cluster
region set set region info (requires infile)
@@ -88,6 +91,7 @@
mdlog trim
replica mdlog get/delete
replica datalog get/delete
+ --metadata-key=<key> key to retrieve metadata from with metadata get
--rgw-region=<region> region in which radosgw is running
--rgw-zone=<zone> zone in which radosgw is running
--fix besides checking bucket index, will also fix it
@@ -115,9 +119,16 @@
<date> := "YYYY-MM-DD[ hh:mm:ss]"
- --conf/-c Read configuration from the given configuration file
- --id/-i set ID portion of my name
- --name/-n set name (TYPE.ID)
- --version show version and quit
+ Quota options:
+ --bucket specified bucket for quota command
+ --max-objects specify max objects
+ --max-size specify max size (in bytes)
+ --quota-scope scope of quota (bucket, user)
+
+ --conf/-c FILE read configuration from the given configuration file
+ --id/-i ID set ID portion of my name
+ --name/-n TYPE.ID set name
+ --cluster NAME set cluster name (default: ceph)
+ --version show version and quit
[1]
diff --git a/src/test/cli/rbd/help.t b/src/test/cli/rbd/help.t
index 1ad79385a7e..754e11f9357 100644
--- a/src/test/cli/rbd/help.t
+++ b/src/test/cli/rbd/help.t
@@ -76,4 +76,5 @@
--pretty-format make json or xml output more readable
--no-settle do not wait for udevadm to settle on map/unmap
--no-progress do not show progress for long-running commands
+ --read-only set device readonly when mapping image
--allow-shrink allow shrinking of an image when resizing
diff --git a/src/test/common/get_command_descriptions.cc b/src/test/common/get_command_descriptions.cc
new file mode 100644
index 00000000000..aff5575b8c4
--- /dev/null
+++ b/src/test/common/get_command_descriptions.cc
@@ -0,0 +1,116 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Library Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Library Public License for more details.
+ *
+ */
+
+#include <stdio.h>
+#include <signal.h>
+#include "mon/Monitor.h"
+#include "common/ceph_argparse.h"
+#include "global/global_init.h"
+#include "common/debug.h"
+
+#define dout_subsys ceph_subsys_mon
+
+// Write the command-line help text for this helper to `out`.
+static void usage(ostream &out)
+{
+  // Only the first line ends with std::endl (which flushes the stream); every
+  // other line ends with a plain newline.
+  out << "usage: get_command_descriptions [options ...]" << std::endl
+      << "print on stdout the result of JSON formatted options\n"
+      << "found in mon/MonCommands.h as produced by the\n"
+      << "Monitor.cc::get_command_descriptions function.\n"
+      << "Designed as a helper for ceph_argparse.py unit tests.\n"
+      << "\n"
+      << " --all all of mon/MonCommands.h \n"
+      << " --pull585 reproduce the bug fixed by #585\n"
+      << "\n"
+      << "Examples:\n"
+      << " get_command_descriptions --all\n"
+      << " get_command_descriptions --pull585\n";
+}
+
+// Pass `size` entries of `mon_commands` through get_command_descriptions()
+// with a "json" formatter and emit the resulting text via dout(0).
+static void json_print(const MonCommand *mon_commands, int size)
+{
+  Formatter *formatter = new_formatter("json");
+  bufferlist encoded;
+  get_command_descriptions(mon_commands, size, formatter, &encoded);
+  delete formatter;
+
+  // Copy the buffer contents into a string before handing it to the logger.
+  string json(encoded.c_str(), encoded.length());
+  dout(0) << json << dendl;
+}
+
+// Build a MonCommand table from mon/MonCommands.h and print its JSON
+// description through json_print().
+static void all()
+{
+ // Discard any COMMAND definition that is already in effect so the one
+ // defined below is the one applied to the included header.
+#undef COMMAND
+ MonCommand mon_commands[] = {
+ // Each COMMAND(...) use expands to one brace-enclosed initializer followed
+ // by a comma. NOTE(review): presumably MonCommands.h consists solely of
+ // COMMAND(...) invocations -- confirm against the header.
+#define COMMAND(parsesig, helptext, modulename, req_perms, avail) \
+ {parsesig, helptext, modulename, req_perms, avail},
+#include <mon/MonCommands.h>
+ };
+
+ json_print(mon_commands, ARRAY_SIZE(mon_commands));
+}
+
+// syntax error https://github.com/ceph/ceph/pull/585
+//
+// Print the JSON description of a single, deliberately malformed command
+// signature. The adjacent string literals below are concatenated by the
+// compiler, and the pgp_num fragment has no trailing space, so "req=false"
+// runs straight into "name=properties". Do not "fix" the literal: the broken
+// signature is the input this function exists to produce.
+static void pull585()
+{
+ MonCommand mon_commands[] = {
+ { "osd pool create "
+ "name=pool,type=CephPoolname "
+ "name=pg_num,type=CephInt,range=0 "
+ "name=pgp_num,type=CephInt,range=0,req=false" // !!! missing trailing space
+ "name=properties,type=CephString,n=N,req=false,goodchars=[A-Za-z0-9-_.=]",
+ "create pool", "osd", "rw", "cli,rest" }
+ };
+
+ json_print(mon_commands, ARRAY_SIZE(mon_commands));
+}
+
+// Entry point: initialise the Ceph global context, then run the action named
+// by each argument, in the order given.
+//
+// - If the argument vector is empty after initialisation, usage is printed on
+//   stderr and the process exits with status 1.
+// - "help", "-h" or "--help" prints usage on stdout and exits with status 0.
+// - "--all" and "--pull585" call all() and pull585() respectively.
+// - Any other argument is ignored.
+int main(int argc, char **argv) {
+  vector<const char*> args;
+  argv_to_vec(argc, (const char **)argv, args);
+
+  global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+
+  if (args.empty()) {
+    usage(cerr);
+    exit(1);
+  }
+  for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ++i) {
+    // Convert once instead of building a temporary string per comparison.
+    const string arg(*i);
+
+    if (arg == "help" || arg == "-h" || arg == "--help") {
+      usage(cout);
+      exit(0);
+    } else if (arg == "--all") {
+      all();
+    } else if (arg == "--pull585") {
+      pull585();
+    }
+  }
+  return 0;
+}
+
+/*
+ * Local Variables:
+ * compile-command: "cd ../.. ;
+ * make get_command_descriptions &&
+ * ./get_command_descriptions --all --pull585"
+ * End:
+ */
+
diff --git a/src/test/common/test_bloom_filter.cc b/src/test/common/test_bloom_filter.cc
new file mode 100644
index 00000000000..cfd41305caa
--- /dev/null
+++ b/src/test/common/test_bloom_filter.cc
@@ -0,0 +1,289 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank <info@inktank.com>
+ *
+ * LGPL2.1 (see COPYING-LGPL2.1) or later
+ */
+
+#include <iostream>
+#include <gtest/gtest.h>
+
+#include "include/stringify.h"
+#include "common/bloom_filter.hpp"
+
+// Smoke test: keys that were inserted must be reported as present.
+TEST(BloomFilter, Basic) {
+  bloom_filter filter(10, .1, 1);
+
+  filter.insert("foo");
+  filter.insert("bar");
+
+  ASSERT_TRUE(filter.contains("foo"));
+  ASSERT_TRUE(filter.contains("bar"));
+}
+
+// A default-constructed filter must not report any key as present, whether
+// the key is probed as an integer or as its string form.
+TEST(BloomFilter, Empty) {
+  bloom_filter empty_filter;
+  int probe = 0;
+  while (probe < 100) {
+    ASSERT_FALSE(empty_filter.contains(probe));
+    ASSERT_FALSE(empty_filter.contains(stringify(probe)));
+    ++probe;
+  }
+}
+
+// Sweep over several capacities and target false-positive rates using string
+// keys. For each combination: insert two fixed keys plus `capacity` generated
+// keys, probe with 100 * capacity keys that were never inserted, print one
+// row of statistics and require the measured rate to stay below 10x target.
+TEST(BloomFilter, Sweep) {
+  std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
+  std::cout.precision(5);
+  std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl;
+  for (int ex = 3; ex < 12; ex += 2) {
+    for (float target = .001; target < .5; target *= 4.0) {
+      int capacity = 2 << ex;
+      bloom_filter filter(capacity, target, 1);
+      filter.insert("foo");
+      filter.insert("bar");
+
+      ASSERT_TRUE(filter.contains("foo"));
+      ASSERT_TRUE(filter.contains("bar"));
+
+      for (int k = 0; k < capacity; k++) {
+        filter.insert("ok" + stringify(k));
+      }
+
+      // Probe with keys from a namespace that was never inserted.
+      int probes = capacity * 100;
+      int false_hits = 0;
+      for (int k = 0; k < probes; k++) {
+        if (filter.contains("asdf" + stringify(k))) {
+          false_hits++;
+        }
+      }
+
+      // The fixed keys must still be found after the bulk inserts.
+      ASSERT_TRUE(filter.contains("foo"));
+      ASSERT_TRUE(filter.contains("bar"));
+
+      double measured = (double)false_hits / (double)probes;
+
+      bufferlist encoded;
+      ::encode(filter, encoded);
+
+      double bytes_per_insert = (double)encoded.length() / (double)capacity;
+
+      std::cout << capacity << "\t" << target << "\t" << measured << "\t" << encoded.length() << "\t" << bytes_per_insert << std::endl;
+      ASSERT_TRUE(measured < target * 10);
+    }
+  }
+}
+
+// Same sweep as the string-keyed variant, but the bulk inserts and the probes
+// use integer keys. For each capacity / target-rate combination the test
+// prints one row of statistics and requires:
+//   - the measured false-positive rate to lie within 10x of the target, and
+//   - the filter's bit density to lie between 0.40 and 0.60.
+TEST(BloomFilter, SweepInt) {
+  std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
+  std::cout.precision(5);
+  std::cout << "# max\tfpp\tactual\tsize\tB/insert\tdensity\tapprox_element_count" << std::endl;
+  for (int ex = 3; ex < 12; ex += 2) {
+    for (float fpp = .001; fpp < .5; fpp *= 4.0) {
+      int max = 2 << ex;
+      bloom_filter bf(max, fpp, 1);
+      bf.insert("foo");
+      bf.insert("bar");
+
+      // Previously ASSERT_TRUE(123) / ASSERT_TRUE(456): constant-true
+      // expressions that verified nothing. Check the keys actually inserted.
+      ASSERT_TRUE(bf.contains("foo"));
+      ASSERT_TRUE(bf.contains("bar"));
+
+      for (int n = 0; n < max; n++)
+        bf.insert(n);
+
+      // Probe with integers outside the inserted range [0, max).
+      int test = max * 100;
+      int hit = 0;
+      for (int n = 0; n < test; n++)
+        if (bf.contains(100000 + n))
+          hit++;
+
+      // The string keys must still be present after the bulk inserts.
+      ASSERT_TRUE(bf.contains("foo"));
+      ASSERT_TRUE(bf.contains("bar"));
+
+      double actual = (double)hit / (double)test;
+
+      bufferlist bl;
+      ::encode(bf, bl);
+
+      double byte_per_insert = (double)bl.length() / (double)max;
+
+      std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert
+                << "\t" << bf.density() << "\t" << bf.approx_unique_element_count() << std::endl;
+      ASSERT_TRUE(actual < fpp * 10);
+      ASSERT_TRUE(actual > fpp / 10);
+      ASSERT_TRUE(bf.density() > 0.40);
+      ASSERT_TRUE(bf.density() < 0.60);
+    }
+  }
+}
+
+
+// Fill a compressible filter sized for 1024 elements with 1024/div integers,
+// compress it by a factor of 1/div (for div > 1), and check that:
+//   - every inserted key is still found,
+//   - the measured false-positive rate stays within 2x of the target, and
+//   - the element-count estimate after compression stays within 2x of the
+//     estimate taken before it.
+TEST(BloomFilter, CompressibleSweep) {
+  std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
+  std::cout.precision(5);
+  std::cout << "# max\tins\test ins\tafter\ttgtfpp\tactual\tsize\tb/elem\n";
+  float fpp = .01;
+  int max = 1024;
+  for (int div = 1; div < 10; div++) {
+    compressible_bloom_filter filter(max, fpp, 1);
+    int inserted = max/div;
+    for (int k = 0; k < inserted; k++) {
+      filter.insert(k);
+    }
+
+    // Estimate taken before compression.
+    unsigned est_before = filter.approx_unique_element_count();
+    if (div > 1) {
+      filter.compress(1.0 / div);
+    }
+
+    for (int k = 0; k < inserted; k++) {
+      ASSERT_TRUE(filter.contains(k));
+    }
+
+    // Probe with integers that were never inserted.
+    int probes = max * 100;
+    int false_hits = 0;
+    for (int k = 0; k < probes; k++) {
+      if (filter.contains(100000 + k)) {
+        false_hits++;
+      }
+    }
+
+    double measured = (double)false_hits / (double)probes;
+
+    bufferlist encoded;
+    ::encode(filter, encoded);
+
+    double bytes_per_insert = (double)encoded.length() / (double)max;
+    unsigned est_after = filter.approx_unique_element_count();
+    std::cout << max
+              << "\t" << inserted
+              << "\t" << est_before
+              << "\t" << est_after
+              << "\t" << fpp
+              << "\t" << measured
+              << "\t" << encoded.length() << "\t" << bytes_per_insert
+              << std::endl;
+
+    ASSERT_TRUE(measured < fpp * 2.0);
+    ASSERT_TRUE(measured > fpp / 2.0);
+    ASSERT_TRUE(est_after < est_before * 2);
+    ASSERT_TRUE(est_after > est_before / 2);
+  }
+}
+
+
+
+// Split a fixed insert budget (total_max) and false-positive budget
+// (total_fpp) evenly across 1..15 filters ("bins"). For each bin count, print
+// the combined false-positive rate and the combined encoded size. This test
+// only reports figures; it makes no assertions.
+TEST(BloomFilter, BinSweep) {
+  std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
+  std::cout.precision(5);
+  int total_max = 16384;
+  float total_fpp = .01;
+  std::cout << "total_inserts " << total_max << " target-fpp " << total_fpp << std::endl;
+  for (int bins = 1; bins < 16; ++bins) {
+    int max = total_max / bins;
+    float fpp = total_fpp / bins;//pow(total_fpp, bins);
+
+    std::vector<bloom_filter*> ls;
+    bufferlist bl;
+    for (int i=0; i<bins; i++) {
+      ls.push_back(new bloom_filter(max, fpp, i));
+      for (int j=0; j<max; j++) {
+        ls.back()->insert(10000 * (i+1) + j);
+      }
+      // Encode the filter that was just filled. The previous code encoded
+      // ls.front() on every pass, so the first filter was serialised `bins`
+      // times and the others never were.
+      ::encode(*ls.back(), bl);
+    }
+
+    int hit = 0;
+    int test = max * 100;
+    for (int i=0; i<test; ++i) {
+      for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) {
+        if ((*j)->contains(i * 732)) { // note: sequential i does not work here; the internal int hash is weak!!
+          hit++;
+          break;
+        }
+      }
+    }
+
+    double actual = (double)hit / (double)test;
+    std::cout << "bins " << bins << " bin-max " << max << " bin-fpp " << fpp
+              << " actual-fpp " << actual
+              << " total-size " << bl.length() << std::endl;
+
+    // Free the filters allocated for this bin count; they were leaked before.
+    for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) {
+      delete *j;
+    }
+  }
+}
+
+// disable these tests; doing dual insertions in consecutive filters
+// appears to be equivalent to doing a single insertion in a bloom
+// filter that is twice as big.
+#if 0
+
+// Test the fpp over a sequence of bloom filters, each with unique
+// items inserted into it.
+//
+// We expect: actual_fpp = num_filters * per_filter_fpp
+//
+// NOTE(review): the body below also inserts every key into the previous
+// filter (ls[ls.size() - 2]), which contradicts "unique items" above. It
+// looks copied from the SequenceDouble test; confirm the intent before
+// re-enabling this code.
+// NOTE(review): the filters allocated with `new` are never deleted.
+TEST(BloomFilter, Sequence) {
+
+ int max = 1024;
+ double fpp = .01;
+ // seq is the number of filters in the sequence: 2, 4, ..., 128.
+ for (int seq = 2; seq <= 128; seq *= 2) {
+ std::vector<bloom_filter*> ls;
+ for (int i=0; i<seq; i++) {
+ ls.push_back(new bloom_filter(max*2, fpp, i));
+ for (int j=0; j<max; j++) {
+ ls.back()->insert("ok" + stringify(j) + "_" + stringify(i));
+ if (ls.size() > 1)
+ ls[ls.size() - 2]->insert("ok" + stringify(j) + "_" + stringify(i));
+ }
+ }
+
+ // Count probes that match at least one filter; none of the "bad..." keys
+ // were inserted, so every match is a false positive.
+ int hit = 0;
+ int test = max * 100;
+ for (int i=0; i<test; ++i) {
+ for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) {
+ if ((*j)->contains("bad" + stringify(i))) {
+ hit++;
+ break;
+ }
+ }
+ }
+
+ double actual = (double)hit / (double)test;
+ std::cout << "seq " << seq << " max " << max << " fpp " << fpp << " actual " << actual << std::endl;
+ }
+}
+
+// Test the fpp over a sequence of bloom filters, where actual values
+// are always inserted into a consecutive pair of filters. In order
+// to have a false positive, we need to falsely match two consecutive
+// filters.
+//
+// We expect: actual_fpp = num_filters * per_filter_fpp^2
+//
+// NOTE(review): the filters allocated with `new` are never deleted.
+TEST(BloomFilter, SequenceDouble) {
+ int max = 1024;
+ double fpp = .01;
+ // seq is the number of filters in the sequence: 2, 4, ..., 128.
+ for (int seq = 2; seq <= 128; seq *= 2) {
+ std::vector<bloom_filter*> ls;
+ for (int i=0; i<seq; i++) {
+ ls.push_back(new bloom_filter(max*2, fpp, i));
+ for (int j=0; j<max; j++) {
+ // Each key goes into the newest filter and, when there is one, the
+ // filter created just before it.
+ ls.back()->insert("ok" + stringify(j) + "_" + stringify(i));
+ if (ls.size() > 1)
+ ls[ls.size() - 2]->insert("ok" + stringify(j) + "_" + stringify(i));
+ }
+ }
+
+ int hit = 0;
+ int test = max * 100;
+ // Length of the current run of consecutive matching filters.
+ // NOTE(review): run is declared outside the probe loop and is not reset
+ // between probes, so a run can carry over from one probe to the next --
+ // confirm whether that is intended.
+ int run = 0;
+ for (int i=0; i<test; ++i) {
+ for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) {
+ if ((*j)->contains("bad" + stringify(i))) {
+ run++;
+ if (run >= 2) {
+ hit++;
+ break;
+ }
+ } else {
+ run = 0;
+ }
+ }
+ }
+
+ double actual = (double)hit / (double)test;
+ std::cout << "seq " << seq << " max " << max << " fpp " << fpp << " actual " << actual
+ << " expected " << (fpp*fpp*(double)seq) << std::endl;
+ }
+}
+
+#endif
diff --git a/src/test/common/test_crc32c.cc b/src/test/common/test_crc32c.cc
new file mode 100644
index 00000000000..5cf88de0a80
--- /dev/null
+++ b/src/test/common/test_crc32c.cc
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+#include <string.h>
+
+#include "include/types.h"
+#include "include/crc32c.h"
+#include "include/utime.h"
+#include "common/Clock.h"
+
+#include "gtest/gtest.h"
+
+#include "common/sctp_crc32.h"
+#include "common/crc32c_intel_baseline.h"
+
+// Check ceph_crc32c() against known values for two short strings, each with a
+// zero and a non-zero initial CRC.
+TEST(Crc32c, Small) {
+  const char *first = "foo bar baz";
+  const char *second = "whiz bang boom";
+
+  ASSERT_EQ(4119623852u, ceph_crc32c(0, (unsigned char *)first, strlen(first)));
+  ASSERT_EQ(881700046u, ceph_crc32c(1234, (unsigned char *)first, strlen(first)));
+
+  ASSERT_EQ(2360230088u, ceph_crc32c(0, (unsigned char *)second, strlen(second)));
+  ASSERT_EQ(3743019208u, ceph_crc32c(5678, (unsigned char *)second, strlen(second)));
+}
+
+// Check ceph_crc32c() on 5-byte and 35-byte buffers filled with the byte
+// value 1.
+TEST(Crc32c, PartialWord) {
+  // The buffers are written to, so they are not declared const.
+  unsigned char *a = (unsigned char *)malloc(5);
+  unsigned char *b = (unsigned char *)malloc(35);
+  if (a == NULL || b == NULL) {
+    free(a);
+    free(b);
+    FAIL() << "unable to allocate test buffers";
+  }
+  memset(a, 1, 5);
+  memset(b, 1, 35);
+
+  // Compute first and free before asserting, so a failed assertion (which
+  // returns from the test body) does not leak the buffers.
+  unsigned crc_a = ceph_crc32c(0, a, 5);
+  unsigned crc_b = ceph_crc32c(0, b, 35);
+  free(a);
+  free(b);
+
+  ASSERT_EQ(2715569182u, crc_a);
+  ASSERT_EQ(440531800u, crc_b);
+}
+
+// Check ceph_crc32c() on a 4096000-byte buffer filled with the byte value 1,
+// with a zero and a non-zero initial CRC.
+TEST(Crc32c, Big) {
+  int len = 4096000;
+  char *a = (char *)malloc(len);
+  ASSERT_TRUE(a != NULL);
+  memset(a, 1, len);
+
+  // Compute first and free before asserting, so a failed assertion (which
+  // returns from the test body) does not leak the buffer.
+  unsigned crc_seed_0 = ceph_crc32c(0, (unsigned char *)a, len);
+  unsigned crc_seed_1234 = ceph_crc32c(1234, (unsigned char *)a, len);
+  free(a);
+
+  ASSERT_EQ(31583199u, crc_seed_0);
+  ASSERT_EQ(1400919119u, crc_seed_1234);
+}
+
+// Time several CRC32C entry points over the same 1000 MiB buffer, print the
+// throughput of each, and check that each returns the expected checksum.
+//
+// The checks use EXPECT_EQ rather than ASSERT_EQ so that a mismatch still
+// fails the test but does not return early: the remaining measurements are
+// printed and the buffer is released on every path.
+TEST(Crc32c, Performance) {
+  int len = 1000 * 1024 * 1024;
+  char *a = (char *)malloc(len);
+  // An allocation this large can fail; report it instead of dereferencing NULL.
+  ASSERT_TRUE(a != NULL);
+  std::cout << "populating large buffer" << std::endl;
+  for (int i=0; i<len; i++)
+    a[i] = i & 0xff;
+  std::cout << "calculating crc" << std::endl;
+
+  {
+    utime_t start = ceph_clock_now(NULL);
+    unsigned val = ceph_crc32c(0, (unsigned char *)a, len);
+    utime_t end = ceph_clock_now(NULL);
+    float rate = (float)len / (float)(1024*1024) / (float)(end - start);
+    std::cout << "best choice = " << rate << " MB/sec" << std::endl;
+    EXPECT_EQ(261108528u, val);
+  }
+  {
+    utime_t start = ceph_clock_now(NULL);
+    unsigned val = ceph_crc32c(0xffffffff, (unsigned char *)a, len);
+    utime_t end = ceph_clock_now(NULL);
+    float rate = (float)len / (float)(1024*1024) / (float)(end - start);
+    std::cout << "best choice 0xffffffff = " << rate << " MB/sec" << std::endl;
+    EXPECT_EQ(3895876243u, val);
+  }
+  {
+    utime_t start = ceph_clock_now(NULL);
+    unsigned val = ceph_crc32c_sctp(0, (unsigned char *)a, len);
+    utime_t end = ceph_clock_now(NULL);
+    float rate = (float)len / (float)(1024*1024) / (float)(end - start);
+    std::cout << "sctp = " << rate << " MB/sec" << std::endl;
+    EXPECT_EQ(261108528u, val);
+  }
+  {
+    utime_t start = ceph_clock_now(NULL);
+    unsigned val = ceph_crc32c_intel_baseline(0, (unsigned char *)a, len);
+    utime_t end = ceph_clock_now(NULL);
+    float rate = (float)len / (float)(1024*1024) / (float)(end - start);
+    std::cout << "intel baseline = " << rate << " MB/sec" << std::endl;
+    EXPECT_EQ(261108528u, val);
+  }
+
+  free(a);
+}
diff --git a/src/test/common/test_sharedptr_registry.cc b/src/test/common/test_sharedptr_registry.cc
index aec2107c9e5..b1713a9bd9f 100644
--- a/src/test/common/test_sharedptr_registry.cc
+++ b/src/test/common/test_sharedptr_registry.cc
@@ -137,8 +137,8 @@ TEST_F(SharedPtrRegistry_all, wait_lookup_or_create) {
EXPECT_TRUE(registry.lookup_or_create(key + 12345));
registry.remove(key);
ASSERT_TRUE(wait_for(registry, 0));
- EXPECT_TRUE(t.ptr);
t.join();
+ EXPECT_TRUE(t.ptr);
}
{
unsigned int key = 2;
@@ -163,9 +163,9 @@ TEST_F(SharedPtrRegistry_all, wait_lookup_or_create) {
}
registry.remove(key);
ASSERT_TRUE(wait_for(registry, 0));
+ t.join();
EXPECT_TRUE(t.ptr);
EXPECT_EQ(value, *t.ptr);
- t.join();
}
}
@@ -200,8 +200,8 @@ TEST_F(SharedPtrRegistry_all, wait_lookup) {
EXPECT_FALSE(registry.lookup(key + 12345));
registry.remove(key);
ASSERT_TRUE(wait_for(registry, 0));
- EXPECT_FALSE(t.ptr);
t.join();
+ EXPECT_FALSE(t.ptr);
}
TEST_F(SharedPtrRegistry_all, get_next) {
@@ -238,6 +238,24 @@ TEST_F(SharedPtrRegistry_all, get_next) {
EXPECT_FALSE(registry.get_next(i.first, &i));
}
+ {
+ //
+ // http://tracker.ceph.com/issues/6117
+ // reproduce the issue.
+ //
+ SharedPtrRegistryTest registry;
+ const unsigned int key1 = 111;
+ shared_ptr<int> *ptr1 = new shared_ptr<int>(registry.lookup_or_create(key1));
+ const unsigned int key2 = 222;
+ shared_ptr<int> ptr2 = registry.lookup_or_create(key2);
+
+ pair<unsigned int, shared_ptr<int> > i;
+ EXPECT_TRUE(registry.get_next(i.first, &i));
+ EXPECT_EQ(key1, i.first);
+ delete ptr1;
+ EXPECT_TRUE(registry.get_next(i.first, &i));
+ EXPECT_EQ(key2, i.first);
+ }
}
class SharedPtrRegistry_destructor : public ::testing::Test {
diff --git a/src/test/common/test_sloppy_crc_map.cc b/src/test/common/test_sloppy_crc_map.cc
new file mode 100644
index 00000000000..2650f4f960d
--- /dev/null
+++ b/src/test/common/test_sloppy_crc_map.cc
@@ -0,0 +1,113 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/SloppyCRCMap.h"
+#include "common/Formatter.h"
+#include <gtest/gtest.h>
+
+// Debugging aid: write `scm` to cout as pretty-printed JSON, wrapped in an
+// object section named "map".
+void dump(const SloppyCRCMap& scm)
+{
+  Formatter *formatter = new_formatter("json-pretty");
+
+  formatter->open_object_section("map");
+  scm.dump(formatter);
+  formatter->close_section();
+
+  formatter->flush(cout);
+  delete formatter;
+}
+
+// Write a buffer and check it reads back with result 0; then overwrite part
+// of the range with different data and check that the new data reads back
+// with 0 while the original buffer now yields 1.
+//
+// NOTE(review): the constructor argument 4 is presumably the CRC block size
+// in bytes, and read() presumably returns the number of mismatching blocks --
+// confirm both against common/SloppyCRCMap.h.
+TEST(SloppyCRCMap, basic) {
+ SloppyCRCMap scm(4);
+
+ bufferlist a, b;
+ a.append("The quick brown fox jumped over a fence whose color I forget.");
+ b.append("asdf");
+
+ scm.write(0, a.length(), a);
+ // Disabled debugging hook; change the condition to 1 to print the map.
+ if (0)
+ dump(scm);
+ ASSERT_EQ(0, scm.read(0, a.length(), a, &cout));
+
+ // Overwrite 4 bytes at offset 12 with different content.
+ scm.write(12, b.length(), b);
+ if (0)
+ dump(scm);
+
+ ASSERT_EQ(0, scm.read(12, b.length(), b, &cout));
+ ASSERT_EQ(1, scm.read(0, a.length(), a, &cout));
+}
+
+// Check that after truncate(4), a read at offset 4 that previously yielded 1
+// for non-matching data yields 0.
+//
+// NOTE(review): presumably truncate() discards the CRC entries at and beyond
+// the given offset and read() does not flag ranges it has no entry for --
+// confirm against common/SloppyCRCMap.h.
+TEST(SloppyCRCMap, truncate) {
+ SloppyCRCMap scm(4);
+
+ bufferlist a, b;
+ a.append("asdf");
+ b.append("qwer");
+
+ // Record the same 4-byte content at offsets 0 and 4.
+ scm.write(0, a.length(), a);
+ scm.write(4, a.length(), a);
+ ASSERT_EQ(0, scm.read(4, 4, a, &cout));
+ ASSERT_EQ(1, scm.read(4, 4, b, &cout));
+ scm.truncate(4);
+ // The buffer that yielded 1 just above now yields 0.
+ ASSERT_EQ(0, scm.read(4, 4, b, &cout));
+}
+
+// Check that after zero(offset, len) only an all-zero buffer reads back with
+// result 0 for the zeroed range, while the previously written data yields 1.
+//
+// NOTE(review): presumably zero() records the CRC of zero-filled blocks for
+// the range -- confirm against common/SloppyCRCMap.h.
+TEST(SloppyCRCMap, zero) {
+ SloppyCRCMap scm(4);
+
+ bufferlist a, b;
+ a.append("asdf");
+ b.append("qwer");
+
+ // Record the same 4-byte content at offsets 0 and 4.
+ scm.write(0, a.length(), a);
+ scm.write(4, a.length(), a);
+ ASSERT_EQ(0, scm.read(4, 4, a, &cout));
+ ASSERT_EQ(1, scm.read(4, 4, b, &cout));
+ scm.zero(4, 4);
+ // Neither the old content nor the unrelated content matches any more.
+ ASSERT_EQ(1, scm.read(4, 4, a, &cout));
+ ASSERT_EQ(1, scm.read(4, 4, b, &cout));
+
+ // c holds four zero bytes.
+ bufferptr bp(4);
+ bp.zero();
+ bufferlist c;
+ c.append(bp);
+ // Offset 0 was not zeroed and still matches a; offset 4 matches c.
+ ASSERT_EQ(0, scm.read(0, 4, a, &cout));
+ ASSERT_EQ(0, scm.read(4, 4, c, &cout));
+ // Zero a range starting at 0 whose length (15) is not a multiple of 4.
+ scm.zero(0, 15);
+ ASSERT_EQ(1, scm.read(0, 4, a, &cout));
+ ASSERT_EQ(0, scm.read(0, 4, c, &cout));
+}
+
+// Exercise clone_range() from one map into another and check, via read(),
+// which buffers the destination accepts before and after each clone.
+//
+// NOTE(review): the argument order of clone_range() appears to be
+// (source offset, length, destination offset, source map), judging from the
+// calls below -- confirm against common/SloppyCRCMap.h. The nonzero read()
+// results (1 or 2) are presumably counts of mismatching 4-byte blocks.
+TEST(SloppyCRCMap, clone_range) {
+ SloppyCRCMap src(4);
+ SloppyCRCMap dst(4);
+
+ bufferlist a, b;
+ // Note: a is 9 bytes long while b is 8 bytes long.
+ a.append("asdfghjkl");
+ b.append("qwertyui");
+
+ // Successive writes of a start 8 bytes apart, so each overlaps the last
+ // byte of the previous one.
+ src.write(0, a.length(), a);
+ src.write(8, a.length(), a);
+ src.write(16, a.length(), a);
+
+ dst.write(0, b.length(), b);
+ dst.clone_range(0, 8, 0, src);
+ ASSERT_EQ(2, dst.read(0, 8, b, &cout));
+ ASSERT_EQ(0, dst.read(8, 8, b, &cout));
+
+ dst.write(16, b.length(), b);
+ ASSERT_EQ(2, dst.read(16, 8, a, &cout));
+ dst.clone_range(16, 8, 16, src);
+ ASSERT_EQ(0, dst.read(16, 8, a, &cout));
+
+ // Clone again, this time with a third argument (2) that is not a multiple
+ // of 4.
+ dst.write(16, b.length(), b);
+ ASSERT_EQ(1, dst.read(16, 4, a, &cout));
+ dst.clone_range(16, 8, 2, src);
+ ASSERT_EQ(0, dst.read(16, 4, a, &cout));
+
+ // Clone with a first argument (2) that is not a multiple of 4.
+ dst.write(0, b.length(), b);
+ dst.write(8, b.length(), b);
+ ASSERT_EQ(2, dst.read(0, 8, a, &cout));
+ ASSERT_EQ(2, dst.read(8, 8, a, &cout));
+ dst.clone_range(2, 8, 0, src);
+ ASSERT_EQ(0, dst.read(0, 8, a, &cout));
+ ASSERT_EQ(0, dst.read(8, 4, a, &cout));
+}
diff --git a/src/test/common/test_util.cc b/src/test/common/test_util.cc
index 16713077cfc..cb22047c600 100644
--- a/src/test/common/test_util.cc
+++ b/src/test/common/test_util.cc
@@ -21,6 +21,7 @@ TEST(util, unit_to_bytesize)
{
ASSERT_EQ(1234ll, unit_to_bytesize("1234", &cerr));
ASSERT_EQ(1024ll, unit_to_bytesize("1K", &cerr));
+ ASSERT_EQ(1024ll, unit_to_bytesize("1k", &cerr));
ASSERT_EQ(1048576ll, unit_to_bytesize("1M", &cerr));
ASSERT_EQ(1073741824ll, unit_to_bytesize("1G", &cerr));
ASSERT_EQ(1099511627776ll, unit_to_bytesize("1T", &cerr));
diff --git a/src/test/encoding/ceph_dencoder.cc b/src/test/encoding/ceph_dencoder.cc
index 81abcd1de9e..dbed6f524d8 100644
--- a/src/test/encoding/ceph_dencoder.cc
+++ b/src/test/encoding/ceph_dencoder.cc
@@ -93,7 +93,7 @@ public:
// allow 0- or 1-based (by wrapping)
if (i == 0)
i = m_list.size();
- if (i > m_list.size())
+ if ((i == 0) || (i > m_list.size()))
return "invalid id for generated object";
typename list<T*>::iterator p = m_list.begin();
for (i--; i > 0 && p != m_list.end(); ++p, --i) ;
@@ -177,7 +177,7 @@ public:
// allow 0- or 1-based (by wrapping)
if (i == 0)
i = m_list.size();
- if (i > m_list.size())
+ if ((i == 0) || (i > m_list.size()))
return "invalid id for generated object";
typename list<T*>::iterator p = m_list.begin();
for (i--; i > 0 && p != m_list.end(); ++p, --i) ;
diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h
index 213da6fcccc..18ed795c3ef 100644
--- a/src/test/encoding/types.h
+++ b/src/test/encoding/types.h
@@ -4,6 +4,10 @@ TYPE(CompatSet)
#include "include/filepath.h"
TYPE(filepath)
+#include "common/bloom_filter.hpp"
+TYPE(bloom_filter)
+TYPE(compressible_bloom_filter)
+
#include "common/snap_types.h"
TYPE(SnapContext)
TYPE(SnapRealmInfo)
@@ -16,6 +20,9 @@ TYPE(LogEntryKey)
TYPE(LogEntry)
TYPE(LogSummary)
+#include "common/SloppyCRCMap.h"
+TYPE(SloppyCRCMap)
+
#include "msg/msg_types.h"
TYPE(entity_name_t)
TYPE(entity_addr_t)
@@ -29,12 +36,15 @@ TYPEWITHSTRAYDATA(OSDMap::Incremental)
#include "crush/CrushWrapper.h"
TYPE(CrushWrapper)
+#include "include/histogram.h"
+TYPE(pow2_hist_t)
+
#include "osd/osd_types.h"
TYPE(osd_reqid_t)
TYPE(object_locator_t)
+TYPE(request_redirect_t)
TYPE(pg_t)
TYPE(coll_t)
-TYPE(pow2_hist_t)
TYPE(filestore_perf_stat_t)
TYPE(osd_stat_t)
TYPE(OSDSuperblock)
@@ -53,6 +63,7 @@ TYPE(pg_log_t)
TYPE(pg_missing_t::item)
TYPE(pg_missing_t)
TYPE(pg_ls_response_t)
+TYPE(object_copy_cursor_t)
TYPE(pg_create_t)
TYPE(watch_info_t)
TYPE(object_info_t)
@@ -74,8 +85,9 @@ TYPE(ObjectStore::Transaction)
#include "os/SequencerPosition.h"
TYPE(SequencerPosition)
-#include "os/hobject.h"
+#include "common/hobject.h"
TYPE(hobject_t)
+TYPE(ghobject_t)
#include "mon/AuthMonitor.h"
TYPE(AuthMonitor::Incremental)
diff --git a/src/test/filestore/FileStoreDiff.cc b/src/test/filestore/FileStoreDiff.cc
index b2419f5e298..40c0b32d30c 100644
--- a/src/test/filestore/FileStoreDiff.cc
+++ b/src/test/filestore/FileStoreDiff.cc
@@ -131,7 +131,7 @@ bool FileStoreDiff::diff_objects(FileStore *a_store, FileStore *b_store, coll_t
bool ret = false;
int err;
- std::vector<hobject_t> b_objects, a_objects;
+ std::vector<ghobject_t> b_objects, a_objects;
err = b_store->collection_list(coll, b_objects);
if (err < 0) {
dout(0) << "diff_objects list on verify coll " << coll.to_str()
@@ -151,11 +151,11 @@ bool FileStoreDiff::diff_objects(FileStore *a_store, FileStore *b_store, coll_t
ret = true;
}
- std::vector<hobject_t>::iterator b_it = b_objects.begin();
- std::vector<hobject_t>::iterator a_it = b_objects.begin();
+ std::vector<ghobject_t>::iterator b_it = b_objects.begin();
+ std::vector<ghobject_t>::iterator a_it = b_objects.begin();
for (; b_it != b_objects.end(); ++b_it, ++a_it) {
- hobject_t b_obj = *b_it, a_obj = *a_it;
- if (b_obj.oid.name != a_obj.oid.name) {
+ ghobject_t b_obj = *b_it, a_obj = *a_it;
+ if (b_obj.hobj.oid.name != a_obj.hobj.oid.name) {
dout(0) << "diff_objects name mismatch on A object "
<< coll << "/" << a_obj << " and B object "
<< coll << "/" << b_obj << dendl;
@@ -167,7 +167,7 @@ bool FileStoreDiff::diff_objects(FileStore *a_store, FileStore *b_store, coll_t
err = b_store->stat(coll, b_obj, &b_stat);
if (err < 0) {
dout(0) << "diff_objects error stating B object "
- << coll.to_str() << "/" << b_obj.oid.name << dendl;
+ << coll.to_str() << "/" << b_obj.hobj.oid.name << dendl;
ret = true;
}
err = a_store->stat(coll, a_obj, &a_stat);
diff --git a/src/test/filestore/run_seed_to_range.sh b/src/test/filestore/run_seed_to_range.sh
index c5b399d7aae..365b34918d2 100755
--- a/src/test/filestore/run_seed_to_range.sh
+++ b/src/test/filestore/run_seed_to_range.sh
@@ -12,7 +12,7 @@ mydir=`dirname $0`
for f in `seq $from $to`
do
if ! $mydir/run_seed_to.sh $seed $f; then
- if -d $dir; then
+ if [ -d $dir ]; then
echo copying evidence to $dir
cp -a . $dir
else
diff --git a/src/test/filestore/store_test.cc b/src/test/filestore/store_test.cc
index 80c775052ec..50450f467ff 100644
--- a/src/test/filestore/store_test.cc
+++ b/src/test/filestore/store_test.cc
@@ -51,9 +51,9 @@ public:
}
};
-bool sorted(const vector<hobject_t> &in) {
- hobject_t start;
- for (vector<hobject_t>::const_iterator i = in.begin();
+bool sorted(const vector<ghobject_t> &in) {
+ ghobject_t start;
+ for (vector<ghobject_t>::const_iterator i = in.begin();
i != in.end();
++i) {
if (start > *i) return false;
@@ -105,7 +105,7 @@ TEST_F(StoreTest, SimpleObjectTest) {
r = store->apply_transaction(t);
ASSERT_EQ(r, 0);
}
- hobject_t hoid(sobject_t("Object 1", CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
{
ObjectStore::Transaction t;
t.touch(cid, hoid);
@@ -133,7 +133,7 @@ TEST_F(StoreTest, SimpleObjectLongnameTest) {
r = store->apply_transaction(t);
ASSERT_EQ(r, 0);
}
- hobject_t hoid(sobject_t("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaObjectaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa 1", CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaObjectaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa 1", CEPH_NOSNAP)));
{
ObjectStore::Transaction t;
t.touch(cid, hoid);
@@ -157,7 +157,7 @@ TEST_F(StoreTest, ManyObjectTest) {
coll_t cid("blah");
string base = "";
for (int i = 0; i < 100; ++i) base.append("aaaaa");
- set<hobject_t> created;
+ set<ghobject_t> created;
{
ObjectStore::Transaction t;
t.create_collection(cid);
@@ -171,27 +171,27 @@ TEST_F(StoreTest, ManyObjectTest) {
ObjectStore::Transaction t;
char buf[100];
snprintf(buf, sizeof(buf), "%d", i);
- hobject_t hoid(sobject_t(string(buf) + base, CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t(string(buf) + base, CEPH_NOSNAP)));
t.touch(cid, hoid);
created.insert(hoid);
r = store->apply_transaction(t);
ASSERT_EQ(r, 0);
}
- for (set<hobject_t>::iterator i = created.begin();
+ for (set<ghobject_t>::iterator i = created.begin();
i != created.end();
++i) {
struct stat buf;
ASSERT_TRUE(!store->stat(cid, *i, &buf));
}
- set<hobject_t> listed;
- vector<hobject_t> objects;
+ set<ghobject_t> listed;
+ vector<ghobject_t> objects;
r = store->collection_list(cid, objects);
ASSERT_EQ(r, 0);
cerr << "objects.size() is " << objects.size() << std::endl;
- for (vector<hobject_t> ::iterator i = objects.begin();
+ for (vector<ghobject_t> ::iterator i = objects.begin();
i != objects.end();
++i) {
listed.insert(*i);
@@ -199,11 +199,11 @@ TEST_F(StoreTest, ManyObjectTest) {
}
ASSERT_TRUE(listed.size() == created.size());
- hobject_t start, next;
+ ghobject_t start, next;
objects.clear();
r = store->collection_list_partial(
cid,
- hobject_t::get_max(),
+ ghobject_t::get_max(),
50,
60,
0,
@@ -234,13 +234,13 @@ TEST_F(StoreTest, ManyObjectTest) {
}
cerr << "listed.size() is " << listed.size() << std::endl;
ASSERT_TRUE(listed.size() == created.size());
- for (set<hobject_t>::iterator i = listed.begin();
+ for (set<ghobject_t>::iterator i = listed.begin();
i != listed.end();
++i) {
ASSERT_TRUE(created.count(*i));
}
- for (set<hobject_t>::iterator i = created.begin();
+ for (set<ghobject_t>::iterator i = created.begin();
i != created.end();
++i) {
ObjectStore::Transaction t;
@@ -259,7 +259,7 @@ TEST_F(StoreTest, ManyObjectTest) {
class ObjectGenerator {
public:
- virtual hobject_t create_object(gen_type *gen) = 0;
+ virtual ghobject_t create_object(gen_type *gen) = 0;
virtual ~ObjectGenerator() {}
};
@@ -267,7 +267,7 @@ class MixedGenerator : public ObjectGenerator {
public:
unsigned seq;
MixedGenerator() : seq(0) {}
- hobject_t create_object(gen_type *gen) {
+ ghobject_t create_object(gen_type *gen) {
char buf[100];
snprintf(buf, sizeof(buf), "%u", seq);
@@ -283,7 +283,7 @@ public:
// hash
//boost::binomial_distribution<uint32_t> bin(0xFFFFFF, 0.5);
++seq;
- return hobject_t(name, string(), rand() & 2 ? CEPH_NOSNAP : rand(), rand() & 0xFF, 0, "");
+ return ghobject_t(hobject_t(name, string(), rand() & 2 ? CEPH_NOSNAP : rand(), rand() & 0xFF, 0, ""));
}
};
@@ -293,8 +293,8 @@ public:
static const unsigned max_objects = 3000;
coll_t cid;
unsigned in_flight;
- set<hobject_t> available_objects;
- set<hobject_t> in_use_objects;
+ set<ghobject_t> available_objects;
+ set<ghobject_t> in_use_objects;
ObjectGenerator *object_gen;
gen_type *rng;
ObjectStore *store;
@@ -307,9 +307,9 @@ public:
public:
SyntheticWorkloadState *state;
ObjectStore::Transaction *t;
- hobject_t hoid;
+ ghobject_t hoid;
C_SyntheticOnReadable(SyntheticWorkloadState *state,
- ObjectStore::Transaction *t, hobject_t hoid)
+ ObjectStore::Transaction *t, ghobject_t hoid)
: state(state), t(t), hoid(hoid) {}
void finish(int r) {
@@ -339,14 +339,14 @@ public:
return store->apply_transaction(t);
}
- hobject_t get_uniform_random_object() {
+ ghobject_t get_uniform_random_object() {
while (in_flight >= max_in_flight || available_objects.empty())
cond.Wait(lock);
boost::uniform_int<> choose(0, available_objects.size() - 1);
int index = choose(*rng);
- set<hobject_t>::iterator i = available_objects.begin();
+ set<ghobject_t>::iterator i = available_objects.begin();
for ( ; index > 0; --index, ++i) ;
- hobject_t ret = *i;
+ ghobject_t ret = *i;
available_objects.erase(i);
return ret;
}
@@ -375,7 +375,7 @@ public:
if (!can_create())
return -ENOSPC;
wait_for_ready();
- hobject_t new_obj = object_gen->create_object(rng);
+ ghobject_t new_obj = object_gen->create_object(rng);
in_use_objects.insert(new_obj);
available_objects.erase(new_obj);
ObjectStore::Transaction *t = new ObjectStore::Transaction;
@@ -388,9 +388,9 @@ public:
Mutex::Locker locker(lock);
while (in_flight)
cond.Wait(lock);
- vector<hobject_t> objects;
- set<hobject_t> objects_set, objects_set2;
- hobject_t next, current;
+ vector<ghobject_t> objects;
+ set<ghobject_t> objects_set, objects_set2;
+ ghobject_t next, current;
while (1) {
cerr << "scanning..." << std::endl;
int r = store->collection_list_partial(cid, current, 50, 100,
@@ -403,7 +403,7 @@ public:
current = next;
}
ASSERT_EQ(objects_set.size(), available_objects.size());
- for (set<hobject_t>::iterator i = objects_set.begin();
+ for (set<ghobject_t>::iterator i = objects_set.begin();
i != objects_set.end();
++i) {
ASSERT_GT(available_objects.count(*i), (unsigned)0);
@@ -413,7 +413,7 @@ public:
ASSERT_EQ(r, 0);
objects_set2.insert(objects.begin(), objects.end());
ASSERT_EQ(objects_set2.size(), available_objects.size());
- for (set<hobject_t>::iterator i = objects_set2.begin();
+ for (set<ghobject_t>::iterator i = objects_set2.begin();
i != objects_set2.end();
++i) {
ASSERT_GT(available_objects.count(*i), (unsigned)0);
@@ -421,7 +421,7 @@ public:
}
int stat() {
- hobject_t hoid;
+ ghobject_t hoid;
{
Mutex::Locker locker(lock);
if (!can_unlink())
@@ -446,7 +446,7 @@ public:
Mutex::Locker locker(lock);
if (!can_unlink())
return -ENOENT;
- hobject_t to_remove = get_uniform_random_object();
+ ghobject_t to_remove = get_uniform_random_object();
ObjectStore::Transaction *t = new ObjectStore::Transaction;
t->remove(cid, to_remove);
++in_flight;
@@ -505,7 +505,7 @@ TEST_F(StoreTest, HashCollisionTest) {
}
string base = "";
for (int i = 0; i < 100; ++i) base.append("aaaaa");
- set<hobject_t> created;
+ set<ghobject_t> created;
for (int n = 0; n < 10; ++n) {
char nbuf[100];
sprintf(nbuf, "n%d", n);
@@ -515,7 +515,7 @@ TEST_F(StoreTest, HashCollisionTest) {
if (!(i % 5)) {
cerr << "Object n" << n << " "<< i << std::endl;
}
- hobject_t hoid(string(buf) + base, string(), CEPH_NOSNAP, 0, 0, string(nbuf));
+ ghobject_t hoid(hobject_t(string(buf) + base, string(), CEPH_NOSNAP, 0, 0, string(nbuf)));
{
ObjectStore::Transaction t;
t.touch(cid, hoid);
@@ -525,21 +525,21 @@ TEST_F(StoreTest, HashCollisionTest) {
created.insert(hoid);
}
}
- vector<hobject_t> objects;
+ vector<ghobject_t> objects;
r = store->collection_list(cid, objects);
ASSERT_EQ(r, 0);
- set<hobject_t> listed(objects.begin(), objects.end());
+ set<ghobject_t> listed(objects.begin(), objects.end());
cerr << "listed.size() is " << listed.size() << " and created.size() is " << created.size() << std::endl;
ASSERT_TRUE(listed.size() == created.size());
objects.clear();
listed.clear();
- hobject_t current, next;
+ ghobject_t current, next;
while (1) {
r = store->collection_list_partial(cid, current, 50, 60,
0, &objects, &next);
ASSERT_EQ(r, 0);
ASSERT_TRUE(sorted(objects));
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
if (listed.count(*i))
@@ -555,13 +555,13 @@ TEST_F(StoreTest, HashCollisionTest) {
}
cerr << "listed.size() is " << listed.size() << std::endl;
ASSERT_TRUE(listed.size() == created.size());
- for (set<hobject_t>::iterator i = listed.begin();
+ for (set<ghobject_t>::iterator i = listed.begin();
i != listed.end();
++i) {
ASSERT_TRUE(created.count(*i));
}
- for (set<hobject_t>::iterator i = created.begin();
+ for (set<ghobject_t>::iterator i = created.begin();
i != created.end();
++i) {
ObjectStore::Transaction t;
@@ -576,7 +576,7 @@ TEST_F(StoreTest, HashCollisionTest) {
TEST_F(StoreTest, OMapTest) {
coll_t cid("blah");
- hobject_t hoid("tesomap", "", CEPH_NOSNAP, 0, 0, "");
+ ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, 0, ""));
int r;
{
ObjectStore::Transaction t;
@@ -672,7 +672,7 @@ TEST_F(StoreTest, OMapTest) {
TEST_F(StoreTest, XattrTest) {
coll_t cid("blah");
- hobject_t hoid("tesomap", "", CEPH_NOSNAP, 0, 0, "");
+ ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, 0, ""));
bufferlist big;
for (unsigned i = 0; i < 10000; ++i) {
big.append('\0');
@@ -769,12 +769,12 @@ void colsplittest(
for (uint32_t i = 0; i < 2*num_objects; ++i) {
stringstream objname;
objname << "obj" << i;
- t.touch(cid, hobject_t(
+ t.touch(cid, ghobject_t(hobject_t(
objname.str(),
"",
CEPH_NOSNAP,
i<<common_suffix_size,
- 0, ""));
+ 0, "")));
}
r = store->apply_transaction(t);
ASSERT_EQ(r, 0);
@@ -788,14 +788,14 @@ void colsplittest(
}
ObjectStore::Transaction t;
- vector<hobject_t> objects;
+ vector<ghobject_t> objects;
r = store->collection_list(cid, objects);
ASSERT_EQ(r, 0);
ASSERT_EQ(objects.size(), num_objects);
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
- ASSERT_EQ(!(i->hash & (1<<common_suffix_size)), 0u);
+ ASSERT_EQ(!(i->hobj.hash & (1<<common_suffix_size)), 0u);
t.remove(cid, *i);
}
@@ -803,10 +803,10 @@ void colsplittest(
r = store->collection_list(tid, objects);
ASSERT_EQ(r, 0);
ASSERT_EQ(objects.size(), num_objects);
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
- ASSERT_EQ(i->hash & (1<<common_suffix_size), 0u);
+ ASSERT_EQ(i->hobj.hash & (1<<common_suffix_size), 0u);
t.remove(tid, *i);
}
@@ -848,12 +848,12 @@ TEST_F(StoreTest, TwoHash) {
std::cout << "Making objects" << std::endl;
for (int i = 0; i < 360; ++i) {
ObjectStore::Transaction t;
- hobject_t o;
+ ghobject_t o;
if (i < 8) {
- o.hash = (i << 16) | 0xA1;
+ o.hobj.hash = (i << 16) | 0xA1;
t.touch(cid, o);
}
- o.hash = (i << 16) | 0xB1;
+ o.hobj.hash = (i << 16) | 0xB1;
t.touch(cid, o);
r = store->apply_transaction(t);
ASSERT_EQ(r, 0);
@@ -861,8 +861,8 @@ TEST_F(StoreTest, TwoHash) {
std::cout << "Removing half" << std::endl;
for (int i = 1; i < 8; ++i) {
ObjectStore::Transaction t;
- hobject_t o;
- o.hash = (i << 16) | 0xA1;
+ ghobject_t o;
+ o.hobj.hash = (i << 16) | 0xA1;
t.remove(cid, o);
r = store->apply_transaction(t);
ASSERT_EQ(r, 0);
@@ -870,24 +870,24 @@ TEST_F(StoreTest, TwoHash) {
std::cout << "Checking" << std::endl;
for (int i = 1; i < 8; ++i) {
ObjectStore::Transaction t;
- hobject_t o;
- o.hash = (i << 16) | 0xA1;
+ ghobject_t o;
+ o.hobj.hash = (i << 16) | 0xA1;
bool exists = store->exists(cid, o);
ASSERT_EQ(exists, false);
}
{
- hobject_t o;
- o.hash = 0xA1;
+ ghobject_t o;
+ o.hobj.hash = 0xA1;
bool exists = store->exists(cid, o);
ASSERT_EQ(exists, true);
}
std::cout << "Cleanup" << std::endl;
for (int i = 0; i < 360; ++i) {
ObjectStore::Transaction t;
- hobject_t o;
- o.hash = (i << 16) | 0xA1;
+ ghobject_t o;
+ o.hobj.hash = (i << 16) | 0xA1;
t.remove(cid, o);
- o.hash = (i << 16) | 0xB1;
+ o.hobj.hash = (i << 16) | 0xB1;
t.remove(cid, o);
r = store->apply_transaction(t);
ASSERT_EQ(r, 0);
@@ -898,6 +898,65 @@ TEST_F(StoreTest, TwoHash) {
ASSERT_EQ(r, 0);
}
+// Checks that collection_move_rename() moves an object from a temporary
+// collection onto a destination name, and that the moved object keeps its
+// data, its xattr and its omap entry.
+TEST_F(StoreTest, MoveRename) {
+ coll_t temp_cid("mytemp");
+ hobject_t temp_oid("tmp_oid", "", CEPH_NOSNAP, 0, 0, "");
+ coll_t cid("dest");
+ hobject_t oid("dest_oid", "", CEPH_NOSNAP, 0, 0, "");
+ int r;
+ // Create the destination collection with an (empty) object already in place.
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid);
+ t.touch(cid, oid);
+ r = store->apply_transaction(t);
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_TRUE(store->exists(cid, oid));
+ // Payloads written to the temporary object and compared after the move.
+ bufferlist data, attr;
+ map<string, bufferlist> omap;
+ data.append("data payload");
+ attr.append("attr value");
+ omap["omap_key"].append("omap value");
+ // Create the temporary collection and populate the source object with
+ // data, one xattr and one omap key.
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(temp_cid);
+ t.touch(temp_cid, temp_oid);
+ t.write(temp_cid, temp_oid, 0, data.length(), data);
+ t.setattr(temp_cid, temp_oid, "attr", attr);
+ t.omap_setkeys(temp_cid, temp_oid, omap);
+ r = store->apply_transaction(t);
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_TRUE(store->exists(temp_cid, temp_oid));
+ // In a single transaction: remove the existing destination object, then
+ // move/rename the temporary object onto the destination name.
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, oid);
+ t.collection_move_rename(temp_cid, temp_oid, cid, oid);
+ r = store->apply_transaction(t);
+ ASSERT_EQ(r, 0);
+ }
+ // The object must now exist only under the destination name.
+ ASSERT_TRUE(store->exists(cid, oid));
+ ASSERT_FALSE(store->exists(temp_cid, temp_oid));
+ // Read back data, xattr and omap from the destination and compare them
+ // with what was written to the temporary object.
+ {
+ bufferlist newdata;
+ r = store->read(cid, oid, 0, 1000, newdata);
+ ASSERT_GE(r, 0);
+ ASSERT_TRUE(newdata.contents_equal(data));
+ bufferlist newattr;
+ r = store->getattr(cid, oid, "attr", newattr);
+ ASSERT_GE(r, 0);
+ ASSERT_TRUE(newattr.contents_equal(attr));
+ set<string> keys;
+ keys.insert("omap_key");
+ map<string, bufferlist> newomap;
+ r = store->omap_get_values(cid, oid, keys, &newomap);
+ ASSERT_GE(r, 0);
+ ASSERT_EQ(1u, newomap.size());
+ ASSERT_TRUE(newomap.count("omap_key"));
+ ASSERT_TRUE(newomap["omap_key"].contents_equal(omap["omap_key"]));
+ }
+}
+
//
// support tests for qa/workunits/filestore/filestore.sh
//
diff --git a/src/test/filestore/workload_generator.cc b/src/test/filestore/workload_generator.cc
index 496379d7ad1..704d93021e2 100644
--- a/src/test/filestore/workload_generator.cc
+++ b/src/test/filestore/workload_generator.cc
@@ -344,12 +344,12 @@ void WorkloadGenerator::do_destroy_collection(ObjectStore::Transaction *t,
{
m_nr_runs.set(0);
entry->m_osr.flush();
- vector<hobject_t> ls;
+ vector<ghobject_t> ls;
m_store->collection_list(entry->m_coll, ls);
dout(2) << __func__ << " coll " << entry->m_coll
<< " (" << ls.size() << " objects)" << dendl;
- for (vector<hobject_t>::iterator it = ls.begin(); it < ls.end(); ++it) {
+ for (vector<ghobject_t>::iterator it = ls.begin(); it < ls.end(); ++it) {
t->remove(entry->m_coll, *it);
}
diff --git a/src/fooclass.cc b/src/test/fooclass.cc
index 2db2d815bb0..2db2d815bb0 100644
--- a/src/fooclass.cc
+++ b/src/test/fooclass.cc
diff --git a/src/test/libcephfs/caps.cc b/src/test/libcephfs/caps.cc
index 96f1a90024d..9fa92cf29dc 100644
--- a/src/test/libcephfs/caps.cc
+++ b/src/test/libcephfs/caps.cc
@@ -11,11 +11,11 @@
* Foundation. See file COPYING.
*
*/
+#include "include/int_types.h"
#include "gtest/gtest.h"
#include "include/cephfs/libcephfs.h"
#include <linux/types.h>
-#include <inttypes.h>
#include "include/ceph_fs.h"
#include <errno.h>
#include <sys/fcntl.h>
diff --git a/src/test/librados/misc.cc b/src/test/librados/misc.cc
index 6cb7cf5452a..9abac9c412a 100644
--- a/src/test/librados/misc.cc
+++ b/src/test/librados/misc.cc
@@ -538,21 +538,25 @@ TEST(LibRadosMisc, BigAttrPP) {
bufferlist got;
- bl.clear();
- got.clear();
- bl.append(buffer::create(g_conf->osd_max_attr_size));
- ASSERT_EQ(0, ioctx.setxattr("foo", "one", bl));
- ASSERT_EQ((int)bl.length(), ioctx.getxattr("foo", "one", got));
- ASSERT_TRUE(bl.contents_equal(got));
+ if (g_conf->osd_max_attr_size) {
+ bl.clear();
+ got.clear();
+ bl.append(buffer::create(g_conf->osd_max_attr_size));
+ ASSERT_EQ(0, ioctx.setxattr("foo", "one", bl));
+ ASSERT_EQ((int)bl.length(), ioctx.getxattr("foo", "one", got));
+ ASSERT_TRUE(bl.contents_equal(got));
- bl.clear();
- bl.append(buffer::create(g_conf->osd_max_attr_size+1));
- ASSERT_EQ(-EFBIG, ioctx.setxattr("foo", "one", bl));
+ bl.clear();
+ bl.append(buffer::create(g_conf->osd_max_attr_size+1));
+ ASSERT_EQ(-EFBIG, ioctx.setxattr("foo", "one", bl));
+ } else {
+ cout << "osd_max_attr_size == 0; skipping test" << std::endl;
+ }
for (int i=0; i<1000; i++) {
bl.clear();
got.clear();
- bl.append(buffer::create(g_conf->osd_max_attr_size));
+ bl.append(buffer::create(MIN(g_conf->osd_max_attr_size, 1024)));
char n[10];
snprintf(n, sizeof(n), "a%d", i);
ASSERT_EQ(0, ioctx.setxattr("foo", n, bl));
@@ -564,6 +568,139 @@ TEST(LibRadosMisc, BigAttrPP) {
ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
}
+// Exercises ObjectWriteOperation::copy_from() on a small object and on an
+// object larger than osd_copyfrom_max_chunk, each time with an explicit
+// source version and with version 0.  Every copy is checked for both its
+// data and its "myattr" xattr.
+TEST(LibRadosMisc, CopyPP) {
+  Rados cluster;
+  std::string pool_name = get_temp_pool_name();
+  ASSERT_EQ("", create_one_pool_pp(pool_name, cluster));
+  IoCtx ioctx;
+  ASSERT_EQ(0, cluster.ioctx_create(pool_name.c_str(), ioctx));
+
+  bufferlist bl, x;
+  bl.append("hi there");
+  x.append("bar");
+
+  // small object
+  // blc/xc are copies handed to the write calls; bl/x stay as the reference
+  // values for the comparisons below.
+  bufferlist blc = bl;
+  bufferlist xc = x;
+  ASSERT_EQ(0, ioctx.write_full("foo", blc));
+  ASSERT_EQ(0, ioctx.setxattr("foo", "myattr", xc));
+
+  // small object, copied at the version of the last write
+  {
+    ObjectWriteOperation op;
+    op.copy_from("foo", ioctx, ioctx.get_last_version());
+    ASSERT_EQ(0, ioctx.operate("foo.copy", &op));
+
+    bufferlist bl2, x2;
+    ASSERT_EQ((int)bl.length(), ioctx.read("foo.copy", bl2, 10000, 0));
+    ASSERT_TRUE(bl.contents_equal(bl2));
+    ASSERT_EQ((int)x.length(), ioctx.getxattr("foo.copy", "myattr", x2));
+    ASSERT_TRUE(x.contents_equal(x2));
+  }
+
+  // small object without a version
+  {
+    ObjectWriteOperation op;
+    op.copy_from("foo", ioctx, 0);
+    ASSERT_EQ(0, ioctx.operate("foo.copy2", &op));
+
+    bufferlist bl2, x2;
+    ASSERT_EQ((int)bl.length(), ioctx.read("foo.copy2", bl2, 10000, 0));
+    ASSERT_TRUE(bl.contents_equal(bl2));
+    ASSERT_EQ((int)x.length(), ioctx.getxattr("foo.copy2", "myattr", x2));
+    ASSERT_TRUE(x.contents_equal(x2));
+  }
+
+  // do a big object
+  // The payload is three times osd_copyfrom_max_chunk (plus head and tail),
+  // so it is larger than a single copy chunk.
+  bl.append(buffer::create(g_conf->osd_copyfrom_max_chunk * 3));
+  bl.zero();
+  bl.append("tail");
+  blc = bl;
+  xc = x;
+  ASSERT_EQ(0, ioctx.write_full("big", blc));
+  ASSERT_EQ(0, ioctx.setxattr("big", "myattr", xc));
+
+  // big object, copied at the version of the last write
+  {
+    ObjectWriteOperation op;
+    op.copy_from("big", ioctx, ioctx.get_last_version());
+    ASSERT_EQ(0, ioctx.operate("big.copy", &op));
+
+    bufferlist bl2, x2;
+    ASSERT_EQ((int)bl.length(), ioctx.read("big.copy", bl2, bl.length(), 0));
+    ASSERT_TRUE(bl.contents_equal(bl2));
+    // Check the xattr of the copy made in this scope ("big.copy"), not the
+    // one of the earlier small-object copy.
+    ASSERT_EQ((int)x.length(), ioctx.getxattr("big.copy", "myattr", x2));
+    ASSERT_TRUE(x.contents_equal(x2));
+  }
+
+  // big object without a version
+  {
+    ObjectWriteOperation op;
+    op.copy_from("big", ioctx, 0);
+    ASSERT_EQ(0, ioctx.operate("big.copy2", &op));
+
+    bufferlist bl2, x2;
+    ASSERT_EQ((int)bl.length(), ioctx.read("big.copy2", bl2, bl.length(), 0));
+    ASSERT_TRUE(bl.contents_equal(bl2));
+    // Same as above: verify the xattr on "big.copy2" itself.
+    ASSERT_EQ((int)x.length(), ioctx.getxattr("big.copy2", "myattr", x2));
+    ASSERT_TRUE(x.contents_equal(x2));
+  }
+
+  ioctx.close();
+  ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
+}
+
+// Follows the dirty flag reported by is_dirty() across three steps:
+// after creating the object, after undirty(), and after a further write.
+TEST(LibRadosMisc, Dirty) {
+ Rados cluster;
+ std::string pool_name = get_temp_pool_name();
+ ASSERT_EQ("", create_one_pool_pp(pool_name, cluster));
+ IoCtx ioctx;
+ ASSERT_EQ(0, cluster.ioctx_create(pool_name.c_str(), ioctx));
+
+ // Step 1: create the object; it is expected to be reported dirty.
+ {
+ ObjectWriteOperation op;
+ op.create(true);
+ ASSERT_EQ(0, ioctx.operate("foo", &op));
+ }
+ {
+ // dirty and r start with the opposite of the expected values so the
+ // assertions fail if the operation does not fill them in.
+ bool dirty = false;
+ int r = -1;
+ ObjectReadOperation op;
+ op.is_dirty(&dirty, &r);
+ ASSERT_EQ(0, ioctx.operate("foo", &op, NULL));
+ ASSERT_TRUE(dirty);
+ ASSERT_EQ(0, r);
+ }
+ // Step 2: undirty() must clear the flag.
+ {
+ ObjectWriteOperation op;
+ op.undirty();
+ ASSERT_EQ(0, ioctx.operate("foo", &op));
+ }
+ {
+ bool dirty = false;
+ int r = -1;
+ ObjectReadOperation op;
+ op.is_dirty(&dirty, &r);
+ ASSERT_EQ(0, ioctx.operate("foo", &op, NULL));
+ ASSERT_FALSE(dirty);
+ ASSERT_EQ(0, r);
+ }
+ // Step 3: any subsequent write must mark the object dirty again.
+ {
+ ObjectWriteOperation op;
+ op.truncate(0); // still a write even though it is a no-op
+ ASSERT_EQ(0, ioctx.operate("foo", &op));
+ }
+ {
+ bool dirty = false;
+ int r = -1;
+ ObjectReadOperation op;
+ op.is_dirty(&dirty, &r);
+ ASSERT_EQ(0, ioctx.operate("foo", &op, NULL));
+ ASSERT_TRUE(dirty);
+ ASSERT_EQ(0, r);
+ }
+
+ ioctx.close();
+ ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
+}
+
int main(int argc, char **argv)
{
::testing::InitGoogleTest(&argc, argv);
diff --git a/src/test/librbd/test_librbd.cc b/src/test/librbd/test_librbd.cc
index 562be6a6bcf..84bf3477aff 100644
--- a/src/test/librbd/test_librbd.cc
+++ b/src/test/librbd/test_librbd.cc
@@ -12,6 +12,7 @@
*
*/
+#include "include/int_types.h"
#include "include/rados/librados.h"
#include "include/rbd_types.h"
#include "include/rbd/librbd.h"
@@ -20,7 +21,6 @@
#include "gtest/gtest.h"
#include <errno.h>
-#include <inttypes.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
diff --git a/src/test/os/TestFlatIndex.cc b/src/test/os/TestFlatIndex.cc
index 6db4f6c4aa5..53d2bbe6376 100644
--- a/src/test/os/TestFlatIndex.cc
+++ b/src/test/os/TestFlatIndex.cc
@@ -49,8 +49,8 @@ TEST(FlatIndex, collection) {
uint64_t hash = 111;
uint64_t pool = 222;
const std::string object_name(10, 'A');
- hobject_t hoid(object_t(object_name), key, CEPH_NOSNAP, hash, pool, "");
- vector<hobject_t> ls;
+ ghobject_t hoid(hobject_t(object_t(object_name), key, CEPH_NOSNAP, hash, pool, ""));
+ vector<ghobject_t> ls;
ASSERT_DEATH(index.collection_list_partial(hoid, 0, 0, 0, &ls, &hoid), "0");
}
@@ -70,7 +70,7 @@ TEST(FlatIndex, created_unlink) {
CollectionIndex::IndexedPath indexed_path;
index->set_ref(index);
const std::string object_name(10, 'A');
- hobject_t hoid(object_t(object_name), key, CEPH_NOSNAP, hash, pool, "");
+ ghobject_t hoid(hobject_t(object_t(object_name), key, CEPH_NOSNAP, hash, pool, ""));
int exists;
EXPECT_EQ(0, index->lookup(hoid, &indexed_path, &exists));
EXPECT_EQ(0, exists);
@@ -88,7 +88,7 @@ TEST(FlatIndex, created_unlink) {
CollectionIndex::IndexedPath indexed_path;
index->set_ref(index);
const std::string object_name(1024, 'A');
- hobject_t hoid(object_t(object_name), key, CEPH_NOSNAP, hash, pool, "");
+ ghobject_t hoid(hobject_t(object_t(object_name), key, CEPH_NOSNAP, hash, pool, ""));
int exists;
EXPECT_EQ(0, index->lookup(hoid, &indexed_path, &exists));
EXPECT_EQ(0, exists);
@@ -110,10 +110,10 @@ TEST(FlatIndex, collection_list) {
const std::string filename("PATH/" + object_name + "_head");
EXPECT_EQ(0, ::close(::creat(filename.c_str(), 0600)));
std::tr1::shared_ptr<CollectionIndex> index(new FlatIndex(collection, base_path));
- vector<hobject_t> ls;
+ vector<ghobject_t> ls;
index->collection_list(&ls);
EXPECT_EQ((unsigned)1, ls.size());
- EXPECT_EQ(object_name, ls[0].oid.name);
+ EXPECT_EQ(object_name, ls[0].hobj.oid.name);
EXPECT_EQ(0, ::system("rm -fr PATH"));
}
diff --git a/src/test/os/TestLFNIndex.cc b/src/test/os/TestLFNIndex.cc
index 33dbfe532a9..02578eb4a71 100644
--- a/src/test/os/TestLFNIndex.cc
+++ b/src/test/os/TestLFNIndex.cc
@@ -45,10 +45,10 @@ public:
std::tr1::shared_ptr<CollectionIndex> dest
) { return 0; }
- void test_generate_and_parse(const hobject_t &hoid, const std::string &mangled_expected) {
+ void test_generate_and_parse(const ghobject_t &hoid, const std::string &mangled_expected) {
const std::string mangled_name = lfn_generate_object_name(hoid);
EXPECT_EQ(mangled_expected, mangled_name);
- hobject_t hoid_parsed;
+ ghobject_t hoid_parsed;
EXPECT_TRUE(lfn_parse_object_name(mangled_name, &hoid_parsed));
EXPECT_EQ(hoid, hoid_parsed);
}
@@ -58,34 +58,34 @@ protected:
virtual int _created(
const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &hoid,
const string &mangled_name
) { return 0; }
virtual int _remove(
const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &hoid,
const string &mangled_name
) { return 0; }
virtual int _lookup(
- const hobject_t &hoid,
+ const ghobject_t &hoid,
vector<string> *path,
string *mangled_name,
int *exists
) { return 0; }
virtual int _collection_list(
- vector<hobject_t> *ls
+ vector<ghobject_t> *ls
) { return 0; }
virtual int _collection_list_partial(
- const hobject_t &start,
+ const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next
+ vector<ghobject_t> *ls,
+ ghobject_t *next
) { return 0; }
};
@@ -101,9 +101,9 @@ TEST_F(TestHASH_INDEX_TAG, generate_and_parse_name) {
uint64_t hash = 0xABABABAB;
uint64_t pool = -1;
- test_generate_and_parse(hobject_t(object_t(".A/B_\\C.D"), key, CEPH_NOSNAP, hash, pool, ""),
+ test_generate_and_parse(ghobject_t(hobject_t(object_t(".A/B_\\C.D"), key, CEPH_NOSNAP, hash, pool, "")),
"\\.A\\sB_\\\\C.D_head_ABABABAB");
- test_generate_and_parse(hobject_t(object_t("DIR_A"), key, CEPH_NOSNAP, hash, pool, ""),
+ test_generate_and_parse(ghobject_t(hobject_t(object_t("DIR_A"), key, CEPH_NOSNAP, hash, pool, "")),
"\\dA_head_ABABABAB");
}
@@ -123,11 +123,11 @@ TEST_F(TestHASH_INDEX_TAG_2, generate_and_parse_name) {
{
std::string name(".XA/B_\\C.D");
name[1] = '\0';
- hobject_t hoid(object_t(name), key, CEPH_NOSNAP, hash, pool, "");
+ ghobject_t hoid(hobject_t(object_t(name), key, CEPH_NOSNAP, hash, pool, ""));
test_generate_and_parse(hoid, "\\.\\nA\\sB\\u\\\\C.D_KEY_head_ABABABAB");
}
- test_generate_and_parse(hobject_t(object_t("DIR_A"), key, CEPH_NOSNAP, hash, pool, ""),
+ test_generate_and_parse(ghobject_t(hobject_t(object_t("DIR_A"), key, CEPH_NOSNAP, hash, pool, "")),
"\\dA_KEY_head_ABABABAB");
}
@@ -143,36 +143,52 @@ TEST_F(TestHOBJECT_WITH_POOL, generate_and_parse_name) {
const std::string key("KEY");
uint64_t hash = 0xABABABAB;
uint64_t pool = 0xCDCDCDCD;
+ int64_t gen = 0xefefefefef;
+ int8_t shard_id = 0xb;
{
std::string name(".XA/B_\\C.D");
name[1] = '\0';
- hobject_t hoid(object_t(name), key, CEPH_NOSNAP, hash, pool, "");
- hoid.nspace = "NSPACE";
+ ghobject_t hoid(hobject_t(object_t(name), key, CEPH_NOSNAP, hash, pool, ""));
+ hoid.hobj.nspace = "NSPACE";
test_generate_and_parse(hoid, "\\.\\nA\\sB\\u\\\\C.D_KEY_head_ABABABAB_NSPACE_cdcdcdcd");
}
{
- hobject_t hoid(object_t("DIR_A"), key, CEPH_NOSNAP, hash, pool, "");
- hoid.nspace = "NSPACE";
+ ghobject_t hoid(hobject_t(object_t("DIR_A"), key, CEPH_NOSNAP, hash, pool, ""));
+ hoid.hobj.nspace = "NSPACE";
test_generate_and_parse(hoid, "\\dA_KEY_head_ABABABAB_NSPACE_cdcdcdcd");
}
+ {
+ std::string name(".XA/B_\\C.D");
+ name[1] = '\0';
+ ghobject_t hoid(hobject_t(object_t(name), key, CEPH_NOSNAP, hash, pool, ""), gen, shard_id);
+ hoid.hobj.nspace = "NSPACE";
+
+ test_generate_and_parse(hoid, "\\.\\nA\\sB\\u\\\\C.D_KEY_head_ABABABAB_NSPACE_cdcdcdcd_efefefefef_b");
+ }
+ {
+ ghobject_t hoid(hobject_t(object_t("DIR_A"), key, CEPH_NOSNAP, hash, pool, ""), gen, shard_id);
+ hoid.hobj.nspace = "NSPACE";
+
+ test_generate_and_parse(hoid, "\\dA_KEY_head_ABABABAB_NSPACE_cdcdcdcd_efefefefef_b");
+ }
}
class TestLFNIndex : public TestWrapLFNIndex, public ::testing::Test {
public:
- TestLFNIndex() : TestWrapLFNIndex(coll_t("ABC"), "PATH", CollectionIndex::HASH_INDEX_TAG) {
+ TestLFNIndex() : TestWrapLFNIndex(coll_t("ABC"), "PATH", CollectionIndex::HOBJECT_WITH_POOL) {
}
virtual void SetUp() {
::chmod("PATH", 0700);
- ::system("rm -fr PATH");
- ::mkdir("PATH", 0700);
+ ASSERT_EQ(0, ::system("rm -fr PATH"));
+ ASSERT_EQ(0, ::mkdir("PATH", 0700));
}
virtual void TearDown() {
- ::system("rm -fr PATH");
+ ASSERT_EQ(0, ::system("rm -fr PATH"));
}
};
@@ -185,7 +201,7 @@ TEST_F(TestLFNIndex, remove_object) {
{
std::string mangled_name;
int exists = 666;
- hobject_t hoid(sobject_t("ABC", CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t("ABC", CEPH_NOSNAP)));
EXPECT_EQ(0, ::chmod("PATH", 0000));
EXPECT_EQ(-EACCES, remove_object(path, hoid));
@@ -205,7 +221,7 @@ TEST_F(TestLFNIndex, remove_object) {
std::string mangled_name;
int exists;
const std::string object_name(1024, 'A');
- hobject_t hoid(sobject_t(object_name, CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t(object_name, CEPH_NOSNAP)));
EXPECT_EQ(0, get_mangled_name(path, hoid, &mangled_name, &exists));
EXPECT_EQ(0, exists);
@@ -226,7 +242,7 @@ TEST_F(TestLFNIndex, remove_object) {
std::string mangled_name;
int exists;
const std::string object_name(1024, 'A');
- hobject_t hoid(sobject_t(object_name, CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t(object_name, CEPH_NOSNAP)));
//
// PATH/AAA..._0_long => does not match long object name
@@ -237,7 +253,12 @@ TEST_F(TestLFNIndex, remove_object) {
std::string pathname("PATH/" + mangled_name);
EXPECT_EQ(0, ::close(::creat(pathname.c_str(), 0600)));
EXPECT_EQ(0, created(hoid, pathname.c_str()));
- const string LFN_ATTR = "user.cephos.lfn";
+ string LFN_ATTR = "user.cephos.lfn";
+ if (index_version != HASH_INDEX_TAG) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%d", index_version);
+ LFN_ATTR += string(buf);
+ }
const std::string object_name_1 = object_name + "SUFFIX";
EXPECT_EQ(object_name_1.size(), (unsigned)chain_setxattr(pathname.c_str(), LFN_ATTR.c_str(), object_name_1.c_str(), object_name_1.size()));
@@ -270,7 +291,7 @@ TEST_F(TestLFNIndex, remove_object) {
std::string mangled_name;
int exists;
const std::string object_name(1024, 'A');
- hobject_t hoid(sobject_t(object_name, CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t(object_name, CEPH_NOSNAP)));
//
// PATH/AAA..._0_long => matches long object name
@@ -318,16 +339,16 @@ TEST_F(TestLFNIndex, get_mangled_name) {
{
std::string mangled_name;
int exists = 666;
- hobject_t hoid(sobject_t("ABC", CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t("ABC", CEPH_NOSNAP)));
EXPECT_EQ(0, get_mangled_name(path, hoid, &mangled_name, &exists));
- EXPECT_NE(std::string::npos, mangled_name.find("ABC_head"));
+ EXPECT_NE(std::string::npos, mangled_name.find("ABC__head"));
EXPECT_EQ(std::string::npos, mangled_name.find("0_long"));
EXPECT_EQ(0, exists);
const std::string pathname("PATH/" + mangled_name);
EXPECT_EQ(0, ::close(::creat(pathname.c_str(), 0600)));
EXPECT_EQ(0, get_mangled_name(path, hoid, &mangled_name, &exists));
- EXPECT_NE(std::string::npos, mangled_name.find("ABC_head"));
+ EXPECT_NE(std::string::npos, mangled_name.find("ABC__head"));
EXPECT_EQ(1, exists);
EXPECT_EQ(0, ::unlink(pathname.c_str()));
}
@@ -338,7 +359,7 @@ TEST_F(TestLFNIndex, get_mangled_name) {
std::string mangled_name;
int exists;
const std::string object_name(1024, 'A');
- hobject_t hoid(sobject_t(object_name, CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t(object_name, CEPH_NOSNAP)));
//
// long version of the mangled name and no matching
@@ -399,7 +420,12 @@ TEST_F(TestLFNIndex, get_mangled_name) {
// are not identical and it so happens that their SHA1 is
// identical : a collision number is used to differentiate them
//
- const string LFN_ATTR = "user.cephos.lfn";
+ string LFN_ATTR = "user.cephos.lfn";
+ if (index_version != HASH_INDEX_TAG) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%d", index_version);
+ LFN_ATTR += string(buf);
+ }
const std::string object_name_same_prefix = object_name + "SUFFIX";
EXPECT_EQ(object_name_same_prefix.size(), (unsigned)chain_setxattr(pathname.c_str(), LFN_ATTR.c_str(), object_name_same_prefix.c_str(), object_name_same_prefix.size()));
std::string mangled_name_same_prefix;
@@ -431,6 +457,11 @@ int main(int argc, char **argv) {
}
}
-// Local Variables:
-// compile-command: "cd ../.. ; make unittest_lfnindex ; valgrind --tool=memcheck ./unittest_lfnindex # --gtest_filter=TestLFNIndex.* --log-to-stderr=true --debug-filestore=20"
-// End:
+/*
+ * Local Variables:
+ * compile-command: "cd ../.. ;
+ * make unittest_lfnindex &&
+ * valgrind --tool=memcheck ./unittest_lfnindex \
+ * # --gtest_filter=TestLFNIndex.* --log-to-stderr=true --debug-filestore=20"
+ * End:
+ */
diff --git a/src/test/osd/ErasureCodeExample.h b/src/test/osd/ErasureCodeExample.h
new file mode 100644
index 00000000000..0fd55187559
--- /dev/null
+++ b/src/test/osd/ErasureCodeExample.h
@@ -0,0 +1,161 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef CEPH_ERASURE_CODE_EXAMPLE_H
+#define CEPH_ERASURE_CODE_EXAMPLE_H
+
+#include <unistd.h>
+#include <errno.h>
+#include <algorithm>
+#include <sstream>
+#include "osd/ErasureCodeInterface.h"
+
+#define FIRST_DATA_CHUNK 0
+#define SECOND_DATA_CHUNK 1
+#define DATA_CHUNKS 2u
+
+#define CODING_CHUNK 2
+#define CODING_CHUNKS 1u
+
+#define MINIMUM_TO_RECOVER 2u
+
+//
+// Minimal erasure code used by the tests: DATA_CHUNKS (2) data chunks plus
+// CODING_CHUNKS (1) coding chunk that holds the XOR of the two data chunks.
+// Any one of the three chunks can therefore be rebuilt by XOR-ing the two
+// others.
+//
+class ErasureCodeExample : public ErasureCodeInterface {
+public:
+  virtual ~ErasureCodeExample() {}
+
+  //
+  // Fill *minimum with the chunks needed to read want_to_read.
+  // If every wanted chunk is available, only those are needed.
+  // Otherwise, if at least MINIMUM_TO_RECOVER chunks are available,
+  // all available chunks are returned. Return 0 on success and -EIO
+  // when too few chunks are available.
+  //
+  virtual int minimum_to_decode(const set<int> &want_to_read,
+                                const set<int> &available_chunks,
+                                set<int> *minimum) {
+    if (includes(available_chunks.begin(), available_chunks.end(),
+                 want_to_read.begin(), want_to_read.end())) {
+      *minimum = want_to_read;
+      return 0;
+    } else if (available_chunks.size() >= MINIMUM_TO_RECOVER) {
+      *minimum = available_chunks;
+      return 0;
+    } else {
+      return -EIO;
+    }
+  }
+
+  //
+  // Same as minimum_to_decode, but available maps each chunk to the
+  // cost of fetching it. Return value as for minimum_to_decode.
+  //
+  virtual int minimum_to_decode_with_cost(const set<int> &want_to_read,
+                                          const map<int, int> &available,
+                                          set<int> *minimum) {
+    //
+    // If one chunk is more expensive to fetch than the others,
+    // recover it instead. For instance, if the cost reflects the
+    // time it takes for a chunk to be retrieved from a remote
+    // OSD and if CPU is cheap, it could make sense to recover
+    // instead of fetching the chunk.
+    //
+    // Work on a copy: operator[] below may insert missing keys and
+    // the most expensive chunk is erased from it.
+    //
+    map<int, int> c2c(available);
+    if (c2c.size() > DATA_CHUNKS) {
+      //
+      // A chunk is dropped only when it is strictly more expensive
+      // than both others; on a tie nothing is dropped.
+      //
+      if (c2c[FIRST_DATA_CHUNK] > c2c[SECOND_DATA_CHUNK] &&
+          c2c[FIRST_DATA_CHUNK] > c2c[CODING_CHUNK])
+        c2c.erase(FIRST_DATA_CHUNK);
+      else if (c2c[SECOND_DATA_CHUNK] > c2c[FIRST_DATA_CHUNK] &&
+               c2c[SECOND_DATA_CHUNK] > c2c[CODING_CHUNK])
+        c2c.erase(SECOND_DATA_CHUNK);
+      else if (c2c[CODING_CHUNK] > c2c[FIRST_DATA_CHUNK] &&
+               c2c[CODING_CHUNK] > c2c[SECOND_DATA_CHUNK])
+        c2c.erase(CODING_CHUNK);
+    }
+    set<int> available_chunks;
+    for (map<int, int>::const_iterator i = c2c.begin();
+         i != c2c.end();
+         ++i)
+      available_chunks.insert(i->first);
+    return minimum_to_decode(want_to_read, available_chunks, minimum);
+  }
+
+  //
+  // Split in into two data chunks of equal length, compute the coding
+  // chunk and store in *encoded the chunks listed in want_to_encode,
+  // keyed by chunk index. Always returns 0.
+  //
+  virtual int encode(const set<int> &want_to_encode,
+                     const bufferlist &in,
+                     map<int, bufferlist> *encoded) {
+    //
+    // make sure all data chunks have the same length, allocating
+    // padding if necessary.
+    //
+    unsigned chunk_length = ( in.length() / DATA_CHUNKS ) + 1;
+    unsigned length = chunk_length * ( DATA_CHUNKS + CODING_CHUNKS );
+    bufferlist out(in);
+    bufferptr pad(length - in.length());
+    //
+    // Only the first DATA_CHUNKS bytes of the pad need to be zeroed:
+    // with the chunk_length above, the data chunks extend at most
+    // DATA_CHUNKS bytes past the input. The rest of the pad is the
+    // coding chunk, which is entirely overwritten below.
+    //
+    pad.zero(0, DATA_CHUNKS);
+    out.push_back(pad);
+    //
+    // compute the coding chunk with first chunk ^ second chunk
+    //
+    // NOTE(review): this relies on out.c_str() exposing the input
+    // and the pad as one contiguous buffer - confirm against the
+    // bufferlist implementation.
+    //
+    char *p = out.c_str();
+    for (unsigned i = 0; i < chunk_length; i++)
+      p[i + CODING_CHUNK * chunk_length] =
+        p[i + FIRST_DATA_CHUNK * chunk_length] ^
+        p[i + SECOND_DATA_CHUNK * chunk_length];
+    //
+    // populate the bufferlist with bufferptr pointing
+    // to chunk boundaries
+    //
+    const bufferptr ptr = out.buffers().front();
+    for (set<int>::const_iterator j = want_to_encode.begin();
+         j != want_to_encode.end();
+         ++j) {
+      bufferptr chunk(ptr, (*j) * chunk_length, chunk_length);
+      (*encoded)[*j].push_front(chunk);
+    }
+    return 0;
+  }
+
+  //
+  // Store in *decoded the chunks listed in want_to_read, keyed by
+  // chunk index. Chunks present in chunks are copied as is. A missing
+  // chunk is rebuilt when exactly two chunks are provided. Return 0
+  // on success and -ERANGE when a wanted chunk is missing and cannot
+  // be rebuilt.
+  //
+  virtual int decode(const set<int> &want_to_read,
+                     const map<int, bufferlist> &chunks,
+                     map<int, bufferlist> *decoded) {
+    //
+    // Without any chunk there is no chunk length to read and nothing
+    // to recover from: never dereference chunks.begin() in that case.
+    //
+    if (chunks.empty())
+      return want_to_read.empty() ? 0 : -ERANGE;
+    //
+    // All chunks have the same size
+    //
+    unsigned chunk_length = chunks.begin()->second.length();
+    for (set<int>::const_iterator i = want_to_read.begin();
+         i != want_to_read.end();
+         ++i) {
+      map<int, bufferlist>::const_iterator found = chunks.find(*i);
+      if (found != chunks.end()) {
+        //
+        // If the chunk is available, just copy the bufferptr pointer
+        // to the decoded argument.
+        //
+        (*decoded)[*i] = found->second;
+      } else if (chunks.size() != 2) {
+        //
+        // If a chunk is missing and there are not enough chunks
+        // to recover, abort.
+        //
+        return -ERANGE;
+      } else {
+        //
+        // No matter what the missing chunk is, XOR of the other
+        // two recovers it.
+        //
+        // NOTE(review): only the first buffer of each chunk is read,
+        // which assumes each chunk is held in a single buffer of
+        // chunk_length bytes - confirm with callers.
+        //
+        bufferptr chunk(chunk_length);
+        map<int, bufferlist>::const_iterator k = chunks.begin();
+        const char *a = k->second.buffers().front().c_str();
+        ++k;
+        const char *b = k->second.buffers().front().c_str();
+        for (unsigned j = 0; j < chunk_length; j++) {
+          chunk[j] = a[j] ^ b[j];
+        }
+        (*decoded)[*i].push_front(chunk);
+      }
+    }
+    return 0;
+  }
+};
+
+#endif
diff --git a/src/test/osd/ErasureCodePluginExample.cc b/src/test/osd/ErasureCodePluginExample.cc
new file mode 100644
index 00000000000..6ae61c0a18d
--- /dev/null
+++ b/src/test/osd/ErasureCodePluginExample.cc
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <unistd.h>
+
+#include "osd/ErasureCodePlugin.h"
+#include "ErasureCodeExample.h"
+
+class ErasureCodePluginExample : public ErasureCodePlugin { // plugin whose factory hands out ErasureCodeExample objects
+public:
+ virtual int factory(const map<std::string,std::string> &parameters, // not read by this implementation
+ ErasureCodeInterfaceRef *erasure_code) // out: receives the newly created instance
+ {
+ *erasure_code = ErasureCodeInterfaceRef(new ErasureCodeExample());
+ return 0; // this factory has no failure path
+ }
+};
+
+int __erasure_code_init(char *plugin_name) // plugin entry point: registers an ErasureCodePluginExample under plugin_name
+{
+ ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
+ return instance.add(plugin_name, new ErasureCodePluginExample()); // the result of add() is passed back unchanged
+}
diff --git a/src/test/osd/ErasureCodePluginFailToInitialize.cc b/src/test/osd/ErasureCodePluginFailToInitialize.cc
new file mode 100644
index 00000000000..cded6eef556
--- /dev/null
+++ b/src/test/osd/ErasureCodePluginFailToInitialize.cc
@@ -0,0 +1,23 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <errno.h>
+#include "osd/ErasureCodePlugin.h"
+
+int __erasure_code_init(char *plugin_name) // plugin entry point that always fails; plugin_name is not read
+{
+ return -ESRCH; // fixed error code, nothing is registered
+}
diff --git a/src/test/osd/ErasureCodePluginFailToRegister.cc b/src/test/osd/ErasureCodePluginFailToRegister.cc
new file mode 100644
index 00000000000..ea980b722ae
--- /dev/null
+++ b/src/test/osd/ErasureCodePluginFailToRegister.cc
@@ -0,0 +1,22 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include "osd/ErasureCodePlugin.h"
+
+int __erasure_code_init(char *plugin_name) // plugin entry point that reports success without adding any plugin
+{
+ return 0; // success is returned although no registry call was made
+}
diff --git a/src/test/osd/ErasureCodePluginHangs.cc b/src/test/osd/ErasureCodePluginHangs.cc
new file mode 100644
index 00000000000..ea73786b526
--- /dev/null
+++ b/src/test/osd/ErasureCodePluginHangs.cc
@@ -0,0 +1,24 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <unistd.h>
+#include "osd/ErasureCodePlugin.h"
+
+int __erasure_code_init(char *plugin_name) // plugin entry point that stalls before returning; plugin_name is not read
+{
+ sleep(1000); // block the caller for 1000 seconds
+ return 0;
+}
diff --git a/src/test/osd/ErasureCodePluginMissingEntryPoint.cc b/src/test/osd/ErasureCodePluginMissingEntryPoint.cc
new file mode 100644
index 00000000000..fc60f866086
--- /dev/null
+++ b/src/test/osd/ErasureCodePluginMissingEntryPoint.cc
@@ -0,0 +1 @@
+// missing int __erasure_code_init(char *plugin_name) {}
diff --git a/src/test/osd/Object.cc b/src/test/osd/Object.cc
index 408cc63ac02..d4be4df1bdd 100644
--- a/src/test/osd/Object.cc
+++ b/src/test/osd/Object.cc
@@ -9,10 +9,11 @@
ostream &operator<<(ostream &out, const ContDesc &rhs)
{
- return out << "ObjNum: " << rhs.objnum
- << " snap: " << rhs.cursnap
- << " seqnum: " << rhs.seqnum
- << " prefix: " << rhs.prefix;
+ return out << "(ObjNum " << rhs.objnum
+ << " snap " << rhs.cursnap
+ << " seq_num " << rhs.seqnum
+ //<< " prefix " << rhs.prefix
+ << ")";
}
void VarLenGenerator::get_ranges(const ContDesc &cont, interval_set<uint64_t> &out) {
diff --git a/src/test/osd/Object.h b/src/test/osd/Object.h
index 39acf1e2175..09f0a5f2e4c 100644
--- a/src/test/osd/Object.h
+++ b/src/test/osd/Object.h
@@ -240,9 +240,9 @@ public:
class ObjectDesc {
public:
ObjectDesc(ContentsGenerator *cont_gen) :
- exists(false), tmap(false), layers(), cont_gen(cont_gen) {};
+ exists(false), tmap(false), version(0), layers(), cont_gen(cont_gen) {};
ObjectDesc(const ContDesc &init, ContentsGenerator *cont_gen) :
- exists(false), tmap(false), layers(), cont_gen(cont_gen) {
+ exists(false), tmap(false), version(0), layers(), cont_gen(cont_gen) {
layers.push_front(init);
};
@@ -314,6 +314,7 @@ public:
bool exists;
bool tmap;
bufferlist tmap_contents;
+ uint64_t version;
private:
list<ContDesc> layers;
ContentsGenerator *cont_gen;
diff --git a/src/test/osd/RadosModel.h b/src/test/osd/RadosModel.h
index b022d24dc91..ac2f336f110 100644
--- a/src/test/osd/RadosModel.h
+++ b/src/test/osd/RadosModel.h
@@ -1,4 +1,6 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+#include "include/int_types.h"
+
#include "common/Mutex.h"
#include "common/Cond.h"
#include "include/rados/librados.hpp"
@@ -15,7 +17,6 @@
#include <time.h>
#include "Object.h"
#include "TestOpStat.h"
-#include "inttypes.h"
#include "test/librados/test.h"
#ifndef RADOSMODEL_H
@@ -48,7 +49,8 @@ enum TestOpType {
TEST_OP_SETATTR,
TEST_OP_RMATTR,
TEST_OP_TMAPPUT,
- TEST_OP_WATCH
+ TEST_OP_WATCH,
+ TEST_OP_COPY_FROM
};
class TestWatchContext : public librados::WatchCtx {
@@ -81,14 +83,16 @@ public:
class TestOp {
public:
+ int num;
RadosTestContext *context;
TestOpStat *stat;
bool done;
- TestOp(RadosTestContext *context,
- TestOpStat *stat = 0) :
- context(context),
- stat(stat),
- done(0)
+ TestOp(int n, RadosTestContext *context,
+ TestOpStat *stat = 0)
+ : num(n),
+ context(context),
+ stat(stat),
+ done(0)
{}
virtual ~TestOp() {};
@@ -139,6 +143,7 @@ public:
map<int, map<string,ObjectDesc> > pool_obj_cont;
set<string> oid_in_use;
set<string> oid_not_in_use;
+ set<int> snaps_in_use;
int current_snap;
string pool_name;
librados::IoCtx io_ctx;
@@ -228,6 +233,7 @@ public:
for (list<TestOp*>::iterator i = inflight.begin();
i != inflight.end();) {
if ((*i)->finished()) {
+ cout << (*i)->num << ": done (" << (inflight.size()-1) << " left)" << std::endl;
delete *i;
inflight.erase(i++);
} else {
@@ -236,7 +242,7 @@ public:
}
if (inflight.size() >= (unsigned) max_in_flight || (!next && !inflight.empty())) {
- cout << "Waiting on " << inflight.size() << std::endl;
+ cout << " waiting on " << inflight.size() << std::endl;
wait();
} else {
break;
@@ -396,6 +402,27 @@ public:
pool_obj_cont[current_snap].insert(pair<string,ObjectDesc>(oid, new_obj));
}
+ void update_object_full(const string &oid, const ObjectDesc &contents) // make contents the description of oid in the current_snap map
+ {
+ pool_obj_cont[current_snap].erase(oid); // map::insert() keeps an existing key, so drop any old entry first
+ pool_obj_cont[current_snap].insert(pair<string,ObjectDesc>(oid, contents));
+ }
+
+ void update_object_version(const string &oid, uint64_t version) // store version on the entry for oid under the highest key that has one
+ {
+ for (map<int, map<string,ObjectDesc> >::reverse_iterator i =
+ pool_obj_cont.rbegin(); // reverse iteration: highest key first
+ i != pool_obj_cont.rend();
+ ++i) {
+ map<string,ObjectDesc>::iterator j = i->second.find(oid);
+ if (j != i->second.end()) {
+ j->second.version = version;
+ cout << __func__ << " oid " << oid << " v " << version << " " << j->second.most_recent() << std::endl;
+ break; // only the first match is updated
+ }
+ }
+ } // when no map holds oid, nothing is changed and nothing is printed
+
void remove_object(const string &oid)
{
assert(!get_watch_context(oid));
@@ -465,11 +492,11 @@ public:
librados::ObjectWriteOperation op;
librados::AioCompletion *comp;
bool done;
- RemoveAttrsOp(RadosTestContext *context,
+ RemoveAttrsOp(int n, RadosTestContext *context,
const string &oid,
- TestOpStat *stat) :
- TestOp(context, stat), oid(oid), comp(NULL), done(false)
- {}
+ TestOpStat *stat)
+ : TestOp(n, context, stat), oid(oid), comp(NULL), done(false)
+ {}
void _begin()
{
@@ -554,11 +581,12 @@ public:
librados::ObjectWriteOperation op;
librados::AioCompletion *comp;
bool done;
- TmapPutOp(RadosTestContext *context,
- const string &oid,
- TestOpStat *stat) :
- TestOp(context, stat), oid(oid), comp(NULL), done(false)
- {}
+ TmapPutOp(int n,
+ RadosTestContext *context,
+ const string &oid,
+ TestOpStat *stat)
+ : TestOp(n, context, stat), oid(oid), comp(NULL), done(false)
+ {}
void _begin()
{
@@ -624,6 +652,7 @@ public:
assert(0);
}
done = true;
+ context->update_object_version(oid, comp->get_version64());
context->oid_in_use.erase(oid);
context->oid_not_in_use.insert(oid);
context->kick();
@@ -646,11 +675,13 @@ public:
librados::ObjectWriteOperation op;
librados::AioCompletion *comp;
bool done;
- SetAttrsOp(RadosTestContext *context,
- const string &oid,
- TestOpStat *stat) :
- TestOp(context, stat), oid(oid), comp(NULL), done(false)
- {}
+ SetAttrsOp(int n,
+ RadosTestContext *context,
+ const string &oid,
+ TestOpStat *stat)
+ : TestOp(n, context, stat),
+ oid(oid), comp(NULL), done(false)
+ {}
void _begin()
{
@@ -714,6 +745,7 @@ public:
assert(0);
}
done = true;
+ context->update_object_version(oid, comp->get_version64());
context->oid_in_use.erase(oid);
context->oid_not_in_use.insert(oid);
context->kick();
@@ -735,14 +767,19 @@ public:
string oid;
ContDesc cont;
set<librados::AioCompletion *> waiting;
+ librados::AioCompletion *rcompletion;
uint64_t waiting_on;
uint64_t last_acked_tid;
- WriteOp(RadosTestContext *context,
+ librados::ObjectReadOperation read_op;
+ bufferlist rbuffer;
+
+ WriteOp(int n,
+ RadosTestContext *context,
const string &oid,
- TestOpStat *stat = 0) :
- TestOp(context, stat),
- oid(oid), waiting_on(0), last_acked_tid(0)
+ TestOpStat *stat = 0)
+ : TestOp(n, context, stat),
+ oid(oid), waiting_on(0), last_acked_tid(0)
{}
void _begin()
@@ -760,27 +797,14 @@ public:
context->oid_in_use.insert(oid);
context->oid_not_in_use.erase(oid);
- context->seq_num++;
-
- vector<uint64_t> snapset(context->snaps.size());
- int j = 0;
- for (map<int,uint64_t>::reverse_iterator i = context->snaps.rbegin();
- i != context->snaps.rend();
- ++i, ++j) {
- snapset[j] = i->second;
- }
interval_set<uint64_t> ranges;
context->cont_gen.get_ranges(cont, ranges);
+ std::cout << num << ": seq_num " << context->seq_num << " ranges " << ranges << std::endl;
+ context->seq_num++;
context->state_lock.Unlock();
- int r = context->io_ctx.selfmanaged_snap_set_write_ctx(context->seq, snapset);
- if (r) {
- cerr << "r is " << r << " snapset is " << snapset << " seq is " << context->seq << std::endl;
- assert(0);
- }
-
waiting_on = ranges.num_intervals();
- cout << "waiting_on = " << waiting_on << std::endl;
+ //cout << " waiting_on = " << waiting_on << std::endl;
ContentsGenerator::iterator gen_pos = context->cont_gen.get_iterator(cont);
uint64_t tid = 1;
for (interval_set<uint64_t>::iterator i = ranges.begin();
@@ -793,9 +817,8 @@ public:
}
assert(to_write.length() == i.get_len());
assert(to_write.length() > 0);
- std::cout << "Writing " << context->prefix+oid << " from " << i.get_start()
- << " to " << i.get_len() + i.get_start() << " tid " << tid
- << " ranges are " << ranges << std::endl;
+ std::cout << num << ": writing " << context->prefix+oid << " from " << i.get_start()
+ << " to " << i.get_len() + i.get_start() << " tid " << tid << std::endl;
pair<TestOp*, TestOp::CallbackInfo*> *cb_arg =
new pair<TestOp*, TestOp::CallbackInfo*>(this,
new TestOp::CallbackInfo(tid));
@@ -805,6 +828,21 @@ public:
context->io_ctx.aio_write(context->prefix+oid, completion,
to_write, i.get_len(), i.get_start());
}
+
+ pair<TestOp*, TestOp::CallbackInfo*> *cb_arg =
+ new pair<TestOp*, TestOp::CallbackInfo*>(
+ this,
+ new TestOp::CallbackInfo(tid));
+ rcompletion = context->rados.aio_create_completion(
+ (void*) cb_arg, &write_callback, NULL);
+ waiting_on++;
+ read_op.read(0, 1, &rbuffer, 0);
+ context->io_ctx.aio_operate(
+ context->prefix+oid, rcompletion,
+ &read_op,
+ librados::SNAP_HEAD,
+ librados::OPERATION_ORDER_READS_WRITES, // order wrt previous write/update
+ 0);
}
void _finish(CallbackInfo *info)
@@ -813,7 +851,7 @@ public:
context->state_lock.Lock();
uint64_t tid = info->id;
- cout << "finishing write tid " << tid << " to " << context->prefix + oid << std::endl;
+ cout << num << ": finishing write tid " << tid << " to " << context->prefix + oid << std::endl;
if (tid <= last_acked_tid) {
cerr << "Error: finished tid " << tid
@@ -825,6 +863,7 @@ public:
assert(!done);
waiting_on--;
if (waiting_on == 0) {
+ uint64_t version = 0;
for (set<librados::AioCompletion *>::iterator i = waiting.begin();
i != waiting.end();
) {
@@ -833,10 +872,20 @@ public:
cerr << "Error: oid " << oid << " write returned error code "
<< err << std::endl;
}
+ if ((*i)->get_version64() > version)
+ version = (*i)->get_version64();
(*i)->release();
waiting.erase(i++);
}
+ context->update_object_version(oid, version);
+ if (rcompletion->get_version64() != version) {
+ cerr << "Error: racing read on " << oid << " returned version "
+ << rcompletion->get_version64() << " rather than version "
+ << version << std::endl;
+ assert(0 == "racing read got wrong version");
+ }
+ rcompletion->release();
context->oid_in_use.erase(oid);
context->oid_not_in_use.insert(oid);
context->kick();
@@ -860,10 +909,11 @@ class DeleteOp : public TestOp {
public:
string oid;
- DeleteOp(RadosTestContext *context,
+ DeleteOp(int n,
+ RadosTestContext *context,
const string &oid,
- TestOpStat *stat = 0) :
- TestOp(context, stat), oid(oid)
+ TestOpStat *stat = 0)
+ : TestOp(n, context, stat), oid(oid)
{}
void _begin()
@@ -885,23 +935,10 @@ public:
context->remove_object(oid);
- vector<uint64_t> snapset(context->snaps.size());
- int j = 0;
- for (map<int,uint64_t>::reverse_iterator i = context->snaps.rbegin();
- i != context->snaps.rend();
- ++i, ++j) {
- snapset[j] = i->second;
- }
interval_set<uint64_t> ranges;
context->state_lock.Unlock();
- int r = context->io_ctx.selfmanaged_snap_set_write_ctx(context->seq, snapset);
- if (r) {
- cerr << "r is " << r << " snapset is " << snapset << " seq is " << context->seq << std::endl;
- assert(0);
- }
-
- r = context->io_ctx.remove(context->prefix+oid);
+ int r = context->io_ctx.remove(context->prefix+oid);
if (r && !(r == -ENOENT && !present)) {
cerr << "r is " << r << " while deleting " << oid << " and present is " << present << std::endl;
assert(0);
@@ -941,16 +978,17 @@ public:
bufferlist header;
map<string, bufferlist> xattrs;
- ReadOp(RadosTestContext *context,
+ ReadOp(int n,
+ RadosTestContext *context,
const string &oid,
- TestOpStat *stat = 0) :
- TestOp(context, stat),
- completion(NULL),
- oid(oid),
- old_value(&context->cont_gen),
- snap(0),
- retval(0),
- attrretval(0)
+ TestOpStat *stat = 0)
+ : TestOp(n, context, stat),
+ completion(NULL),
+ oid(oid),
+ old_value(&context->cont_gen),
+ snap(0),
+ retval(0),
+ attrretval(0)
{}
void _begin()
@@ -973,9 +1011,9 @@ public:
if (ctx) {
assert(old_value.exists);
TestAlarm alarm;
- std::cerr << "about to start" << std::endl;
+ std::cerr << num << ": about to start" << std::endl;
ctx->start();
- std::cerr << "started" << std::endl;
+ std::cerr << num << ": started" << std::endl;
bufferlist bl;
context->io_ctx.set_notify_timeout(600);
int r = context->io_ctx.notify(context->prefix+oid, 0, bl);
@@ -983,7 +1021,7 @@ public:
std::cerr << "r is " << r << std::endl;
assert(0);
}
- std::cerr << "notified, waiting" << std::endl;
+ std::cerr << num << ": notified, waiting" << std::endl;
ctx->wait();
}
if (snap >= 0) {
@@ -1027,27 +1065,30 @@ public:
context->oid_in_use.erase(oid);
context->oid_not_in_use.insert(oid);
assert(completion->is_complete());
+ uint64_t version = completion->get_version64();
if (int err = completion->get_return_value()) {
if (!(err == -ENOENT && old_value.deleted())) {
- cerr << "Error: oid " << oid << " read returned error code "
+ cerr << num << ": Error: oid " << oid << " read returned error code "
<< err << std::endl;
+ context->errors++;
}
} else {
+ cout << num << ": expect " << old_value.most_recent() << std::endl;
assert(!old_value.deleted());
if (old_value.has_contents()) {
ContDesc to_check;
bufferlist::iterator p = result.begin();
if (!context->cont_gen.read_header(p, to_check)) {
- cerr << "Unable to decode oid " << oid << " at snap " << context->current_snap << std::endl;
+ cerr << num << ": Unable to decode oid " << oid << " at snap " << context->current_snap << std::endl;
context->errors++;
}
if (to_check != old_value.most_recent()) {
- cerr << "Found incorrect object contents " << to_check
- << ", expected " << old_value.most_recent() << " oid " << oid << std::endl;
+ cerr << num << ": oid " << oid << " found incorrect object contents " << to_check
+ << ", expected " << old_value.most_recent() << std::endl;
context->errors++;
}
if (!old_value.check(result)) {
- cerr << "Object " << oid << " contents " << to_check << " corrupt" << std::endl;
+ cerr << num << ": oid " << oid << " contents " << to_check << " corrupt" << std::endl;
context->errors++;
}
if (context->errors) assert(0);
@@ -1055,26 +1096,31 @@ public:
// Attributes
if (!(old_value.header == header)) {
- cerr << "oid: " << oid << " header does not match, old size: "
+ cerr << num << ": oid " << oid << " header does not match, old size: "
<< old_value.header.length() << " new size " << header.length()
<< std::endl;
assert(old_value.header == header);
}
if (omap.size() != old_value.attrs.size()) {
- cerr << "oid: " << oid << " tmap.size() is " << omap.size()
+ cerr << num << ": oid " << oid << " tmap.size() is " << omap.size()
<< " and old is " << old_value.attrs.size() << std::endl;
assert(omap.size() == old_value.attrs.size());
}
if (omap_keys.size() != old_value.attrs.size()) {
- cerr << "oid: " << oid << " tmap.size() is " << omap_keys.size()
+ cerr << num << ": oid " << oid << " tmap.size() is " << omap_keys.size()
<< " and old is " << old_value.attrs.size() << std::endl;
assert(omap_keys.size() == old_value.attrs.size());
}
if (xattrs.size() != old_value.attrs.size()) {
- cerr << "oid: " << oid << " xattrs.size() is " << xattrs.size()
+ cerr << num << ": oid " << oid << " xattrs.size() is " << xattrs.size()
<< " and old is " << old_value.attrs.size() << std::endl;
assert(xattrs.size() == old_value.attrs.size());
}
+ if (version != old_value.version) {
+ cerr << num << ": oid " << oid << " version is " << version
+ << " and expected " << old_value.version << std::endl;
+ assert(version == old_value.version);
+ }
for (map<string, bufferlist>::iterator omap_iter = omap.begin();
omap_iter != omap.end();
++omap_iter) {
@@ -1128,9 +1174,10 @@ public:
class SnapCreateOp : public TestOp {
public:
- SnapCreateOp(RadosTestContext *context,
- TestOpStat *stat = 0) :
- TestOp(context, stat)
+ SnapCreateOp(int n,
+ RadosTestContext *context,
+ TestOpStat *stat = 0)
+ : TestOp(n, context, stat)
{}
void _begin()
@@ -1166,11 +1213,11 @@ public:
class SnapRemoveOp : public TestOp {
public:
int to_remove;
- SnapRemoveOp(RadosTestContext *context,
+ SnapRemoveOp(int n, RadosTestContext *context,
int snap,
- TestOpStat *stat = 0) :
- TestOp(context, stat),
- to_remove(snap)
+ TestOpStat *stat = 0)
+ : TestOp(n, context, stat),
+ to_remove(snap)
{}
void _begin()
@@ -1206,11 +1253,12 @@ public:
class WatchOp : public TestOp {
string oid;
public:
- WatchOp(RadosTestContext *context,
- const string &_oid,
- TestOpStat *stat = 0) :
- TestOp(context, stat),
- oid(_oid)
+ WatchOp(int n,
+ RadosTestContext *context,
+ const string &_oid,
+ TestOpStat *stat = 0)
+ : TestOp(n, context, stat),
+ oid(_oid)
{}
void _begin()
@@ -1226,17 +1274,8 @@ public:
context->oid_in_use.insert(oid);
context->oid_not_in_use.erase(oid);
- vector<uint64_t> snapset(context->snaps.size());
- int j = 0;
- for (map<int,uint64_t>::reverse_iterator i = context->snaps.rbegin();
- i != context->snaps.rend();
- ++i, ++j) {
- snapset[j] = i->second;
- }
-
TestWatchContext *ctx = context->get_watch_context(oid);
context->state_lock.Unlock();
- assert(!context->io_ctx.selfmanaged_snap_set_write_ctx(context->seq, snapset));
int r;
if (!ctx) {
{
@@ -1279,13 +1318,18 @@ class RollbackOp : public TestOp {
public:
string oid;
int roll_back_to;
- RollbackOp(RadosTestContext *context,
+ bool done;
+ librados::ObjectWriteOperation op;
+ librados::AioCompletion *comp;
+
+ RollbackOp(int n,
+ RadosTestContext *context,
const string &_oid,
int snap,
- TestOpStat *stat = 0) :
- TestOp(context, stat),
- oid(_oid),
- roll_back_to(snap)
+ TestOpStat *stat = 0)
+ : TestOp(n, context, stat),
+ oid(_oid),
+ roll_back_to(snap), done(false)
{}
void _begin()
@@ -1298,38 +1342,179 @@ public:
}
context->oid_in_use.insert(oid);
context->oid_not_in_use.erase(oid);
+ context->snaps_in_use.insert(roll_back_to);
+
context->roll_back(oid, roll_back_to);
uint64_t snap = context->snaps[roll_back_to];
- vector<uint64_t> snapset(context->snaps.size());
- int j = 0;
- for (map<int,uint64_t>::reverse_iterator i = context->snaps.rbegin();
- i != context->snaps.rend();
- ++i, ++j) {
- snapset[j] = i->second;
- }
context->state_lock.Unlock();
- assert(!context->io_ctx.selfmanaged_snap_set_write_ctx(context->seq, snapset));
-
- int r = context->io_ctx.selfmanaged_snap_rollback(context->prefix+oid,
- snap);
- if (r) {
- cerr << "r is " << r << std::endl;
+ op.selfmanaged_snap_rollback(snap);
+
+ pair<TestOp*, TestOp::CallbackInfo*> *cb_arg =
+ new pair<TestOp*, TestOp::CallbackInfo*>(this,
+ new TestOp::CallbackInfo(0));
+ comp = context->rados.aio_create_completion((void*) cb_arg, &write_callback,
+ NULL);
+ context->io_ctx.aio_operate(context->prefix+oid, comp, &op);
+ }
+
+ void _finish(CallbackInfo *info) // handles the result of comp, the aio rollback queued in _begin(); info is not read
+ {
+ Mutex::Locker l(context->state_lock);
+ int r;
+ if ((r = comp->get_return_value())) {
+ cerr << "err " << r << std::endl;
+ assert(0); // any non-zero return value aborts the test
+ }
+ done = true; // finished() reports this flag
+ context->update_object_version(oid, comp->get_version64());
+ context->oid_in_use.erase(oid);
+ context->oid_not_in_use.insert(oid);
+ context->snaps_in_use.erase(roll_back_to); // undo the insert made in _begin()
+ context->kick();
+ }
+ bool finished() // true once _finish() has set done
+ {
+ return done;
+ }
+
+ string getType() // type label of this op
+ {
+ return "RollBackOp";
+ }
+};
+
+class CopyFromOp : public TestOp {
+public:
+ string oid, oid_src;
+ ObjectDesc src_value;
+ librados::ObjectWriteOperation op;
+ librados::ObjectReadOperation rd_op;
+ librados::AioCompletion *comp;
+ librados::AioCompletion *comp_racing_read;
+ int snap;
+ int done;
+ uint64_t version;
+ int r;
+ CopyFromOp(int n, // forwarded to TestOp together with context and stat
+ RadosTestContext *context,
+ const string &oid, // destination object of the copy
+ const string &oid_src, // source object of the copy
+ TestOpStat *stat)
+ : TestOp(n, context, stat),
+ oid(oid), oid_src(oid_src),
+ src_value(&context->cont_gen),
+ comp(NULL), done(0), version(0), r(0) // snap and comp_racing_read are not set here; _begin() assigns both
+ {}
+
+ void _begin() // queue an aio copy_from of oid_src into oid, followed by a racing read of oid
+ {
+ ContDesc cont; // NOTE(review): assigned below but not read again in this method
{
Mutex::Locker l(context->state_lock);
+ cont = ContDesc(context->seq_num, context->current_snap,
+ context->seq_num, "");
+ context->oid_in_use.insert(oid); // both the destination and the source are marked in use
+ context->oid_not_in_use.erase(oid);
+ context->oid_in_use.insert(oid_src);
+ context->oid_not_in_use.erase(oid_src);
+ }
+
+ // choose source snap
+ if (0 && !(rand() % 4) && !context->snaps.empty()) { // the leading 0 disables this branch
+ snap = rand_choose(context->snaps)->first;
+ } else {
+ snap = -1; // always taken as long as the condition above starts with 0
+ }
+ context->find_object(oid_src, &src_value, snap); // presumably fills src_value; it is read below and in _finish()
+ if (!src_value.deleted())
+ context->update_object_full(oid, src_value); // the model now expects oid to match the source description
+
+ string src = context->prefix+oid_src;
+ op.copy_from(src.c_str(), context->io_ctx, src_value.version);
+
+ pair<TestOp*, TestOp::CallbackInfo*> *cb_arg =
+ new pair<TestOp*, TestOp::CallbackInfo*>(this,
+ new TestOp::CallbackInfo(0)); // id 0 identifies the copy_from completion in _finish()
+ comp = context->rados.aio_create_completion((void*) cb_arg, &write_callback,
+ NULL);
+ context->io_ctx.aio_operate(context->prefix+oid, comp, &op);
+
+ // queue up a racing read, too.
+ pair<TestOp*, TestOp::CallbackInfo*> *read_cb_arg =
+ new pair<TestOp*, TestOp::CallbackInfo*>(this,
+ new TestOp::CallbackInfo(1)); // id 1 identifies the racing read in _finish()
+ comp_racing_read = context->rados.aio_create_completion((void*) read_cb_arg, &write_callback,
+ NULL);
+ rd_op.stat(NULL, NULL, NULL); // all three stat outputs are NULL: only the completion is inspected later
+ context->io_ctx.aio_operate(context->prefix+oid, comp_racing_read, &rd_op,
+ librados::SNAP_HEAD,
+ librados::OPERATION_ORDER_READS_WRITES, // order wrt previous write/update
+ NULL);
+
+ }
+
+ void _finish(CallbackInfo *info) // info->id tells the two completions apart: 0 = copy_from, 1 = racing read
+ {
+ Mutex::Locker l(context->state_lock);
+
+ // note that the read can (and atm will) come back before the
+ // write reply, but will reflect the update and the versions will
+ // match.
+
+ if (info->id == 0) {
+ // copy_from
+ assert(comp->is_complete());
+ cout << num << ": finishing copy_from to " << context->prefix + oid << std::endl;
+ if ((r = comp->get_return_value())) {
+ if (r == -ENOENT && src_value.deleted()) { // the only error that is tolerated
+ cout << num << ": got expected ENOENT (src dne)" << std::endl;
+ } else {
+ cerr << "Error: oid " << oid << " copy_from " << oid_src << " returned error code "
+ << r << std::endl;
+ assert(0);
+ }
+ } else {
+ assert(!version || comp->get_version64() == version); // a version recorded by the other completion must be the same
+ version = comp->get_version64();
+ context->update_object_version(oid, comp->get_version64());
+ }
+ } else if (info->id == 1) {
+ // racing read
+ assert(comp_racing_read->is_complete());
+ cout << num << ": finishing copy_from racing read to " << context->prefix + oid << std::endl;
+ if ((r = comp_racing_read->get_return_value())) {
+ if (!(r == -ENOENT && src_value.deleted())) { // unexpected errors are only logged here, there is no assert
+ cerr << "Error: oid " << oid << " copy_from " << oid_src << " returned error code "
+ << r << std::endl;
+ }
+ } else {
+ assert(comp_racing_read->get_return_value() == 0);
+ assert(!version || comp_racing_read->get_version64() == version); // same cross-check as in the copy_from branch
+ version = comp_racing_read->get_version64();
+ }
+ }
+ if (++done == 2) { // both completions have been handled: release both oids
context->oid_in_use.erase(oid);
context->oid_not_in_use.insert(oid);
+ context->oid_in_use.erase(oid_src);
+ context->oid_not_in_use.insert(oid_src);
+ context->kick();
}
}
+ bool finished() // true once _finish() has counted both completions
+ {
+ return done == 2;
+ }
+
string getType()
{
- return "RollBackOp";
+ return "CopyFromOp"; // type label of this op: the name of its own class
}
};
+
#endif
diff --git a/src/test/osd/TestErasureCodeExample.cc b/src/test/osd/TestErasureCodeExample.cc
new file mode 100644
index 00000000000..f12e80c8cd0
--- /dev/null
+++ b/src/test/osd/TestErasureCodeExample.cc
@@ -0,0 +1,173 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include "global/global_init.h"
+#include "ErasureCodeExample.h"
+#include "common/ceph_argparse.h"
+#include "global/global_context.h"
+#include "gtest/gtest.h"
+
+TEST(ErasureCodeExample, minimum_to_decode) // which chunks minimum_to_decode() selects for reading chunk 1
+{
+ ErasureCodeExample example;
+ set<int> available_chunks;
+ set<int> want_to_read;
+ want_to_read.insert(1);
+ { // nothing is available: -EIO is expected
+ set<int> minimum;
+ EXPECT_EQ(-EIO, example.minimum_to_decode(want_to_read,
+ available_chunks,
+ &minimum));
+ }
+ available_chunks.insert(0);
+ available_chunks.insert(2);
+ { // only chunks 0 and 2 are available: both are expected in minimum
+ set<int> minimum;
+ EXPECT_EQ(0, example.minimum_to_decode(want_to_read,
+ available_chunks,
+ &minimum));
+ EXPECT_EQ(available_chunks, minimum);
+ EXPECT_EQ(2u, minimum.size());
+ EXPECT_EQ(1u, minimum.count(0));
+ EXPECT_EQ(1u, minimum.count(2));
+ }
+ { // chunk 1 is available as well: it alone is expected in minimum
+ set<int> minimum;
+ available_chunks.insert(1);
+ EXPECT_EQ(0, example.minimum_to_decode(want_to_read,
+ available_chunks,
+ &minimum));
+ EXPECT_EQ(1u, minimum.size());
+ EXPECT_EQ(1u, minimum.count(1));
+ }
+}
+
+TEST(ErasureCodeExample, minimum_to_decode_with_cost) // same scenario as above, with a cost attached to each available chunk
+{
+ ErasureCodeExample example;
+ map<int,int> available; // chunk id -> cost
+ set<int> want_to_read;
+ want_to_read.insert(1);
+ { // nothing is available: -EIO is expected
+ set<int> minimum;
+ EXPECT_EQ(-EIO, example.minimum_to_decode_with_cost(want_to_read,
+ available,
+ &minimum));
+ }
+ available[0] = 1;
+ available[2] = 1;
+ { // chunks 0 and 2 at cost 1 each: both are expected
+ set<int> minimum;
+ EXPECT_EQ(0, example.minimum_to_decode_with_cost(want_to_read,
+ available,
+ &minimum));
+ EXPECT_EQ(2u, minimum.size());
+ EXPECT_EQ(1u, minimum.count(0));
+ EXPECT_EQ(1u, minimum.count(2));
+ }
+ { // chunk 1 added at cost 1: it alone is expected
+ set<int> minimum;
+ available[1] = 1;
+ EXPECT_EQ(0, example.minimum_to_decode_with_cost(want_to_read,
+ available,
+ &minimum));
+ EXPECT_EQ(1u, minimum.size());
+ EXPECT_EQ(1u, minimum.count(1));
+ }
+ { // chunk 1 raised to cost 2: chunks 0 and 2 are expected again
+ set<int> minimum;
+ available[1] = 2;
+ EXPECT_EQ(0, example.minimum_to_decode_with_cost(want_to_read,
+ available,
+ &minimum));
+ EXPECT_EQ(2u, minimum.size());
+ EXPECT_EQ(1u, minimum.count(0));
+ EXPECT_EQ(1u, minimum.count(2));
+ }
+}
+
+TEST(ErasureCodeExample, encode_decode) // encode "ABCDE" into three chunks, then decode with and without chunk 0
+{
+ ErasureCodeExample example;
+
+ bufferlist in;
+ in.append("ABCDE");
+ int want_to_encode[] = { 0, 1, 2 };
+ map<int, bufferlist> encoded;
+ EXPECT_EQ(0, example.encode(set<int>(want_to_encode, want_to_encode+3),
+ in,
+ &encoded));
+ EXPECT_EQ(3u, encoded.size());
+ EXPECT_EQ(3u, encoded[0].length()); // the 5 input bytes give chunks of 3 bytes
+ EXPECT_EQ('A', encoded[0][0]); // chunk 0 is expected to hold "ABC"
+ EXPECT_EQ('B', encoded[0][1]);
+ EXPECT_EQ('C', encoded[0][2]);
+ EXPECT_EQ('D', encoded[1][0]); // chunk 1 is expected to start with "DE"
+ EXPECT_EQ('E', encoded[1][1]);
+ EXPECT_EQ('A'^'D', encoded[2][0]); // chunk 2 is expected to be chunk 0 XOR chunk 1
+ EXPECT_EQ('B'^'E', encoded[2][1]);
+ EXPECT_EQ('C'^0, encoded[2][2]); // third byte of chunk 1 is expected to be zero padding
+
+ // all chunks are available: chunks 0 and 1 are read back as encoded
+ {
+ int want_to_decode[] = { 0, 1 };
+ map<int, bufferlist> decoded;
+ EXPECT_EQ(0, example.decode(set<int>(want_to_decode, want_to_decode+2),
+ encoded,
+ &decoded));
+ EXPECT_EQ(2u, decoded.size());
+ EXPECT_EQ(3u, decoded[0].length());
+ EXPECT_EQ('A', decoded[0][0]);
+ EXPECT_EQ('B', decoded[0][1]);
+ EXPECT_EQ('C', decoded[0][2]);
+ EXPECT_EQ('D', decoded[1][0]);
+ EXPECT_EQ('E', decoded[1][1]);
+ }
+
+ // one chunk is missing: chunk 0 has to be rebuilt from chunks 1 and 2
+ {
+ map<int, bufferlist> degraded = encoded;
+ degraded.erase(0);
+ EXPECT_EQ(2u, degraded.size());
+ int want_to_decode[] = { 0, 1 };
+ map<int, bufferlist> decoded;
+ EXPECT_EQ(0, example.decode(set<int>(want_to_decode, want_to_decode+2),
+ degraded,
+ &decoded));
+ EXPECT_EQ(2u, decoded.size());
+ EXPECT_EQ(3u, decoded[0].length());
+ EXPECT_EQ('A', decoded[0][0]);
+ EXPECT_EQ('B', decoded[0][1]);
+ EXPECT_EQ('C', decoded[0][2]);
+ EXPECT_EQ('D', decoded[1][0]);
+ EXPECT_EQ('E', decoded[1][1]);
+ }
+}
+
+int main(int argc, char **argv) { // test driver: runs global_init/common_init_finish, then every gtest case
+ vector<const char*> args;
+ argv_to_vec(argc, (const char **)argv, args);
+
+ global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
+ common_init_finish(g_ceph_context);
+
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS(); // the gtest result becomes the exit status
+}
+
+// Local Variables:
+// compile-command: "cd ../.. ; make -j4 && make unittest_erasure_code_example && valgrind --leak-check=full --tool=memcheck ./unittest_erasure_code_example --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
+// End:
diff --git a/src/test/osd/TestErasureCodeJerasure.cc b/src/test/osd/TestErasureCodeJerasure.cc
new file mode 100644
index 00000000000..a51cb853c86
--- /dev/null
+++ b/src/test/osd/TestErasureCodeJerasure.cc
@@ -0,0 +1,306 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <errno.h>
+#include "global/global_init.h"
+#include "osd/ErasureCodePluginJerasure/ErasureCodeJerasure.h"
+#include "common/ceph_argparse.h"
+#include "global/global_context.h"
+#include "gtest/gtest.h"
+
+// Empty typed fixture: it exists only so that the TYPED_TEST cases of
+// this file have a type parameter to be instantiated with.
+template <typename T>
+class ErasureCodeTest : public ::testing::Test {
+};
+
+// Every TYPED_TEST(ErasureCodeTest, ...) is instantiated once per type
+// listed here.
+typedef ::testing::Types<
+  ErasureCodeJerasureReedSolomonVandermonde,
+  ErasureCodeJerasureReedSolomonRAID6,
+  ErasureCodeJerasureCauchyOrig,
+  ErasureCodeJerasureCauchyGood,
+  ErasureCodeJerasureLiberation,
+  ErasureCodeJerasureBlaumRoth,
+  ErasureCodeJerasureLiber8tion
+> JerasureTypes;
+TYPED_TEST_CASE(ErasureCodeTest, JerasureTypes);
+
+// Round trip, run once per type in JerasureTypes: a text payload is
+// encoded into four chunks (k=2, m=2), then decoded twice -- with every
+// chunk present, and with chunks 0 and 1 removed.
+//
+// The return codes and the map sizes are checked with ASSERT_* rather
+// than EXPECT_*: the statements that follow them read the chunks through
+// c_str(), which is only meaningful once the call has succeeded.
+TYPED_TEST(ErasureCodeTest, encode_decode)
+{
+  TypeParam jerasure;
+  map<std::string,std::string> parameters;
+  parameters["erasure-code-k"] = "2";
+  parameters["erasure-code-m"] = "2";
+  parameters["erasure-code-w"] = "7";
+  parameters["erasure-code-packetsize"] = "8";
+  jerasure.init(parameters);
+
+  // A typed local constant instead of a #define, so that no macro leaks
+  // into the rest of the translation unit.
+  const unsigned large_enough = 2048;
+  bufferptr in_ptr(large_enough);
+  in_ptr.zero();
+  in_ptr.set_length(0);
+  const char *payload =
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
+  in_ptr.append(payload, strlen(payload));
+  bufferlist in;
+  in.push_front(in_ptr);
+  int want_to_encode[] = { 0, 1, 2, 3 };
+  map<int, bufferlist> encoded;
+  ASSERT_EQ(0, jerasure.encode(set<int>(want_to_encode, want_to_encode+4),
+                               in,
+                               &encoded));
+  ASSERT_EQ(4u, encoded.size());
+  // Chunk 0 must hold the first `length` bytes of the input and chunk 1
+  // the bytes that follow.
+  unsigned length = encoded[0].length();
+  EXPECT_EQ(0, strncmp(encoded[0].c_str(), in.c_str(), length));
+  EXPECT_EQ(0, strncmp(encoded[1].c_str(), in.c_str() + length,
+                       in.length() - length));
+
+
+  // all chunks are available
+  {
+    int want_to_decode[] = { 0, 1 };
+    map<int, bufferlist> decoded;
+    ASSERT_EQ(0, jerasure.decode(set<int>(want_to_decode, want_to_decode+2),
+                                 encoded,
+                                 &decoded));
+    // always decode all, regardless of want_to_decode
+    ASSERT_EQ(4u, decoded.size());
+    EXPECT_EQ(length, decoded[0].length());
+    EXPECT_EQ(0, strncmp(decoded[0].c_str(), in.c_str(), length));
+    EXPECT_EQ(0, strncmp(decoded[1].c_str(), in.c_str() + length,
+                         in.length() - length));
+  }
+
+  // two chunks are missing
+  {
+    map<int, bufferlist> degraded = encoded;
+    degraded.erase(0);
+    degraded.erase(1);
+    EXPECT_EQ(2u, degraded.size());
+    int want_to_decode[] = { 0, 1 };
+    map<int, bufferlist> decoded;
+    ASSERT_EQ(0, jerasure.decode(set<int>(want_to_decode, want_to_decode+2),
+                                 degraded,
+                                 &decoded));
+    // always decode all, regardless of want_to_decode
+    ASSERT_EQ(4u, decoded.size());
+    EXPECT_EQ(length, decoded[0].length());
+    EXPECT_EQ(0, strncmp(decoded[0].c_str(), in.c_str(), length));
+    EXPECT_EQ(0, strncmp(decoded[1].c_str(), in.c_str() + length,
+                         in.length() - length));
+  }
+}
+
+// Checks the answers of minimum_to_decode() for several combinations of
+// wanted and available chunks, once per type in JerasureTypes (k=2, m=2).
+TYPED_TEST(ErasureCodeTest, minimum_to_decode)
+{
+  TypeParam codec;
+  map<std::string,std::string> config;
+  config["erasure-code-k"] = "2";
+  config["erasure-code-m"] = "2";
+  config["erasure-code-w"] = "7";
+  config["erasure-code-packetsize"] = "8";
+  codec.init(config);
+
+  // Nothing is wanted: the call succeeds and the minimum stays empty.
+  {
+    set<int> wanted;
+    set<int> available;
+    set<int> minimum;
+
+    EXPECT_EQ(0, codec.minimum_to_decode(wanted, available, &minimum));
+    EXPECT_TRUE(minimum.empty());
+  }
+
+  // Chunk 0 is wanted but no chunk at all is available: -EIO.
+  {
+    int wanted_ids[] = { 0 };
+    set<int> wanted(wanted_ids, wanted_ids + 1);
+    set<int> available;
+    set<int> minimum;
+
+    EXPECT_EQ(-EIO, codec.minimum_to_decode(wanted, available, &minimum));
+  }
+
+  // The wanted chunk is itself available: the minimum is exactly the
+  // wanted set.
+  {
+    int wanted_ids[] = { 0 };
+    int available_ids[] = { 0 };
+    set<int> wanted(wanted_ids, wanted_ids + 1);
+    set<int> available(available_ids, available_ids + 1);
+    set<int> minimum;
+
+    EXPECT_EQ(0, codec.minimum_to_decode(wanted, available, &minimum));
+    EXPECT_EQ(wanted, minimum);
+  }
+
+  // Chunk 1 is wanted but missing, and fewer than k chunks are
+  // available to rebuild it from: -EIO.
+  {
+    int wanted_ids[] = { 0, 1 };
+    int available_ids[] = { 0 };
+    set<int> wanted(wanted_ids, wanted_ids + 2);
+    set<int> available(available_ids, available_ids + 1);
+    set<int> minimum;
+
+    EXPECT_EQ(-EIO, codec.minimum_to_decode(wanted, available, &minimum));
+  }
+
+  // A wanted chunk (1) is missing, so the minimum may be made of any
+  // available chunks. Picking 2 and 3 could look better because 3 is one
+  // of the wanted chunks, but it is no cheaper than picking 0 and 2: in
+  // both cases decode has to run the same recovery operation with the
+  // same CPU and memory cost. The expectation below is that two chunks
+  // are returned and that 3 is not one of them.
+  {
+    int wanted_ids[] = { 1, 3 };
+    int available_ids[] = { 0, 2, 3 };
+    set<int> wanted(wanted_ids, wanted_ids + 2);
+    set<int> available(available_ids, available_ids + 3);
+    set<int> minimum;
+
+    EXPECT_EQ(0, codec.minimum_to_decode(wanted, available, &minimum));
+    EXPECT_EQ(2u, minimum.size());
+    EXPECT_EQ(0u, minimum.count(3));
+  }
+}
+
+// Checks, for the Reed-Solomon Vandermonde technique (k=2, m=2, w=8),
+// how encode() lays out its chunks relative to the input buffer, based
+// on the value returned by get_alignment().
+TEST(ErasureCodeTest, encode)
+{
+  ErasureCodeJerasureReedSolomonVandermonde codec;
+  map<std::string,std::string> config;
+  config["erasure-code-k"] = "2";
+  config["erasure-code-m"] = "2";
+  config["erasure-code-w"] = "8";
+  codec.init(config);
+
+  unsigned alignment = codec.get_alignment();
+
+  // Input of exactly two alignments: the two data chunks are expected
+  // to point straight into the input buffer, unmodified.
+  {
+    bufferlist in;
+    map<int,bufferlist> encoded;
+    int all_chunks[] = { 0, 1, 2, 3 };
+    in.append(string(alignment * 2, 'X'));
+    EXPECT_EQ(alignment * 2, in.length());
+    EXPECT_EQ(0, codec.encode(set<int>(all_chunks, all_chunks + 4),
+                              in,
+                              &encoded));
+    EXPECT_EQ(4u, encoded.size());
+    for (int chunk = 0; chunk < 4; ++chunk)
+      EXPECT_EQ(alignment, encoded[chunk].length());
+    EXPECT_EQ(in.c_str(), encoded[0].c_str());
+    EXPECT_EQ(in.c_str() + alignment, encoded[1].c_str());
+  }
+
+  // Input that is not a multiple of the alignment: it is padded with
+  // zeros. The first chunk still points into the input buffer; only the
+  // trailing chunk is a separate buffer holding the copied tail
+  // followed by the padding.
+  {
+    bufferlist in;
+    map<int,bufferlist> encoded;
+    int all_chunks[] = { 0, 1, 2, 3 };
+    int trail_length = 10;
+    in.append(string(alignment + trail_length, 'X'));
+    EXPECT_EQ(0, codec.encode(set<int>(all_chunks, all_chunks + 4),
+                              in,
+                              &encoded));
+    EXPECT_EQ(4u, encoded.size());
+    for (int chunk = 0; chunk < 4; ++chunk)
+      EXPECT_EQ(alignment, encoded[chunk].length());
+    EXPECT_EQ(in.c_str(), encoded[0].c_str());
+    EXPECT_NE(in.c_str() + alignment, encoded[1].c_str());
+    char *tail = encoded[1].c_str();
+    EXPECT_EQ('X', tail[0]);
+    EXPECT_EQ('\0', tail[trail_length]);
+  }
+
+  // Only chunk 0 is requested: the encoded map contains that chunk and
+  // nothing else. The buffer allocated internally for padding and the
+  // coding chunks that were computed are released before encode
+  // returns, which running the test through valgrind shows as the
+  // absence of leaks.
+  {
+    bufferlist in;
+    map<int,bufferlist> encoded;
+    set<int> first_only;
+    first_only.insert(0);
+    int trail_length = 10;
+    in.append(string(alignment + trail_length, 'X'));
+    EXPECT_EQ(0, codec.encode(first_only, in, &encoded));
+    EXPECT_EQ(1u, encoded.size());
+    EXPECT_EQ(alignment, encoded[0].length());
+    EXPECT_EQ(in.c_str(), encoded[0].c_str());
+  }
+}
+
+// Test-runner entry point: the command line is handed to the Ceph
+// initialisation calls and then to Google Test; the process exit status
+// is whatever RUN_ALL_TESTS() returns.
+int main(int argc, char **argv)
+{
+  vector<const char*> ceph_args;
+  const char **const_argv = const_cast<const char **>(argv);
+  argv_to_vec(argc, const_argv, ceph_args);
+
+  global_init(NULL, ceph_args, CEPH_ENTITY_TYPE_CLIENT,
+              CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+
+  ::testing::InitGoogleTest(&argc, argv);
+  const int status = RUN_ALL_TESTS();
+  return status;
+}
+
+/*
+ * Local Variables:
+ * compile-command: "cd ../.. ; make -j4 &&
+ * make unittest_erasure_code_jerasure &&
+ * valgrind --tool=memcheck --leak-check=full \
+ * ./unittest_erasure_code_jerasure \
+ * --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
+ * End:
+ */
diff --git a/src/test/osd/TestErasureCodePlugin.cc b/src/test/osd/TestErasureCodePlugin.cc
new file mode 100644
index 00000000000..46ed4b1730d
--- /dev/null
+++ b/src/test/osd/TestErasureCodePlugin.cc
@@ -0,0 +1,108 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <errno.h>
+#include <signal.h>
+#include "common/Thread.h"
+#include "global/global_init.h"
+#include "osd/ErasureCodePlugin.h"
+#include "common/ceph_argparse.h"
+#include "global/global_context.h"
+#include "gtest/gtest.h"
+
+// Fixture for the plugin registry tests; its only content is a helper
+// thread class.
+class ErasureCodePluginRegistryTest : public ::testing::Test {
+protected:
+
+  // Thread whose body asks the registry singleton to build the plugin
+  // named "hangs", looked up in the ".libs" directory. The outcome of
+  // the factory call is ignored and the thread result is always NULL.
+  class Thread_factory : public Thread {
+  public:
+    virtual void *entry() {
+      map<std::string,std::string> config;
+      config["erasure-code-directory"] = ".libs";
+      ErasureCodeInterfaceRef unused;
+      ErasureCodePluginRegistry::instance().factory("hangs", config, &unused);
+      return NULL;
+    }
+  };
+
+};
+
+// Verifies that the registry lock is held while a plugin is being
+// loaded: a helper thread starts loading the "hangs" plugin and, once
+// the registry reports `loading`, TryLock() must fail.
+TEST_F(ErasureCodePluginRegistryTest, factory_mutex) {
+  ErasureCodePluginRegistry &registry = ErasureCodePluginRegistry::instance();
+
+  // Before any load is in progress the lock can be taken.
+  EXPECT_TRUE(registry.lock.TryLock());
+  registry.lock.Unlock();
+
+  //
+  // Start the load in a separate thread, then poll `loading`, sleeping
+  // a little longer on every round, until it is set or the delay
+  // reaches the upper bound.
+  //
+  const useconds_t DELAY_MAX = 20 * 1000 * 1000;
+  useconds_t pause_us = 0;
+  Thread_factory loader;
+  loader.create();
+  do {
+    cout << "Trying (1) with delay " << pause_us << "us\n";
+    if (pause_us > 0)
+      usleep(pause_us);
+    if (!registry.loading)
+      pause_us = ( pause_us + 1 ) * 2;
+  } while(!registry.loading && pause_us < DELAY_MAX);
+  ASSERT_TRUE(pause_us < DELAY_MAX);
+
+  // The load is in progress: the lock must not be available.
+  EXPECT_FALSE(registry.lock.TryLock());
+
+  // Cancel the helper thread and wait for it to terminate.
+  EXPECT_EQ(0, pthread_cancel(loader.get_thread_id()));
+  EXPECT_EQ(0, loader.join());
+}
+
+// Walks through the error codes factory() is expected to return for a
+// series of deliberately broken plugins, then loads the "example" plugin
+// successfully and checks that loading it a second time is refused.
+TEST_F(ErasureCodePluginRegistryTest, all)
+{
+  map<std::string,std::string> config;
+  config["erasure-code-directory"] = ".libs";
+  ErasureCodeInterfaceRef codec;
+  ErasureCodePluginRegistry &registry = ErasureCodePluginRegistry::instance();
+
+  // Plugin name -> error code factory() must return for it. The
+  // reference must still be unset before each attempt.
+  static const struct {
+    const char *plugin;
+    int expected;
+  } failures[] = {
+    { "invalid",             -EIO },
+    { "missing_entry_point", -ENOENT },
+    { "fail_to_initialize",  -ESRCH },
+    { "fail_to_register",    -EBADF },
+  };
+  const unsigned failure_count = sizeof(failures) / sizeof(failures[0]);
+  for (unsigned i = 0; i < failure_count; ++i) {
+    EXPECT_FALSE(codec);
+    EXPECT_EQ(failures[i].expected,
+              registry.factory(failures[i].plugin, config, &codec));
+  }
+
+  EXPECT_FALSE(codec);
+  EXPECT_EQ(0, registry.factory("example", config, &codec));
+  EXPECT_TRUE(codec);
+
+  // "example" is registered by now, so an explicit load reports -EEXIST.
+  ErasureCodePlugin *plugin = 0;
+  EXPECT_EQ(-EEXIST, registry.load("example", config, &plugin));
+}
+
+// Test-runner entry point: the command line is handed to the Ceph
+// initialisation calls and then to Google Test; the process exit status
+// is whatever RUN_ALL_TESTS() returns.
+int main(int argc, char **argv) {
+  vector<const char*> ceph_args;
+  const char **const_argv = const_cast<const char **>(argv);
+  argv_to_vec(argc, const_argv, ceph_args);
+
+  global_init(NULL, ceph_args, CEPH_ENTITY_TYPE_CLIENT,
+              CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+
+  ::testing::InitGoogleTest(&argc, argv);
+  const int status = RUN_ALL_TESTS();
+  return status;
+}
+
+// Local Variables:
+// compile-command: "cd ../.. ; make -j4 && make unittest_erasure_code_plugin && valgrind --leak-check=full --tool=memcheck ./unittest_erasure_code_plugin --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
+// End:
diff --git a/src/test/osd/TestErasureCodePluginJerasure.cc b/src/test/osd/TestErasureCodePluginJerasure.cc
new file mode 100644
index 00000000000..2f558937595
--- /dev/null
+++ b/src/test/osd/TestErasureCodePluginJerasure.cc
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <errno.h>
+#include "global/global_init.h"
+#include "osd/ErasureCodePlugin.h"
+#include "common/ceph_argparse.h"
+#include "global/global_context.h"
+#include "gtest/gtest.h"
+
+// Checks the "jerasure" plugin factory: without an
+// "erasure-code-technique" parameter it is expected to return -ENOENT,
+// and with each of the technique names listed below it is expected to
+// succeed and set the reference.
+TEST(ErasureCodePlugin, factory)
+{
+  ErasureCodePluginRegistry &registry = ErasureCodePluginRegistry::instance();
+  map<std::string,std::string> config;
+  config["erasure-code-directory"] = ".libs";
+
+  // No technique selected yet.
+  {
+    ErasureCodeInterfaceRef codec;
+    EXPECT_FALSE(codec);
+    EXPECT_EQ(-ENOENT, registry.factory("jerasure", config, &codec));
+    EXPECT_FALSE(codec);
+  }
+
+  static const char *const techniques[] = {
+    "reed_sol_van",
+    "reed_sol_r6_op",
+    "cauchy_orig",
+    "cauchy_good",
+    "liberation",
+    "blaum_roth",
+    "liber8tion",
+  };
+  const unsigned technique_count = sizeof(techniques) / sizeof(techniques[0]);
+  for (unsigned i = 0; i < technique_count; ++i) {
+    // A fresh reference per technique: it starts unset and must be set
+    // by a successful factory call.
+    ErasureCodeInterfaceRef codec;
+    config["erasure-code-technique"] = techniques[i];
+    EXPECT_FALSE(codec);
+    EXPECT_EQ(0, registry.factory("jerasure", config, &codec));
+    EXPECT_TRUE(codec);
+  }
+}
+
+// Test-runner entry point: the command line is handed to the Ceph
+// initialisation calls and then to Google Test; the process exit status
+// is whatever RUN_ALL_TESTS() returns.
+int main(int argc, char **argv)
+{
+  vector<const char*> ceph_args;
+  const char **const_argv = const_cast<const char **>(argv);
+  argv_to_vec(argc, const_argv, ceph_args);
+
+  global_init(NULL, ceph_args, CEPH_ENTITY_TYPE_CLIENT,
+              CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+
+  ::testing::InitGoogleTest(&argc, argv);
+  const int status = RUN_ALL_TESTS();
+  return status;
+}
+
+/*
+ * Local Variables:
+ * compile-command: "cd ../.. ; make -j4 &&
+ * make unittest_erasure_code_plugin_jerasure &&
+ * valgrind --tool=memcheck ./unittest_erasure_code_plugin_jerasure \
+ * --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
+ * End:
+ */
+
diff --git a/src/test/osd/TestRados.cc b/src/test/osd/TestRados.cc
index 6ac661c0629..7158f50a74a 100644
--- a/src/test/osd/TestRados.cc
+++ b/src/test/osd/TestRados.cc
@@ -48,8 +48,8 @@ public:
if (m_op <= m_objects) {
stringstream oid;
oid << m_op;
- cout << m_op << ": Writing initial " << oid.str() << std::endl;
- return new WriteOp(&context, oid.str());
+ cout << m_op << ": write initial oid " << oid.str() << std::endl;
+ return new WriteOp(m_op, &context, oid.str());
} else if (m_op >= m_ops) {
return NULL;
}
@@ -71,7 +71,6 @@ public:
it != m_weight_sums.end();
++it) {
if (rand_val < it->second) {
- cout << m_op << ": ";
retval = gen_op(context, it->first);
break;
}
@@ -84,73 +83,87 @@ private:
TestOp *gen_op(RadosTestContext &context, TestOpType type)
{
- string oid;
- cout << "oids not in use " << context.oid_not_in_use.size() << std::endl;
+ string oid, oid2;
+ //cout << "oids not in use " << context.oid_not_in_use.size() << std::endl;
assert(context.oid_not_in_use.size());
+
+ cout << m_op << ": ";
switch (type) {
case TEST_OP_READ:
oid = *(rand_choose(context.oid_not_in_use));
- cout << "Reading " << oid << std::endl;
- return new ReadOp(&context, oid, m_stats);
+ cout << "read oid " << oid << std::endl;
+ return new ReadOp(m_op, &context, oid, m_stats);
case TEST_OP_WRITE:
oid = *(rand_choose(context.oid_not_in_use));
- cout << "Writing " << oid << " current snap is "
+ cout << "write oid " << oid << " current snap is "
<< context.current_snap << std::endl;
- return new WriteOp(&context, oid, m_stats);
+ return new WriteOp(m_op, &context, oid, m_stats);
case TEST_OP_DELETE:
oid = *(rand_choose(context.oid_not_in_use));
- cout << "Deleting " << oid << " current snap is "
+ cout << "delete oid " << oid << " current snap is "
<< context.current_snap << std::endl;
- return new DeleteOp(&context, oid, m_stats);
+ return new DeleteOp(m_op, &context, oid, m_stats);
case TEST_OP_SNAP_CREATE:
- cout << "Snapping" << std::endl;
- return new SnapCreateOp(&context, m_stats);
+ cout << "snap_create" << std::endl;
+ return new SnapCreateOp(m_op, &context, m_stats);
case TEST_OP_SNAP_REMOVE:
if (context.snaps.empty()) {
return NULL;
} else {
int snap = rand_choose(context.snaps)->first;
- cout << "RemovingSnap " << snap << std::endl;
- return new SnapRemoveOp(&context, snap, m_stats);
+ cout << "snap_remove snap " << snap << std::endl;
+ return new SnapRemoveOp(m_op, &context, snap, m_stats);
}
case TEST_OP_ROLLBACK:
- if (context.snaps.empty()) {
+ if (context.snaps.size() <= context.snaps_in_use.size()) {
return NULL;
- } else {
+ }
+ while (true) {
int snap = rand_choose(context.snaps)->first;
+ if (context.snaps_in_use.count(snap))
+ continue; // in use; try again!
string oid = *(rand_choose(context.oid_not_in_use));
- cout << "RollingBack " << oid << " to " << snap << std::endl;
- return new RollbackOp(&context, oid, snap);
+ cout << "rollback oid " << oid << " to " << snap << std::endl;
+ return new RollbackOp(m_op, &context, oid, snap);
}
case TEST_OP_SETATTR:
oid = *(rand_choose(context.oid_not_in_use));
- cout << "Setting attrs on " << oid
+ cout << "setattr oid " << oid
<< " current snap is " << context.current_snap << std::endl;
- return new SetAttrsOp(&context, oid, m_stats);
+ return new SetAttrsOp(m_op, &context, oid, m_stats);
case TEST_OP_RMATTR:
oid = *(rand_choose(context.oid_not_in_use));
- cout << "Removing attrs on " << oid
+ cout << "rmattr oid " << oid
<< " current snap is " << context.current_snap << std::endl;
- return new RemoveAttrsOp(&context, oid, m_stats);
+ return new RemoveAttrsOp(m_op, &context, oid, m_stats);
case TEST_OP_TMAPPUT:
oid = *(rand_choose(context.oid_not_in_use));
- cout << "Setting tmap on " << oid
+ cout << "tmapput oid " << oid
<< " current snap is " << context.current_snap << std::endl;
- return new TmapPutOp(&context, oid, m_stats);
+ return new TmapPutOp(m_op, &context, oid, m_stats);
case TEST_OP_WATCH:
oid = *(rand_choose(context.oid_not_in_use));
- cout << "Watching " << oid
+ cout << "watch oid " << oid
+ << " current snap is " << context.current_snap << std::endl;
+ return new WatchOp(m_op, &context, oid, m_stats);
+
+ case TEST_OP_COPY_FROM:
+ oid = *(rand_choose(context.oid_not_in_use));
+ do {
+ oid2 = *(rand_choose(context.oid_not_in_use));
+ } while (oid == oid2);
+ cout << "copy_from oid " << oid << " from oid " << oid2
<< " current snap is " << context.current_snap << std::endl;
- return new WatchOp(&context, oid, m_stats);
+ return new CopyFromOp(m_op, &context, oid, oid2, m_stats);
default:
cerr << "Invalid op type " << type << std::endl;
@@ -192,6 +205,7 @@ int main(int argc, char **argv)
{ TEST_OP_RMATTR, "rmattr" },
{ TEST_OP_TMAPPUT, "tmapput" },
{ TEST_OP_WATCH, "watch" },
+ { TEST_OP_COPY_FROM, "copy_from" },
{ TEST_OP_READ /* grr */, NULL },
};
@@ -273,8 +287,8 @@ int main(int argc, char **argv)
return 1;
}
- if (max_in_flight > objects) {
- cerr << "Error: max_in_flight must be less than the number of objects"
+ if (max_in_flight * 2 > objects) {
+ cerr << "Error: max_in_flight must be <= than the number of objects / 2"
<< std::endl;
return 1;
}
diff --git a/src/test/perf_counters.cc b/src/test/perf_counters.cc
index d0b05f9f049..c44a15ef856 100644
--- a/src/test/perf_counters.cc
+++ b/src/test/perf_counters.cc
@@ -11,6 +11,10 @@
* Foundation. See file COPYING.
*
*/
+#include "include/int_types.h"
+#include "include/types.h" // FIXME: ordering shouldn't be important, but right
+ // now, this include has to come before the others.
+
#include "common/perf_counters.h"
#include "common/admin_socket_client.h"
@@ -19,9 +23,6 @@
#include "common/errno.h"
#include "common/safe_io.h"
-#include "include/types.h" // FIXME: ordering shouldn't be important, but right
- // now, this include has to come before the others.
-
#include "common/code_environment.h"
#include "global/global_context.h"
#include "global/global_init.h"
@@ -30,7 +31,6 @@
#include <errno.h>
#include <fcntl.h>
-#include <inttypes.h>
#include <map>
#include <poll.h>
#include <sstream>
diff --git a/src/test/pybind/test_ceph_argparse.py b/src/test/pybind/test_ceph_argparse.py
new file mode 100755
index 00000000000..9b16d91108d
--- /dev/null
+++ b/src/test/pybind/test_ceph_argparse.py
@@ -0,0 +1,1056 @@
+#!/usr/bin/nosetests --nocapture
+# -*- mode:python; tab-width:4; indent-tabs-mode:t -*-
+# vim: ts=4 sw=4 smarttab expandtab
+#
+# Ceph - scalable distributed file system
+#
+# Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+
+from nose.tools import eq_ as eq
+from nose.tools import *
+
+from ceph_argparse import validate_command, parse_json_funcsigs
+
+import os
+import re
+import json
+
+def get_command_descriptions(what):
+    """Run ``./get_command_descriptions --<what>`` and return its JSON text.
+
+    The helper is run through the shell with stderr merged into stdout,
+    and only the output lines containing ``cmd000`` are kept. Whatever
+    comes before the first ``{`` at the start of that output is then
+    dropped, so that the returned string begins with the JSON object.
+
+    :param what: option name appended after ``--`` on the command line
+    :returns: the filtered output as a string
+    """
+    output = os.popen("./get_command_descriptions " + "--" + what
+                      + " 2>&1 | grep cmd000").read()
+    # The replacement is a raw string: "\g" is not a valid string escape
+    # and newer Python versions warn about it in a plain literal.
+    return re.sub(r'^.*?(\{.*\})', r'\g<1>', output)
+
+def test_parse_json_funcsigs():
+    """parse_json_funcsigs accepts the full command list and raises
+    TypeError on the "pull585" descriptions."""
+    all_commands = get_command_descriptions("all")
+    cmd_json = parse_json_funcsigs(all_commands, 'cli')
+
+    # syntax error https://github.com/ceph/ceph/pull/585
+    broken_commands = get_command_descriptions("pull585")
+    assert_raises(TypeError, parse_json_funcsigs, broken_commands, 'cli')
+
+# Signature dictionary shared by every test class below; it is built
+# once, when the module is imported.
+sigdict = parse_json_funcsigs(get_command_descriptions("all"), 'cli')
+
+
+class TestArgparse:
+    """Helpers shared by the per-command test classes.
+
+    Every check validates a command line against the module-level
+    ``sigdict``; a rejected command line is one for which
+    ``validate_command`` returns ``{}``.
+    """
+
+    def assert_valid_command(self, args):
+        # Accepted means: neither None nor an empty dict.
+        assert_not_in(validate_command(sigdict, args), [None, {}])
+
+    def check_1_natural_arg(self, prefix, command):
+        """Exactly one non-negative integer argument is required."""
+        base = [prefix, command]
+        self.assert_valid_command(base + ['1'])
+        for rejected in ([], ['-1'], ['1', '1']):
+            assert_equal({}, validate_command(sigdict, base + rejected))
+
+    def check_0_or_1_natural_arg(self, prefix, command):
+        """The non-negative integer argument is optional."""
+        base = [prefix, command]
+        for accepted in (['1'], []):
+            self.assert_valid_command(base + accepted)
+        for rejected in (['-1'], ['1', '1']):
+            assert_equal({}, validate_command(sigdict, base + rejected))
+
+    def check_1_string_arg(self, prefix, command):
+        """Exactly one string argument is required."""
+        base = [prefix, command]
+        assert_equal({}, validate_command(sigdict, list(base)))
+        self.assert_valid_command(base + ['string'])
+        assert_equal({}, validate_command(sigdict,
+                                          base + ['string', 'toomany']))
+
+    def check_1_or_more_string_args(self, prefix, command):
+        """At least one string argument is required."""
+        base = [prefix, command]
+        assert_equal({}, validate_command(sigdict, list(base)))
+        for accepted in (['string'], ['string', 'more string']):
+            self.assert_valid_command(base + accepted)
+
+    def check_no_arg(self, prefix, command):
+        """The command takes no argument at all."""
+        base = [prefix, command]
+        self.assert_valid_command(list(base))
+        assert_equal({}, validate_command(sigdict, base + ['toomany']))
+
+
+
+
+class TestPG(TestArgparse):
+    """Argument validation of the ``pg`` commands."""
+
+    def test_stat(self):
+        self.assert_valid_command(['pg', 'stat'])
+
+    def test_getmap(self):
+        self.assert_valid_command(['pg', 'getmap'])
+
+    def test_send_pg_creates(self):
+        self.assert_valid_command(['pg', 'send_pg_creates'])
+
+    def test_dump(self):
+        contents = ['all', 'summary', 'sum', 'delta',
+                    'pools', 'osds', 'pgs', 'pgs_brief']
+        self.assert_valid_command(['pg', 'dump'])
+        self.assert_valid_command(['pg', 'dump'] + contents)
+        assert_equal({}, validate_command(sigdict,
+                                          ['pg', 'dump', 'invalid']))
+
+    def test_dump_json(self):
+        contents = ['all', 'summary', 'sum', 'pools', 'osds', 'pgs']
+        self.assert_valid_command(['pg', 'dump_json'])
+        self.assert_valid_command(['pg', 'dump_json'] + contents)
+        assert_equal({}, validate_command(sigdict,
+                                          ['pg', 'dump_json', 'invalid']))
+
+    def test_dump_pools_json(self):
+        self.assert_valid_command(['pg', 'dump_pools_json'])
+
+    def test_dump_pools_stuck(self):
+        # Despite its name, this case exercises 'pg dump_stuck'.
+        states = ['inactive', 'unclean', 'stale']
+        self.assert_valid_command(['pg', 'dump_stuck'])
+        self.assert_valid_command(['pg', 'dump_stuck'] + states)
+        assert_equal({}, validate_command(sigdict,
+                                          ['pg', 'dump_stuck', 'invalid']))
+        self.assert_valid_command(['pg', 'dump_stuck', 'inactive', '1234'])
+
+    def one_pgid(self, command):
+        """``pg <command>`` accepts '1.1' and rejects both a missing
+        argument and the bare '1'."""
+        self.assert_valid_command(['pg', command, '1.1'])
+        for rejected in ([], ['1']):
+            assert_equal({}, validate_command(sigdict,
+                                              ['pg', command] + rejected))
+
+    def test_map(self):
+        self.one_pgid('map')
+
+    def test_scrub(self):
+        self.one_pgid('scrub')
+
+    def test_deep_scrub(self):
+        self.one_pgid('deep-scrub')
+
+    def test_repair(self):
+        self.one_pgid('repair')
+
+    def test_debug(self):
+        for topic in ('unfound_objects_exist', 'degraded_pgs_exist'):
+            self.assert_valid_command(['pg', 'debug', topic])
+        for rejected in ([], ['invalid']):
+            assert_equal({}, validate_command(sigdict,
+                                              ['pg', 'debug'] + rejected))
+
+    def test_force_create_pg(self):
+        self.one_pgid('force_create_pg')
+
+    def set_ratio(self, command):
+        """``pg <command>`` accepts '0.0' and rejects both a missing
+        argument and '2.0'."""
+        self.assert_valid_command(['pg', command, '0.0'])
+        for rejected in ([], ['2.0']):
+            assert_equal({}, validate_command(sigdict,
+                                              ['pg', command] + rejected))
+
+    def test_set_full_ratio(self):
+        self.set_ratio('set_full_ratio')
+
+    def test_set_nearfull_ratio(self):
+        self.set_ratio('set_nearfull_ratio')
+
+
+class TestAuth(TestArgparse):
+    """Argument validation of the ``auth`` commands."""
+
+    def test_export(self):
+        # The entity name is optional; a second argument is too many.
+        for accepted in ([], ['string']):
+            self.assert_valid_command(['auth', 'export'] + accepted)
+        assert_equal({}, validate_command(sigdict, ['auth',
+                                                    'export',
+                                                    'string',
+                                                    'toomany']))
+
+    def test_get(self):
+        self.check_1_string_arg('auth', 'get')
+
+    def test_get_key(self):
+        self.check_1_string_arg('auth', 'get-key')
+
+    def test_print_key(self):
+        # Both spellings of the command are checked.
+        for spelling in ('print-key', 'print_key'):
+            self.check_1_string_arg('auth', spelling)
+
+    def test_list(self):
+        self.check_no_arg('auth', 'list')
+
+    def test_import(self):
+        self.check_no_arg('auth', 'import')
+
+    def test_add(self):
+        self.check_1_or_more_string_args('auth', 'add')
+
+    def test_get_or_create_key(self):
+        self.check_1_or_more_string_args('auth', 'get-or-create-key')
+
+    def test_get_or_create(self):
+        self.check_1_or_more_string_args('auth', 'get-or-create')
+
+    def test_caps(self):
+        # At least two string arguments are needed.
+        for rejected in ([], ['string']):
+            assert_equal({}, validate_command(sigdict,
+                                              ['auth', 'caps'] + rejected))
+        self.assert_valid_command(['auth', 'caps', 'string', 'more string'])
+
+    def test_del(self):
+        self.check_1_string_arg('auth', 'del')
+
+
+class TestMonitor(TestArgparse):
+    """Argument validation of the commands that take no family prefix."""
+
+    def test_compact(self):
+        self.assert_valid_command(['compact'])
+
+    def test_scrub(self):
+        self.assert_valid_command(['scrub'])
+
+    def test_fsid(self):
+        self.assert_valid_command(['fsid'])
+
+    def test_log(self):
+        assert_equal({}, validate_command(sigdict, ['log']))
+        for accepted in (['a logtext'], ['a logtext', 'and another']):
+            self.assert_valid_command(['log'] + accepted)
+
+    def test_injectargs(self):
+        assert_equal({}, validate_command(sigdict, ['injectargs']))
+        for accepted in (['one'], ['one', 'two']):
+            self.assert_valid_command(['injectargs'] + accepted)
+
+    def test_status(self):
+        self.assert_valid_command(['status'])
+
+    def test_health(self):
+        for accepted in ([], ['detail']):
+            self.assert_valid_command(['health'] + accepted)
+        for rejected in (['invalid'], ['detail', 'toomany']):
+            assert_equal({}, validate_command(sigdict,
+                                              ['health'] + rejected))
+
+    def test_df(self):
+        for accepted in ([], ['detail']):
+            self.assert_valid_command(['df'] + accepted)
+        for rejected in (['invalid'], ['detail', 'toomany']):
+            assert_equal({}, validate_command(sigdict, ['df'] + rejected))
+
+    def test_report(self):
+        for accepted in ([], ['tag1'], ['tag1', 'tag2']):
+            self.assert_valid_command(['report'] + accepted)
+
+    def test_quorum_status(self):
+        self.assert_valid_command(['quorum_status'])
+
+    def test_mon_status(self):
+        self.assert_valid_command(['mon_status'])
+
+    def test_sync_force(self):
+        flags = ['--yes-i-really-mean-it', '--i-know-what-i-am-doing']
+        # Both flags, the first flag alone, or no flag are accepted.
+        self.assert_valid_command(['sync', 'force'] + flags)
+        self.assert_valid_command(['sync', 'force'] + flags[:1])
+        self.assert_valid_command(['sync', 'force'])
+        assert_equal({}, validate_command(sigdict, ['sync']))
+        assert_equal({}, validate_command(sigdict,
+                                          ['sync', 'force'] + flags
+                                          + ['toomany']))
+
+    def test_heap(self):
+        for rejected in ([], ['invalid']):
+            assert_equal({}, validate_command(sigdict, ['heap'] + rejected))
+        for action in ('dump', 'start_profiler', 'stop_profiler',
+                       'release', 'stats'):
+            self.assert_valid_command(['heap', action])
+
+    def test_quorum(self):
+        for rejected in ([], ['invalid']):
+            assert_equal({}, validate_command(sigdict,
+                                              ['quorum'] + rejected))
+        for action in ('enter', 'exit'):
+            self.assert_valid_command(['quorum', action])
+        assert_equal({}, validate_command(sigdict, ['quorum',
+                                                    'enter',
+                                                    'toomany']))
+
+    def test_tell(self):
+        for rejected in ([], ['invalid']):
+            assert_equal({}, validate_command(sigdict, ['tell'] + rejected))
+        for name in ('osd', 'mon', 'client', 'mds'):
+            target = name + ".42"
+            # A bare type, or a target with nothing to tell it, is
+            # rejected.
+            assert_equal({}, validate_command(sigdict, ['tell', name]))
+            assert_equal({}, validate_command(sigdict, ['tell', target]))
+            self.assert_valid_command(['tell', target, 'something'])
+            self.assert_valid_command(['tell', target,
+                                       'something',
+                                       'something else'])
+
+
+class TestMDS(TestArgparse):
+
+ def test_stat(self):
+ self.check_no_arg('mds', 'stat')
+
+ def test_dump(self):
+ self.check_0_or_1_natural_arg('mds', 'dump')
+
+ def test_tell(self):
+ self.assert_valid_command(['mds', 'tell',
+ 'someone',
+ 'something'])
+ self.assert_valid_command(['mds', 'tell',
+ 'someone',
+ 'something',
+ 'something else'])
+ assert_equal({}, validate_command(sigdict, ['mds', 'tell']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'tell',
+ 'someone']))
+
+ def test_compat_show(self):
+ self.assert_valid_command(['mds', 'compat', 'show'])
+ assert_equal({}, validate_command(sigdict, ['mds', 'compat']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'compat',
+ 'show', 'toomany']))
+
+ def test_stop(self):
+ self.assert_valid_command(['mds', 'stop', 'someone'])
+ assert_equal({}, validate_command(sigdict, ['mds', 'stop']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'stop',
+ 'someone', 'toomany']))
+
+ def test_deactivate(self):
+ self.assert_valid_command(['mds', 'deactivate', 'someone'])
+ assert_equal({}, validate_command(sigdict, ['mds', 'deactivate']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'deactivate',
+ 'someone', 'toomany']))
+
+ def test_set_max_mds(self):
+ self.check_1_natural_arg('mds', 'set_max_mds')
+
+ def test_setmap(self):
+ self.check_1_natural_arg('mds', 'setmap')
+
+ def test_set_state(self):
+ self.assert_valid_command(['mds', 'set_state', '1', '2'])
+ assert_equal({}, validate_command(sigdict, ['mds', 'set_state']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'set_state', '-1']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'set_state',
+ '1', '-1']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'set_state',
+ '1', '21']))
+
+ def test_fail(self):
+ self.check_1_string_arg('mds', 'fail')
+
+ def test_rm(self):
+ assert_equal({}, validate_command(sigdict, ['mds', 'rm']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'rm', '1']))
+ for name in ('osd', 'mon', 'client', 'mds'):
+ self.assert_valid_command(['mds', 'rm', '1', name + '.42'])
+ assert_equal({}, validate_command(sigdict, ['mds', 'rm',
+ '-1', name + '.42']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'rm',
+ '-1', name]))
+ assert_equal({}, validate_command(sigdict, ['mds', 'rm',
+ '1', name + '.42',
+ 'toomany']))
+
+ def test_rmfailed(self):
+ self.check_1_natural_arg('mds', 'rmfailed')
+
+ def test_cluster_down(self):
+ self.check_no_arg('mds', 'cluster_down')
+
+ def test_cluster_up(self):
+ self.check_no_arg('mds', 'cluster_up')
+
+ def test_compat_rm_compat(self):
+ self.assert_valid_command(['mds', 'compat', 'rm_compat', '1'])
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'compat',
+ 'rm_compat']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'compat',
+ 'rm_compat', '-1']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'compat',
+ 'rm_compat', '1', '1']))
+
+ def test_incompat_rm_incompat(self):
+ self.assert_valid_command(['mds', 'compat', 'rm_incompat', '1'])
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'compat',
+ 'rm_incompat']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'compat',
+ 'rm_incompat', '-1']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'compat',
+ 'rm_incompat', '1', '1']))
+
+ def test_mds_set(self):
+ self.assert_valid_command(['mds', 'set', 'allow_new_snaps'])
+ self.assert_valid_command(['mds', 'set', 'allow_new_snaps', 'sure'])
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'set',
+ 'invalid']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'set',
+ 'allow_new_snaps',
+ 'sure',
+ 'toomany']))
+
+ def test_mds_unset(self):
+ self.assert_valid_command(['mds', 'unset', 'allow_new_snaps'])
+ self.assert_valid_command(['mds', 'unset', 'allow_new_snaps', 'sure'])
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'unset',
+ 'invalid']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'unset',
+ 'allow_new_snaps',
+ 'sure',
+ 'toomany']))
+
+ def test_add_data_pool(self):
+ self.check_1_natural_arg('mds', 'add_data_pool')
+
+ def test_remove_data_pool(self):
+ self.check_1_natural_arg('mds', 'remove_data_pool')
+
+ def test_newfs(self):
+ self.assert_valid_command(['mds', 'newfs', '1', '2',
+ '--yes-i-really-mean-it'])
+ self.assert_valid_command(['mds', 'newfs', '1', '2'])
+ assert_equal({}, validate_command(sigdict, ['mds', 'newfs']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'newfs', '1']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'newfs',
+ '1',
+ '2',
+ '--yes-i-really-mean-it',
+ 'toomany']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'newfs',
+ '-1',
+ '2',
+ '--yes-i-really-mean-it']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'newfs',
+ '1',
+ '-1',
+ '--yes-i-really-mean-it']))
+
+
+class TestMon(TestArgparse):
+
+ def test_dump(self):
+ self.check_0_or_1_natural_arg('mon', 'dump')
+
+ def test_stat(self):
+ self.check_no_arg('mon', 'stat')
+
+ def test_getmap(self):
+ self.check_0_or_1_natural_arg('mon', 'getmap')
+
+ def test_add(self):
+ self.assert_valid_command(['mon', 'add', 'name', '1.2.3.4:1234'])
+ assert_equal({}, validate_command(sigdict, ['mon', 'add']))
+ assert_equal({}, validate_command(sigdict, ['mon', 'add', 'name']))
+ assert_equal({}, validate_command(sigdict, ['mon', 'add',
+ 'name',
+ '400.500.600.700']))
+ assert_equal({}, validate_command(sigdict, ['mon', 'add', 'name',
+ '1.2.3.4:1234',
+ 'toomany']))
+
+ def test_remove(self):
+ self.assert_valid_command(['mon', 'remove', 'name'])
+ assert_equal({}, validate_command(sigdict, ['mon', 'remove']))
+ assert_equal({}, validate_command(sigdict, ['mon', 'remove',
+ 'name', 'toomany']))
+
+
+class TestOSD(TestArgparse):
+
+ def test_stat(self):
+ self.check_no_arg('osd', 'stat')
+
+ def test_dump(self):
+ self.check_0_or_1_natural_arg('osd', 'dump')
+
+ def test_osd_tree(self):
+ self.check_0_or_1_natural_arg('osd', 'tree')
+
+ def test_osd_ls(self):
+ self.check_0_or_1_natural_arg('osd', 'ls')
+
+ def test_osd_getmap(self):
+ self.check_0_or_1_natural_arg('osd', 'getmap')
+
+ def test_osd_getcrushmap(self):
+ self.check_0_or_1_natural_arg('osd', 'getcrushmap')
+
+ def test_perf(self):
+ self.check_no_arg('osd', 'perf')
+
+ def test_getmaxosd(self):
+ self.check_no_arg('osd', 'getmaxosd')
+
+ def test_find(self):
+ self.check_1_natural_arg('osd', 'find')
+
+ def test_map(self):
+ self.assert_valid_command(['osd', 'map', 'poolname', 'objectname'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'map']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'map', 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'map',
+ 'poolname', 'objectname',
+ 'toomany']))
+
+ def test_scrub(self):
+ self.check_1_string_arg('osd', 'scrub')
+
+ def test_deep_scrub(self):
+ self.check_1_string_arg('osd', 'deep-scrub')
+
+ def test_repair(self):
+ self.check_1_string_arg('osd', 'repair')
+
+ def test_lspools(self):
+ self.assert_valid_command(['osd', 'lspools'])
+ self.assert_valid_command(['osd', 'lspools', '1'])
+ self.assert_valid_command(['osd', 'lspools', '-1'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'lspools',
+ '1', 'toomany']))
+
+ def test_blacklist_ls(self):
+ self.assert_valid_command(['osd', 'blacklist', 'ls'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'blacklist']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'blacklist',
+ 'ls', 'toomany']))
+
+ def test_crush_rule(self):
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush', 'rule']))
+ for subcommand in ('list', 'ls', 'dump'):
+ self.assert_valid_command(['osd', 'crush', 'rule', subcommand])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'rule', subcommand,
+ 'toomany']))
+
+ def test_crush_dump(self):
+ self.assert_valid_command(['osd', 'crush', 'dump'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'dump', 'toomany']))
+
+ def test_setcrushmap(self):
+ self.check_no_arg('osd', 'setcrushmap')
+
+ def test_crush_add_bucket(self):
+ self.assert_valid_command(['osd', 'crush', 'add-bucket',
+ 'name', 'type'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'add-bucket']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'add-bucket', 'name',
+ 'type',
+ 'toomany']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'add-bucket', '!!!',
+ 'type']))
+
+ def check_crush_setter(self, setter):
+ self.assert_valid_command(['osd', 'crush', setter,
+ '*', '2.3', 'AZaz09-_.='])
+ self.assert_valid_command(['osd', 'crush', setter,
+ 'osd.0', '2.3', 'AZaz09-_.='])
+ self.assert_valid_command(['osd', 'crush', setter,
+ '0', '2.3', 'AZaz09-_.='])
+ self.assert_valid_command(['osd', 'crush', setter,
+ '0', '2.3', 'AZaz09-_.=', 'AZaz09-_.='])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ setter,
+ 'osd.0']))
+ assert_in(validate_command(sigdict, ['osd', 'crush',
+ setter,
+ 'osd.0',
+ '-1.0']),
+ [None, {}])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ setter,
+ 'osd.0',
+ '1.0',
+ '!!!']))
+
+ def test_crush_set(self):
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush']))
+ self.check_crush_setter('set')
+
+ def test_crush_add(self):
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush']))
+ self.check_crush_setter('add')
+
+ def test_crush_create_or_move(self):
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush']))
+ self.check_crush_setter('create-or-move')
+
+ def test_crush_move(self):
+ self.assert_valid_command(['osd', 'crush', 'move',
+ 'AZaz09-_.', 'AZaz09-_.='])
+ self.assert_valid_command(['osd', 'crush', 'move',
+ '0', 'AZaz09-_.=', 'AZaz09-_.='])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'move']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'move', 'AZaz09-_.']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'move', '!!!',
+ 'AZaz09-_.=']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'move', 'AZaz09-_.',
+ '!!!']))
+
+ def test_crush_link(self):
+ self.assert_valid_command(['osd', 'crush', 'link',
+ 'name', 'AZaz09-_.='])
+ self.assert_valid_command(['osd', 'crush', 'link',
+ 'name', 'AZaz09-_.=', 'AZaz09-_.='])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'link']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'link',
+ 'name']))
+
+ def test_crush_rm(self):
+ for alias in ('rm', 'remove', 'unlink'):
+ self.assert_valid_command(['osd', 'crush', alias, 'AZaz09-_.'])
+ self.assert_valid_command(['osd', 'crush', alias,
+ 'AZaz09-_.', 'AZaz09-_.'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ alias]))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ alias,
+ 'AZaz09-_.',
+ 'AZaz09-_.',
+ 'toomany']))
+
+ def test_crush_reweight(self):
+ self.assert_valid_command(['osd', 'crush', 'reweight',
+ 'AZaz09-_.', '2.3'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'reweight']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'reweight',
+ 'AZaz09-_.']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'reweight',
+ 'AZaz09-_.',
+ '-1.0']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'reweight',
+ '!!!',
+ '2.3']))
+
+ def test_crush_tunables(self):
+ for tunable in ('legacy', 'argonaut', 'bobtail', 'optimal', 'default'):
+ self.assert_valid_command(['osd', 'crush', 'tunables',
+ tunable])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'tunables']))
+ assert_equal(None, validate_command(sigdict, ['osd', 'crush',
+ 'default', 'toomany']))
+
+ def test_crush_rule_create_simple(self):
+ self.assert_valid_command(['osd', 'crush', 'rule', 'create-simple',
+ 'AZaz09-_.', 'AZaz09-_.', 'AZaz09-_.'])
+ assert_equal(None, validate_command(sigdict, ['osd', 'crush',
+ 'create-simple']))
+ assert_equal(None, validate_command(sigdict, ['osd', 'crush',
+ 'create-simple',
+ 'AZaz09-_.']))
+ assert_equal(None, validate_command(sigdict, ['osd', 'crush',
+ 'create-simple',
+ 'AZaz09-_.',
+ 'AZaz09-_.']))
+ assert_equal(None, validate_command(sigdict, ['osd', 'crush',
+ 'create-simple',
+ '!!!',
+ 'AZaz09-_.',
+ 'AZaz09-_.']))
+ assert_equal(None, validate_command(sigdict, ['osd', 'crush',
+ 'create-simple',
+ 'AZaz09-_.',
+ '|||',
+ 'AZaz09-_.']))
+ assert_equal(None, validate_command(sigdict, ['osd', 'crush',
+ 'create-simple',
+ 'AZaz09-_.',
+ 'AZaz09-_.',
+ '+++']))
+ assert_equal(None, validate_command(sigdict, ['osd', 'crush',
+ 'create-simple',
+ 'AZaz09-_.',
+ 'AZaz09-_.',
+ 'AZaz09-_.',
+ 'toomany']))
+
+ def test_crush_rule_rm(self):
+ self.assert_valid_command(['osd', 'crush', 'rule', 'rm', 'AZaz09-_.'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'rule', 'rm']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'rule', 'rm',
+ '!!!!']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'rule', 'rm',
+ 'AZaz09-_.',
+ 'toomany']))
+
+ def test_setmaxosd(self):
+ self.check_1_natural_arg('osd', 'setmaxosd')
+
+ def test_pause(self):
+ self.check_no_arg('osd', 'pause')
+
+ def test_unpause(self):
+ self.check_no_arg('osd', 'unpause')
+
+ def test_set_unset(self):
+ for action in ('set', 'unset'):
+ for flag in ('pause', 'noup', 'nodown', 'noout', 'noin',
+ 'nobackfill', 'norecover', 'noscrub', 'nodeep-scrub'):
+ self.assert_valid_command(['osd', action, flag])
+ assert_equal({}, validate_command(sigdict, ['osd', action]))
+ assert_equal({}, validate_command(sigdict, ['osd', action,
+ 'invalid']))
+ assert_equal({}, validate_command(sigdict, ['osd', action,
+ 'pause', 'toomany']))
+
+ def test_cluster_snap(self):
+ assert_equal(None, validate_command(sigdict, ['osd', 'cluster_snap']))
+
+ def test_down(self):
+ self.check_1_or_more_string_args('osd', 'down')
+
+ def test_out(self):
+ self.check_1_or_more_string_args('osd', 'out')
+
+ def test_in(self):
+ self.check_1_or_more_string_args('osd', 'in')
+
+ def test_rm(self):
+ self.check_1_or_more_string_args('osd', 'rm')
+
+ def test_reweight(self):
+ self.assert_valid_command(['osd', 'reweight', '1', '0.1'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'reweight']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'reweight',
+ '1']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'reweight',
+ '1', '2.0']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'reweight',
+ '-1', '0.1']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'reweight',
+ '1', '0.1',
+ 'toomany']))
+
+ def test_lost(self):
+ self.assert_valid_command(['osd', 'lost', '1',
+ '--yes-i-really-mean-it'])
+ self.assert_valid_command(['osd', 'lost', '1'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'lost']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'lost',
+ '1',
+ 'what?']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'lost',
+ '-1',
+ '--yes-i-really-mean-it']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'lost',
+ '1',
+ '--yes-i-really-mean-it',
+ 'toomany']))
+
+ def test_create(self):
+ uuid = '12345678123456781234567812345678'
+ self.assert_valid_command(['osd', 'create'])
+ self.assert_valid_command(['osd', 'create',
+ uuid])
+ assert_equal({}, validate_command(sigdict, ['osd', 'create',
+ 'invalid']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'create',
+ uuid,
+ 'toomany']))
+
+ def test_blackist(self):
+ for action in ('add', 'rm'):
+ self.assert_valid_command(['osd', 'blacklist', action,
+ '1.2.3.4/567'])
+ self.assert_valid_command(['osd', 'blacklist', action,
+ '1.2.3.4'])
+ self.assert_valid_command(['osd', 'blacklist', action,
+ '1.2.3.4/567', '600.40'])
+ self.assert_valid_command(['osd', 'blacklist', action,
+ '1.2.3.4', '600.40'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'blacklist',
+ action,
+ 'invalid',
+ '600.40']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'blacklist',
+ action,
+ '1.2.3.4/567',
+ '-1.0']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'blacklist',
+ action,
+ '1.2.3.4/567',
+ '600.40',
+ 'toomany']))
+
+ def test_pool_mksnap(self):
+ self.assert_valid_command(['osd', 'pool', 'mksnap',
+ 'poolname', 'snapname'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'mksnap']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'mksnap',
+ 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'mksnap',
+ 'poolname', 'snapname',
+ 'toomany']))
+
+ def test_pool_rmsnap(self):
+ self.assert_valid_command(['osd', 'pool', 'rmsnap',
+ 'poolname', 'snapname'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'rmsnap']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'rmsnap',
+ 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'rmsnap',
+ 'poolname', 'snapname',
+ 'toomany']))
+
+ def test_pool_create(self):
+ self.assert_valid_command(['osd', 'pool', 'create',
+ 'poolname', '128'])
+ self.assert_valid_command(['osd', 'pool', 'create',
+ 'poolname', '128', '128'])
+ self.assert_valid_command(['osd', 'pool', 'create',
+ 'poolname', '128', '128',
+ 'foo=bar'])
+ self.assert_valid_command(['osd', 'pool', 'create',
+ 'poolname', '128', '128',
+ 'foo=bar', 'baz=frob'])
+ self.assert_valid_command(['osd', 'pool', 'create',
+ 'poolname', '128',
+ 'foo=bar', 'baz=frob'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'create']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'create',
+ 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'create',
+ 'poolname', '-1']))
+
+ def test_pool_delete(self):
+ self.assert_valid_command(['osd', 'pool', 'delete',
+ 'poolname', 'poolname',
+ '--yes-i-really-really-mean-it'])
+ self.assert_valid_command(['osd', 'pool', 'delete',
+ 'poolname', 'poolname'])
+ self.assert_valid_command(['osd', 'pool', 'delete',
+ 'poolname'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'delete']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'delete',
+ 'poolname', 'poolname',
+ 'not really']))
+ assert_equal({}, validate_command(sigdict,
+ ['osd', 'pool', 'delete',
+ 'poolname', 'poolname',
+ '--yes-i-really-really-mean-it',
+ 'toomany']))
+
+ def test_pool_rename(self):
+ self.assert_valid_command(['osd', 'pool', 'rename',
+ 'poolname', 'othername'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'rename']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'rename',
+ 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'rename',
+ 'poolname', 'othername',
+ 'toomany']))
+
+ def test_pool_get(self):
+ for var in ('size', 'min_size', 'crash_replay_interval',
+ 'pg_num', 'pgp_num', 'crush_ruleset'):
+ self.assert_valid_command(['osd', 'pool', 'get', 'poolname', var])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'get']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'get', 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'get', 'poolname',
+ 'size', 'toomany']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'get', 'poolname',
+ 'invalid']))
+
+ def test_pool_set(self):
+ for var in ('size', 'min_size', 'crash_replay_interval',
+ 'pg_num', 'pgp_num', 'crush_ruleset',
+ 'hashpspool'):
+ self.assert_valid_command(['osd', 'pool',
+ 'set', 'poolname', var, 'value'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set', 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set', 'poolname',
+ 'size', 'value',
+ 'toomany']))
+
+ def test_pool_set_quota(self):
+ for field in ('max_objects', 'max_bytes'):
+ self.assert_valid_command(['osd', 'pool', 'set-quota',
+ 'poolname', field, '10K'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set-quota']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set-quota',
+ 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set-quota',
+ 'poolname',
+ 'max_objects']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set-quota',
+ 'poolname',
+ 'invalid',
+ '10K']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set-quota',
+ 'poolname',
+ 'max_objects',
+ '10K',
+ 'toomany']))
+
+ def test_reweight_by_utilization(self):
+ self.assert_valid_command(['osd', 'reweight-by-utilization'])
+ self.assert_valid_command(['osd', 'reweight-by-utilization', '100'])
+ assert_equal({}, validate_command(sigdict, ['osd',
+ 'reweight-by-utilization',
+ '50']))
+ assert_equal({}, validate_command(sigdict, ['osd',
+ 'reweight-by-utilization',
+ '100',
+ 'toomany']))
+
+ def test_thrash(self):
+ self.check_1_natural_arg('osd', 'thrash')
+
+ def test_tier_op(self):
+ for op in ('add', 'remove', 'set-overlay'):
+ self.assert_valid_command(['osd', 'tier', op,
+ 'poolname', 'othername'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'tier', op]))
+ assert_equal({}, validate_command(sigdict, ['osd', 'tier', op,
+ 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'tier', op,
+ 'poolname',
+ 'othername',
+ 'toomany']))
+
+ def test_tier_cache_mode(self):
+ for mode in ('none', 'writeback', 'invalidate+forward', 'readonly'):
+ self.assert_valid_command(['osd', 'tier', 'cache-mode',
+ 'poolname', mode])
+ assert_equal({}, validate_command(sigdict, ['osd', 'tier',
+ 'cache-mode']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'tier',
+ 'cache-mode',
+ 'invalid']))
+
+ def test_tier_remove_overlay(self):
+ self.assert_valid_command(['osd', 'tier', 'remove-overlay',
+ 'poolname'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'tier',
+ 'remove-overlay']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'tier',
+ 'remove-overlay',
+ 'poolname',
+ 'toomany']))
+
+
+class TestConfigKey(TestArgparse):
+
+ def test_get(self):
+ self.check_1_string_arg('config-key', 'get')
+
+ def test_put(self):
+ self.assert_valid_command(['config-key', 'put',
+ 'key'])
+ self.assert_valid_command(['config-key', 'put',
+ 'key', 'value'])
+ assert_equal({}, validate_command(sigdict, ['config-key', 'put']))
+ assert_equal({}, validate_command(sigdict, ['config-key', 'put',
+ 'key', 'value',
+ 'toomany']))
+
+ def test_del(self):
+ self.check_1_string_arg('config-key', 'del')
+
+ def test_exists(self):
+ self.check_1_string_arg('config-key', 'exists')
+
+ def test_list(self):
+ self.check_no_arg('config-key', 'list')
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 &&
+# PYTHONPATH=pybind nosetests --stop \
+# test/pybind/test_ceph_argparse.py # test_ceph_argparse.py:TestOSD.test_rm"
+# End:
diff --git a/src/streamtest.cc b/src/test/streamtest.cc
index 21693ac8713..21693ac8713 100644
--- a/src/streamtest.cc
+++ b/src/test/streamtest.cc
diff --git a/src/test/test_arch.c b/src/test/test_arch.c
new file mode 100644
index 00000000000..549221e60f9
--- /dev/null
+++ b/src/test/test_arch.c
@@ -0,0 +1,19 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "arch/probe.h"
+#include "arch/intel.h"
+#include "arch/neon.h"
+
+int main(int argc, char **argv)
+{
+ ceph_arch_probe();
+ assert(ceph_arch_probed);
+
+ printf("ceph_arch_intel_sse42 = %d\n", ceph_arch_intel_sse42);
+ printf("ceph_arch_neon = %d\n", ceph_arch_neon);
+
+ return 0;
+}
diff --git a/src/test/test_osd_types.cc b/src/test/test_osd_types.cc
index e07c9e06592..34674358285 100644
--- a/src/test/test_osd_types.cc
+++ b/src/test/test_osd_types.cc
@@ -712,7 +712,8 @@ TEST(pg_missing_t, add_next_event)
eversion_t version(10,5);
eversion_t prior_version(3,4);
pg_log_entry_t sample_e(pg_log_entry_t::DELETE, oid, version, prior_version,
- osd_reqid_t(entity_name_t::CLIENT(777), 8, 999), utime_t(8,9));
+ 0, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
+ utime_t(8,9));
// new object (MODIFY)
{
diff --git a/src/test_trans.cc b/src/test/test_trans.cc
index 43821c13aec..43821c13aec 100644
--- a/src/test_trans.cc
+++ b/src/test/test_trans.cc
diff --git a/src/testclass.cc b/src/test/testclass.cc
index 22a97be6dcb..22a97be6dcb 100644
--- a/src/testclass.cc
+++ b/src/test/testclass.cc
diff --git a/src/testcrypto.cc b/src/test/testcrypto.cc
index 0b7a9d54742..0b7a9d54742 100644
--- a/src/testcrypto.cc
+++ b/src/test/testcrypto.cc
diff --git a/src/testkeys.cc b/src/test/testkeys.cc
index 27c38124ade..27c38124ade 100644
--- a/src/testkeys.cc
+++ b/src/test/testkeys.cc
diff --git a/src/testmsgr.cc b/src/test/testmsgr.cc
index 4de779b5d7f..4de779b5d7f 100644
--- a/src/testmsgr.cc
+++ b/src/test/testmsgr.cc
diff --git a/src/tools/Makefile.am b/src/tools/Makefile.am
new file mode 100644
index 00000000000..4b8da77951a
--- /dev/null
+++ b/src/tools/Makefile.am
@@ -0,0 +1,87 @@
+ceph_osdomap_tool_SOURCES = tools/ceph-osdomap-tool.cc
+ceph_osdomap_tool_LDADD = $(LIBOS) $(CEPH_GLOBAL) -lboost_program_options
+bin_DEBUGPROGRAMS += ceph-osdomap-tool
+
+ceph_monstore_tool_SOURCES = tools/ceph-monstore-tool.cc
+ceph_monstore_tool_LDADD = $(LIBOS) $(CEPH_GLOBAL) -lboost_program_options
+bin_DEBUGPROGRAMS += ceph-monstore-tool
+
+ceph_filestore_dump_SOURCES = tools/ceph-filestore-dump.cc
+ceph_filestore_dump_LDADD = $(LIBOSD) $(LIBOS) $(CEPH_GLOBAL) -lboost_program_options
+if LINUX
+ceph_filestore_dump_LDADD += -ldl
+endif # LINUX
+bin_PROGRAMS += ceph_filestore_dump
+
+monmaptool_SOURCES = tools/monmaptool.cc
+monmaptool_LDADD = $(CEPH_GLOBAL)
+bin_PROGRAMS += monmaptool
+
+crushtool_SOURCES = tools/crushtool.cc
+crushtool_LDADD = $(CEPH_GLOBAL)
+bin_PROGRAMS += crushtool
+
+osdmaptool_SOURCES = tools/osdmaptool.cc
+osdmaptool_LDADD = $(CEPH_GLOBAL)
+bin_PROGRAMS += osdmaptool
+
+ceph_scratchtool_SOURCES = tools/scratchtool.c
+ceph_scratchtool_LDADD = $(LIBRADOS) $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_scratchtool
+
+ceph_scratchtoolpp_SOURCES = tools/scratchtoolpp.cc
+ceph_scratchtoolpp_LDADD = $(LIBRADOS) $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_scratchtoolpp
+
+ceph_psim_SOURCES = tools/psim.cc
+ceph_psim_LDADD = $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_psim
+
+ceph_dupstore_SOURCES = tools/dupstore.cc
+ceph_dupstore_LDADD = $(LIBOS) $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_dupstore
+
+ceph_radosacl_SOURCES = tools/radosacl.cc
+ceph_radosacl_LDADD = $(LIBRADOS) $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_radosacl
+
+rados_SOURCES = \
+ tools/rados/rados.cc \
+ tools/rados/rados_import.cc \
+ tools/rados/rados_export.cc \
+ tools/rados/rados_sync.cc
+rados_SOURCES += common/obj_bencher.cc # needs cleanup so it can go in libcommon.la
+rados_LDADD = libcls_lock_client.la $(LIBRADOS) $(CEPH_GLOBAL)
+bin_PROGRAMS += rados
+
+if WITH_REST_BENCH
+rest_bench_SOURCES = tools/rest_bench.cc
+rest_bench_SOURCES += common/obj_bencher.cc # needs cleanup so it can go in libcommon.la
+rest_bench_LDADD = $(CEPH_GLOBAL)
+bin_PROGRAMS += rest-bench
+
+if WITH_SYSTEM_LIBS3
+rest_bench_LDADD += -ls3
+else
+rest_bench_LDADD += libs3/build/lib/libs3.a -lcurl -lxml2
+rest_bench_CXXFLAGS = ${AM_CXXFLAGS} -I$(top_srcdir)/src/libs3/inc
+SUBDIRS += libs3
+endif # WITH_SYSTEM_LIBS3
+endif # WITH_REST_BENCH
+
+ceph_conf_SOURCES = tools/ceph_conf.cc
+ceph_conf_LDADD = $(CEPH_GLOBAL)
+bin_PROGRAMS += ceph-conf
+
+ceph_authtool_SOURCES = tools/ceph_authtool.cc
+ceph_authtool_LDADD = $(CEPH_GLOBAL)
+bin_PROGRAMS += ceph-authtool
+
+ceph_mon_store_converter_SOURCES = tools/mon_store_converter.cc
+ceph_mon_store_converter_LDADD = $(LIBMON) $(LIBOS) $(CEPH_GLOBAL)
+bin_PROGRAMS += ceph_mon_store_converter
+
+noinst_HEADERS += \
+ tools/rados/rados_sync.h \
+ tools/common.h
+
diff --git a/src/tools/ceph-filestore-dump.cc b/src/tools/ceph-filestore-dump.cc
index 3badc2160b0..b4220bae307 100644
--- a/src/tools/ceph-filestore-dump.cc
+++ b/src/tools/ceph-filestore-dump.cc
@@ -52,6 +52,32 @@ enum {
END_OF_TYPES, //Keep at the end
};
+//#define INTERNAL_TEST
+//#define INTERNAL_TEST2
+
+#ifdef INTERNAL_TEST
+CompatSet get_test_compat_set() {
+ CompatSet::FeatureSet ceph_osd_feature_compat;
+ CompatSet::FeatureSet ceph_osd_feature_ro_compat;
+ CompatSet::FeatureSet ceph_osd_feature_incompat;
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
+#ifdef INTERNAL_TEST2
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
+#endif
+ return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
+ ceph_osd_feature_incompat);
+}
+#endif
+
typedef uint8_t sectiontype_t;
typedef uint32_t mymagic_t;
typedef int64_t mysize_t;
@@ -69,7 +95,7 @@ const int fd_none = INT_MIN;
//can be added to the export format.
struct super_header {
static const uint32_t super_magic = (shortmagic << 16) | shortmagic;
- static const uint32_t super_ver = 1;
+ static const uint32_t super_ver = 2;
static const uint32_t FIXED_LENGTH = 16;
uint32_t magic;
uint32_t version;
@@ -139,18 +165,25 @@ struct footer {
struct pg_begin {
pg_t pgid;
+ OSDSuperblock superblock;
- pg_begin(pg_t pg): pgid(pg) { }
+ pg_begin(pg_t pg, OSDSuperblock sb):
+ pgid(pg), superblock(sb) { }
pg_begin() { }
void encode(bufferlist& bl) const {
- ENCODE_START(1, 1, bl);
+ // New super_ver prevents decode from ver 1
+ ENCODE_START(2, 2, bl);
::encode(pgid, bl);
+ ::encode(superblock, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator& bl) {
- DECODE_START(1, bl);
+ DECODE_START(2, bl);
::decode(pgid, bl);
+ if (struct_v > 1) {
+ ::decode(superblock, bl);
+ }
DECODE_FINISH(bl);
}
};
@@ -347,8 +380,8 @@ void remove_coll(ObjectStore *store, const coll_t &coll)
OSD::make_snapmapper_oid());
SnapMapper mapper(&driver, 0, 0, 0);
- vector<hobject_t> objects;
- hobject_t next;
+ vector<ghobject_t> objects;
+ ghobject_t next;
int r = 0;
int64_t num = 0;
ObjectStore::Transaction *t = new ObjectStore::Transaction;
@@ -358,13 +391,14 @@ void remove_coll(ObjectStore *store, const coll_t &coll)
&objects, &next);
if (r < 0)
goto out;
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i, ++num) {
+ assert(i->generation == ghobject_t::NO_GEN);
OSDriver::OSTransaction _t(driver.get_transaction(t));
cout << "remove " << *i << std::endl;
- int r = mapper.remove_oid(*i, &_t);
+ int r = mapper.remove_oid(i->hobj, &_t);
if (r != 0 && r != -ENOENT) {
assert(0);
}
@@ -621,18 +655,19 @@ int export_file(ObjectStore *store, coll_t cid, hobject_t &obj)
int export_files(ObjectStore *store, coll_t coll)
{
- vector<hobject_t> objects;
- hobject_t next;
+ vector<ghobject_t> objects;
+ ghobject_t next;
while (!next.is_max()) {
int r = store->collection_list_partial(coll, next, 200, 300, 0,
&objects, &next);
if (r < 0)
return r;
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
- r = export_file(store, coll, *i);
+ assert(i->generation == ghobject_t::NO_GEN);
+ r = export_file(store, coll, i->hobj);
if (r < 0)
return r;
}
@@ -664,7 +699,7 @@ void write_super()
}
int do_export(ObjectStore *fs, coll_t coll, pg_t pgid, pg_info_t &info,
- epoch_t map_epoch, __u8 struct_ver)
+ epoch_t map_epoch, __u8 struct_ver, OSDSuperblock superblock)
{
PGLog::IndexedLog log;
pg_missing_t missing;
@@ -675,7 +710,7 @@ int do_export(ObjectStore *fs, coll_t coll, pg_t pgid, pg_info_t &info,
write_super();
- pg_begin pgb(pgid);
+ pg_begin pgb(pgid, superblock);
ret = write_section(TYPE_PG_BEGIN, pgb, file_fd);
if (ret)
return ret;
@@ -909,7 +944,7 @@ int get_pg_metadata(ObjectStore *store, coll_t coll, bufferlist &bl)
return 0;
}
-int do_import(ObjectStore *store)
+int do_import(ObjectStore *store, OSDSuperblock sb)
{
bufferlist ebl;
pg_info_t info;
@@ -943,7 +978,16 @@ int do_import(ObjectStore *store)
pg_begin pgb;
pgb.decode(ebliter);
pg_t pgid = pgb.pgid;
-
+
+ if (debug) {
+ cout << "Exported features: " << pgb.superblock.compat_features << std::endl;
+ }
+ if (sb.compat_features.compare(pgb.superblock.compat_features) == -1) {
+ cout << "Export has incompatible features set "
+ << pgb.superblock.compat_features << std::endl;
+ return 1;
+ }
+
log_oid = OSD::make_pg_log_oid(pgid);
biginfo_oid = OSD::make_pg_biginfo_oid(pgid);
@@ -1017,7 +1061,7 @@ int main(int argc, char **argv)
("pgid", po::value<string>(&pgidstr),
"PG id, mandatory")
("type", po::value<string>(&type),
- "Type which is 'info' or 'log', mandatory")
+ "Type one of info, log, remove, export, or import, mandatory")
("file", po::value<string>(&file),
"path of file to export or import")
("debug", "Enable diagnostic output to stderr")
@@ -1170,14 +1214,67 @@ int main(int argc, char **argv)
return 1;
}
+ bool fs_sharded_objects = fs->get_allow_sharded_objects();
+
int ret = 0;
vector<coll_t> ls;
vector<coll_t>::iterator it;
+ CompatSet supported;
+
+#ifdef INTERNAL_TEST
+ supported = get_test_compat_set();
+#else
+ supported = OSD::get_osd_compat_set();
+#endif
+
+ bufferlist bl;
+ OSDSuperblock superblock;
+ bufferlist::iterator p;
+ ret = fs->read(coll_t::META_COLL, OSD_SUPERBLOCK_POBJECT, 0, 0, bl);
+ if (ret < 0) {
+ cout << "Failure to read OSD superblock error= " << ret << std::endl;
+ goto out;
+ }
+
+ p = bl.begin();
+ ::decode(superblock, p);
+
+#ifdef INTERNAL_TEST2
+ fs->set_allow_sharded_objects();
+ assert(fs->get_allow_sharded_objects());
+ fs_sharded_objects = true;
+ superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
+#endif
+
+ if (debug && file_fd != STDOUT_FILENO) {
+ cout << "Supported features: " << supported << std::endl;
+ cout << "On-disk features: " << superblock.compat_features << std::endl;
+ }
+ if (supported.compare(superblock.compat_features) == -1) {
+ cout << "On-disk OSD incompatible features set "
+ << superblock.compat_features << std::endl;
+ ret = EINVAL;
+ goto out;
+ }
+
+ // The OSD may have crashed while transitioning to sharded objects,
+ // before it completed set_allow_sharded_objects().
+ // This utility does not attempt to finish that transition.
+ if (superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS) != fs_sharded_objects) {
+ // An OSD should never have called set_allow_sharded_objects() before
+ // updating its own OSD features.
+ if (fs_sharded_objects)
+ cout << "FileStore sharded but OSD not set, Corruption?" << std::endl;
+ else
+ cout << "Found incomplete transition to sharded objects" << std::endl;
+ ret = EINVAL;
+ goto out;
+ }
if (type == "import") {
try {
- ret = do_import(fs);
+ ret = do_import(fs, superblock);
}
catch (const buffer::error &e) {
cout << "do_import threw exception error " << e.what() << std::endl;
@@ -1260,7 +1357,7 @@ int main(int argc, char **argv)
cerr << "struct_v " << (int)struct_ver << std::endl;
if (type == "export") {
- ret = do_export(fs, coll, pgid, info, map_epoch, struct_ver);
+ ret = do_export(fs, coll, pgid, info, map_epoch, struct_ver, superblock);
} else if (type == "info") {
formatter->open_object_section("info");
info.dump(formatter);
diff --git a/src/tools/ceph-osdomap-tool.cc b/src/tools/ceph-osdomap-tool.cc
index aedc4c824e7..bde4b28b45f 100644
--- a/src/tools/ceph-osdomap-tool.cc
+++ b/src/tools/ceph-osdomap-tool.cc
@@ -115,30 +115,30 @@ int main(int argc, char **argv) {
i->value().hexdump(std::cout);
}
} else if (cmd == "dump-objects") {
- vector<hobject_t> objects;
+ vector<ghobject_t> objects;
r = omap.list_objects(&objects);
if (r < 0) {
std::cerr << "list_objects got: " << cpp_strerror(r) << std::endl;
goto done;
}
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
std::cout << *i << std::endl;
}
r = 0;
} else if (cmd == "dump-objects-with-keys") {
- vector<hobject_t> objects;
+ vector<ghobject_t> objects;
r = omap.list_objects(&objects);
if (r < 0) {
std::cerr << "list_objects got: " << cpp_strerror(r) << std::endl;
goto done;
}
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
std::cout << "Object: " << *i << std::endl;
- ObjectMap::ObjectMapIterator j = omap.get_iterator(*i);
+ ObjectMap::ObjectMapIterator j = omap.get_iterator(i->hobj);
for (j->seek_to_first(); j->valid(); j->next()) {
std::cout << j->key() << std::endl;
j->value().hexdump(std::cout);
diff --git a/src/ceph_authtool.cc b/src/tools/ceph_authtool.cc
index f66a3c66eee..f66a3c66eee 100644
--- a/src/ceph_authtool.cc
+++ b/src/tools/ceph_authtool.cc
diff --git a/src/ceph_conf.cc b/src/tools/ceph_conf.cc
index b2286f4e094..b2286f4e094 100644
--- a/src/ceph_conf.cc
+++ b/src/tools/ceph_conf.cc
diff --git a/src/crushtool.cc b/src/tools/crushtool.cc
index 75c26c098b6..03c83f24156 100644
--- a/src/crushtool.cc
+++ b/src/tools/crushtool.cc
@@ -569,7 +569,12 @@ int main(int argc, const char **argv)
crush_bucket *b = crush_make_bucket(buckettype, CRUSH_HASH_DEFAULT, type, j, items, weights);
assert(b);
- int id = crush_add_bucket(crush.crush, 0, b);
+ int id;
+ int r = crush_add_bucket(crush.crush, 0, b, &id);
+ if (r < 0) {
+ dout(0) << "Couldn't add root bucket: " << strerror(-r) << dendl;
+ return 1; // id may not have been assigned; do not use it as rootid
+ }
rootid = id;
char format[20];
diff --git a/src/dupstore.cc b/src/tools/dupstore.cc
index e17eb2201a7..c8b8ece31c8 100644
--- a/src/dupstore.cc
+++ b/src/tools/dupstore.cc
@@ -27,7 +27,7 @@ int dupstore(ObjectStore* src, ObjectStore* dst)
if (dst->mount() < 0) return 1;
// objects
- hash_map<hobject_t, coll_t> did_object;
+ hash_map<ghobject_t, coll_t> did_object;
// collections
vector<coll_t> collections;
@@ -54,11 +54,11 @@ int dupstore(ObjectStore* src, ObjectStore* dst)
dst->apply_transaction(t);
}
- vector<hobject_t> o;
+ vector<ghobject_t> o;
src->collection_list(*p, o);
int numo = o.size();
int j = 1;
- for (vector<hobject_t>::iterator q = o.begin(); q != o.end(); ++q) {
+ for (vector<ghobject_t>::iterator q = o.begin(); q != o.end(); ++q) {
ObjectStore::Transaction t;
if (did_object.count(*q))
t.collection_add(*p, did_object[*q], *q);
diff --git a/src/mon_store_converter.cc b/src/tools/mon_store_converter.cc
index 1c0d3af98e2..1c0d3af98e2 100644
--- a/src/mon_store_converter.cc
+++ b/src/tools/mon_store_converter.cc
diff --git a/src/monmaptool.cc b/src/tools/monmaptool.cc
index 57843aa350d..57843aa350d 100644
--- a/src/monmaptool.cc
+++ b/src/tools/monmaptool.cc
diff --git a/src/osdmaptool.cc b/src/tools/osdmaptool.cc
index 2e55026076c..2e55026076c 100644
--- a/src/osdmaptool.cc
+++ b/src/tools/osdmaptool.cc
diff --git a/src/psim.cc b/src/tools/psim.cc
index c1adc7580fb..c1adc7580fb 100644
--- a/src/psim.cc
+++ b/src/tools/psim.cc
diff --git a/src/rados.cc b/src/tools/rados/rados.cc
index dd9b4de1794..ad8eaa3e1a4 100644
--- a/src/rados.cc
+++ b/src/tools/rados/rados.cc
@@ -98,6 +98,7 @@ void usage(ostream& out)
" rmomapkey <obj-name> <key>\n"
" getomapheader <obj-name>\n"
" setomapheader <obj-name> <val>\n"
+" tmap-to-omap <obj-name> convert tmap keys/values to omap\n"
" listwatchers <obj-name> list the watchers of this object\n"
"\n"
"IMPORT AND EXPORT\n"
@@ -879,7 +880,8 @@ protected:
}
public:
- RadosBencher(librados::Rados& _r, librados::IoCtx& _i) : completions(NULL), rados(_r), io_ctx(_i), iterator_valid(false) {}
+ RadosBencher(CephContext *cct_, librados::Rados& _r, librados::IoCtx& _i)
+ : ObjBencher(cct_), completions(NULL), rados(_r), io_ctx(_i), iterator_valid(false) {}
~RadosBencher() { }
};
@@ -1812,8 +1814,15 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
bufferlist::iterator p = outdata.begin();
bufferlist header;
map<string, bufferlist> kv;
- ::decode(header, p);
- ::decode(kv, p);
+ try {
+ ::decode(header, p);
+ ::decode(kv, p);
+ }
+ catch (buffer::error& e) {
+ cerr << "error decoding tmap " << pool_name << "/" << oid << std::endl;
+ ret = -EINVAL;
+ goto out;
+ }
cout << "header (" << header.length() << " bytes):\n";
header.hexdump(cout);
cout << "\n";
@@ -1840,6 +1849,50 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
}
}
+ else if (strcmp(nargs[0], "tmap-to-omap") == 0) {
+ if (!pool_name || nargs.size() < 2)
+ usage_exit();
+ string oid(nargs[1]);
+
+ bufferlist bl;
+ int r = io_ctx.tmap_get(oid, bl);
+ if (r < 0) {
+ ret = r;
+ cerr << "error reading tmap " << pool_name << "/" << oid
+ << ": " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ bufferlist hdr;
+ map<string, bufferlist> kv;
+ bufferlist::iterator p = bl.begin();
+ try {
+ ::decode(hdr, p);
+ ::decode(kv, p);
+ }
+ catch (buffer::error& e) {
+ cerr << "error decoding tmap " << pool_name << "/" << oid << std::endl;
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!p.end()) {
+ cerr << "error decoding tmap (stray trailing data) in " << pool_name << "/" << oid << std::endl;
+ ret = -EINVAL;
+ goto out;
+ }
+ librados::ObjectWriteOperation wr;
+ wr.omap_set_header(hdr);
+ wr.omap_set(kv);
+ wr.truncate(0); // delete the old tmap data
+ r = io_ctx.operate(oid, &wr);
+ if (r < 0) {
+ ret = r;
+ cerr << "error writing tmap data as omap on " << pool_name << "/" << oid
+ << ": " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ ret = 0;
+ }
+
else if (strcmp(nargs[0], "mkpool") == 0) {
int auid = 0;
__u8 crush_rule = 0;
@@ -1987,7 +2040,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
operation = OP_RAND_READ;
else
usage_exit();
- RadosBencher bencher(rados, io_ctx);
+ RadosBencher bencher(g_ceph_context, rados, io_ctx);
bencher.set_show_time(show_time);
ret = bencher.aio_bench(operation, seconds, num_objs,
concurrent_ios, op_size, cleanup);
@@ -1998,7 +2051,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
if (!pool_name || nargs.size() < 2)
usage_exit();
const char *prefix = nargs[1];
- RadosBencher bencher(rados, io_ctx);
+ RadosBencher bencher(g_ceph_context, rados, io_ctx);
ret = bencher.clean_up(prefix, concurrent_ios);
if (ret != 0)
cerr << "error during cleanup: " << ret << std::endl;
@@ -2234,8 +2287,9 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
}
} else {
- cerr << "unrecognized command " << nargs[0] << std::endl;
- usage_exit();
+ cerr << "unrecognized command " << nargs[0] << "; -h or --help for usage" << std::endl;
+ ret = -EINVAL;
+ goto out;
}
if (ret < 0)
diff --git a/src/rados_export.cc b/src/tools/rados/rados_export.cc
index 5b8a899d3b8..bf6654114c5 100644
--- a/src/rados_export.cc
+++ b/src/tools/rados/rados_export.cc
@@ -11,6 +11,7 @@
* Foundation. See file COPYING.
*
*/
+#include "include/int_types.h"
#include "rados_sync.h"
#include "common/errno.h"
@@ -20,7 +21,6 @@
#include <dirent.h>
#include <errno.h>
#include <fstream>
-#include <inttypes.h>
#include <iostream>
#include <sstream>
#include <stdlib.h>
diff --git a/src/rados_import.cc b/src/tools/rados/rados_import.cc
index da968a7cc50..a6a398d767b 100644
--- a/src/rados_import.cc
+++ b/src/tools/rados/rados_import.cc
@@ -11,11 +11,11 @@
* Foundation. See file COPYING.
*
*/
+#include "include/int_types.h"
#include <dirent.h>
#include <errno.h>
#include <fstream>
-#include <inttypes.h>
#include <iostream>
#include <sstream>
#include <stdlib.h>
diff --git a/src/rados_sync.cc b/src/tools/rados/rados_sync.cc
index d16894c6ea9..03293d3402a 100644
--- a/src/rados_sync.cc
+++ b/src/tools/rados/rados_sync.cc
@@ -11,6 +11,7 @@
* Foundation. See file COPYING.
*
*/
+#include "include/int_types.h"
#include "common/ceph_argparse.h"
#include "common/config.h"
@@ -27,7 +28,6 @@
#include <dirent.h>
#include <errno.h>
#include <fstream>
-#include <inttypes.h>
#include <iostream>
#include <memory>
#include <sstream>
diff --git a/src/rados_sync.h b/src/tools/rados/rados_sync.h
index 0f7226e0239..0f7226e0239 100644
--- a/src/rados_sync.h
+++ b/src/tools/rados/rados_sync.h
diff --git a/src/radosacl.cc b/src/tools/radosacl.cc
index d2f7ca5c488..d2f7ca5c488 100644
--- a/src/radosacl.cc
+++ b/src/tools/radosacl.cc
diff --git a/src/tools/rest_bench.cc b/src/tools/rest_bench.cc
index 99fd16b5a34..feea4de4932 100644
--- a/src/tools/rest_bench.cc
+++ b/src/tools/rest_bench.cc
@@ -261,10 +261,12 @@ class RESTDispatcher {
} req_wq;
public:
- RESTDispatcher(CephContext *cct, int num_threads)
- : m_tp(cct, "RESTDispatcher::m_tp", num_threads),
- req_wq(this, g_conf->rgw_op_thread_timeout,
- g_conf->rgw_op_thread_suicide_timeout, &m_tp) {
+ CephContext *cct;
+ RESTDispatcher(CephContext *cct_, int num_threads)
+ : m_tp(cct_, "RESTDispatcher::m_tp", num_threads),
+ req_wq(this, cct_->_conf->rgw_op_thread_timeout,
+ cct_->_conf->rgw_op_thread_suicide_timeout, &m_tp),
+ cct(cct_) {
response_handler.propertiesCallback = properties_callback;
@@ -588,6 +590,7 @@ protected:
public:
RESTBencher(RESTDispatcher *_dispatcher) :
+ ObjBencher(_dispatcher->cct),
dispatcher(_dispatcher),
completions(NULL),
list_start(NULL),
diff --git a/src/scratchtool.c b/src/tools/scratchtool.c
index 22cf2bdf531..22cf2bdf531 100644
--- a/src/scratchtool.c
+++ b/src/tools/scratchtool.c
diff --git a/src/scratchtoolpp.cc b/src/tools/scratchtoolpp.cc
index 62096920300..62096920300 100644
--- a/src/scratchtoolpp.cc
+++ b/src/tools/scratchtoolpp.cc
diff --git a/src/vstart.sh b/src/vstart.sh
index 7ce4628d775..4839cc1156d 100755
--- a/src/vstart.sh
+++ b/src/vstart.sh
@@ -17,6 +17,11 @@ set -e
[ -z "$CEPH_NUM_MDS" ] && CEPH_NUM_MDS=3
[ -z "$CEPH_NUM_RGW" ] && CEPH_NUM_RGW=1
+[ -z "$CEPH_DIR" ] && CEPH_DIR="$PWD/"
+[ -z "$CEPH_DEV_DIR" ] && CEPH_DEV_DIR="$CEPH_DIR/dev"
+[ -z "$CEPH_OUT_DIR" ] && CEPH_OUT_DIR="$CEPH_DIR/out"
+[ -z "$CEPH_RGW_PORT" ] && CEPH_RGW_PORT=8000
+
extra_conf=""
new=0
standby=0
@@ -34,9 +39,9 @@ cephx=1 #turn cephx on by default
MON_ADDR=""
-conf="ceph.conf"
+conf="$CEPH_DIR/ceph.conf"
-keyring_fn="$PWD/keyring"
+keyring_fn="$CEPH_DIR/keyring"
osdmap_fn="/tmp/ceph_osdmap.$$"
monmap_fn="/tmp/ceph_monmap.$$"
@@ -194,6 +199,7 @@ else
COSDDEBUG='
debug ms = 1
debug osd = 25
+ debug objecter = 20
debug monc = 20
debug journal = 20
debug filestore = 20
@@ -223,7 +229,7 @@ fi
# sudo if btrfs
-test -d dev/osd0/. && test -e dev/sudo && SUDO="sudo"
+test -d $CEPH_DEV_DIR/osd0/. && test -e $CEPH_DEV_DIR/sudo && SUDO="sudo"
if [ "$start_all" -eq 1 ]; then
$SUDO $CEPH_BIN/init-ceph stop
@@ -231,6 +237,7 @@ fi
$SUDO rm -f core*
test -d out || mkdir out
+test -d "$CEPH_DEV_DIR" || mkdir -p "$CEPH_DEV_DIR"
$SUDO rm -rf out/*
test -d gmon && $SUDO rm -rf gmon/*
@@ -243,11 +250,11 @@ if [ -n "$ip" ]; then
IP="$ip"
else
echo hostname $HOSTNAME
- RAW_IP=`hostname --ip-address`
+ RAW_IP=`hostname -I`
# filter out IPv6 and localhost addresses
IP="$(echo "$RAW_IP"|tr ' ' '\012'|grep -v :|grep -v '^127\.'|head -n1)"
# if that left nothing, then try to use the raw thing, it might work
- if [ -z "IP" ]; then IP="$RAW_IP"; fi
+ if [ -z "$IP" ]; then IP="$RAW_IP"; fi
echo ip $IP
fi
echo "ip $IP"
@@ -275,11 +282,11 @@ do
done
DAEMONOPTS="
- log file = out/\$name.log
- admin socket = out/\$name.asok
+ log file = $CEPH_OUT_DIR/\$name.log
+ admin socket = $CEPH_OUT_DIR/\$name.asok
chdir = \"\"
- pid file = out/\$name.pid
- heartbeat file = out/\$name.heartbeat
+ pid file = $CEPH_OUT_DIR/\$name.pid
+ heartbeat file = $CEPH_OUT_DIR/\$name.heartbeat
"
@@ -294,7 +301,7 @@ if [ "$start_mon" -eq 1 ]; then
osd pgp bits = 5 ; (invalid, but ceph should cope!)
osd crush chooseleaf type = 0
osd pool default min size = 1
- run dir = out
+ run dir = $CEPH_OUT_DIR
EOF
if [ "$cephx" -eq 1 ] ; then
cat <<EOF >> $conf
@@ -311,7 +318,7 @@ fi
[client]
keyring = $keyring_fn
- log file = out/\$name.\$pid.log
+ log file = $CEPH_OUT_DIR/\$name.\$pid.log
[mds]
$DAEMONOPTS
@@ -319,12 +326,12 @@ $CMDSDEBUG
mds debug frag = true
mds debug auth pins = true
mds debug subtrees = true
- mds data = dev/mds.\$id
+ mds data = $CEPH_DEV_DIR/mds.\$id
$extra_conf
[osd]
$DAEMONOPTS
- osd data = dev/osd\$id
- osd journal = dev/osd\$id.journal
+ osd data = $CEPH_DEV_DIR/osd\$id
+ osd journal = $CEPH_DEV_DIR/osd\$id.journal
osd journal size = 100
osd class tmp = out
osd class dir = .libs
@@ -333,10 +340,11 @@ $DAEMONOPTS
$COSDDEBUG
$extra_conf
[mon]
+ mon pg warn min per osd = 10
$DAEMONOPTS
$CMONDEBUG
$extra_conf
- mon cluster log file = out/cluster.mon.\$id.log
+ mon cluster log file = $CEPH_OUT_DIR/cluster.mon.\$id.log
[global]
$extra_conf
EOF
@@ -368,7 +376,7 @@ EOF
cat <<EOF >> $conf
[mon.$f]
host = $HOSTNAME
- mon data = dev/mon.$f
+ mon data = $CEPH_DEV_DIR/mon.$f
mon addr = $IP:$(($CEPH_PORT+$count))
EOF
fi
@@ -380,10 +388,10 @@ EOF
for f in $MONS
do
- cmd="rm -rf dev/mon.$f"
+ cmd="rm -rf $CEPH_DEV_DIR/mon.$f"
echo $cmd
$cmd
- cmd="mkdir dev/mon.$f"
+ cmd="mkdir -p $CEPH_DEV_DIR/mon.$f"
echo $cmd
$cmd
cmd="$CEPH_BIN/ceph-mon --mkfs -c $conf -i $f --monmap=$monmap_fn"
@@ -414,9 +422,9 @@ if [ "$start_osd" -eq 1 ]; then
[osd.$osd]
host = $HOSTNAME
EOF
- rm -rf dev/osd$osd || true
- for f in dev/osd$osd/* ; do btrfs sub delete $f || true ; done || true
- mkdir -p dev/osd$osd
+ rm -rf $CEPH_DEV_DIR/osd$osd || true
+ for f in $CEPH_DEV_DIR/osd$osd/* ; do btrfs sub delete $f || true ; done || true
+ mkdir -p $CEPH_DEV_DIR/osd$osd
fi
uuid=`uuidgen`
@@ -425,7 +433,7 @@ EOF
$SUDO $CEPH_ADM osd crush add osd.$osd 1.0 host=localhost rack=localrack root=default
$SUDO $CEPH_BIN/ceph-osd -i $osd $ARGS --mkfs --mkkey --osd-uuid $uuid
- key_fn=dev/osd$osd/keyring
+ key_fn=$CEPH_DEV_DIR/osd$osd/keyring
echo adding osd$osd key to auth repository
$SUDO $CEPH_ADM -i $key_fn auth add osd.$osd osd "allow *" mon "allow profile osd"
fi
@@ -448,15 +456,15 @@ if [ "$start_mds" -eq 1 ]; then
for name in a b c d e f g h i j k l m n o p
do
if [ "$new" -eq 1 ]; then
- mkdir -p dev/mds.$name
- key_fn=dev/mds.$name/keyring
+ mkdir -p $CEPH_DEV_DIR/mds.$name
+ key_fn=$CEPH_DEV_DIR/mds.$name/keyring
if [ $overwrite_conf -eq 1 ]; then
cat <<EOF >> $conf
[mds.$name]
host = $HOSTNAME
EOF
if [ "$standby" -eq 1 ]; then
- mkdir -p dev/mds.${name}s
+ mkdir -p $CEPH_DEV_DIR/mds.${name}s
cat <<EOF >> $conf
mds standby for rank = $mds
[mds.${name}s]
@@ -469,8 +477,8 @@ EOF
$SUDO $CEPH_ADM -i $key_fn auth add mds.$name mon 'allow profile mds' osd 'allow *' mds 'allow'
if [ "$standby" -eq 1 ]; then
$SUDO $CEPH_BIN/ceph-authtool --create-keyring --gen-key --name=mds.${name}s \
- dev/mds.${name}s/keyring
- $SUDO $CEPH_ADM -i dev/mds.${name}s/keyring auth add mds.${name}s \
+ $CEPH_DEV_DIR/mds.${name}s/keyring
+ $SUDO $CEPH_ADM -i $CEPH_DEV_DIR/mds.${name}s/keyring auth add mds.${name}s \
mon 'allow *' osd 'allow *' mds 'allow'
fi
fi
@@ -497,7 +505,7 @@ fi
if [ "$start_rgw" -eq 1 ]; then
for rgw in `seq 0 $((CEPH_NUM_RGW-1))`
do
- rgwport=$(( 8000 + $rgw ))
+ rgwport=$(( $CEPH_RGW_PORT + $rgw ))
if [ "$new" -eq 1 ]; then
if [ $overwrite_conf -eq 1 ]; then
dnsname=`hostname -f`
@@ -505,13 +513,13 @@ if [ "$start_rgw" -eq 1 ]; then
[client.radosgw.rgw$rgw]
host = $HOSTNAME
$DAEMONOPTS
- keyring = out/keyring.client.radosgw.rgw$rgw
- rgw socket path = out/sock.client.radosgw.rgw$rgw
+ keyring = $CEPH_OUT_DIR/keyring.client.radosgw.rgw$rgw
+ rgw socket path = $CEPH_OUT_DIR/sock.client.radosgw.rgw$rgw
rgw dns name = $dnsname
EOF
- mkdir -p out/htdocs
- mkdir -p out/fastcgi_sock
- cat <<EOF > out/apache.conf
+ mkdir -p $CEPH_OUT_DIR/htdocs
+ mkdir -p $CEPH_OUT_DIR/fastcgi_sock
+ cat <<EOF > $CEPH_OUT_DIR/apache.conf
LoadModule env_module /usr/lib/apache2/modules/mod_env.so
LoadModule rewrite_module /usr/lib/apache2/modules/mod_rewrite.so
LoadModule fastcgi_module /usr/lib/apache2/modules/mod_fastcgi.so
@@ -519,14 +527,14 @@ LoadModule fastcgi_module /usr/lib/apache2/modules/mod_fastcgi.so
Listen $rgwport
ServerName rgwtest.example.com
-ServerRoot $PWD/out
-ErrorLog $PWD/out/apache.error.log
+ServerRoot $CEPH_OUT_DIR
+ErrorLog $CEPH_OUT_DIR/apache.error.log
LogFormat "%h l %u %t \"%r\" %>s %b \"{Referer}i\" \"%{User-agent}i\"" combined
-CustomLog $PWD/out/apache.access.log combined
-PidFile $PWD/out/apache.pid
-DocumentRoot $PWD/out/htdocs
-FastCgiIPCDir $PWD/out/fastcgi_sock
-FastCgiExternalServer $PWD/out/htdocs/rgw.fcgi -socket $PWD/out/sock.client.radosgw.rgw$rgw
+CustomLog $CEPH_OUT_DIR/apache.access.log combined
+PidFile $CEPH_OUT_DIR/apache.pid
+DocumentRoot $CEPH_OUT_DIR/htdocs
+FastCgiIPCDir $CEPH_OUT_DIR/fastcgi_sock
+FastCgiExternalServer $CEPH_OUT_DIR/htdocs/rgw.fcgi -socket $CEPH_OUT_DIR/sock.client.radosgw.rgw$rgw
RewriteEngine On
RewriteRule ^/([a-zA-Z0-9-_.]*)([/]?.*) /rgw.fcgi?page=$1&params=$2&%{QUERY_STRING} [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L]
@@ -537,7 +545,7 @@ SetEnv RGW_LOG_LEVEL 20
SetEnv RGW_PRINT_CONTINUE yes
SetEnv RGW_SHOULD_LOG yes
-<Directory $PWD/out/htdocs>
+<Directory $CEPH_OUT_DIR/htdocs>
Options +ExecCGI
AllowOverride All
SetHandler fastcgi-script
@@ -546,7 +554,7 @@ SetEnv RGW_SHOULD_LOG yes
AllowEncodedSlashes On
ServerSignature Off
EOF
- $SUDO $CEPH_ADM auth get-or-create client.radosgw.rgw$rgw osd 'allow rwx' mon 'allow r' -o out/keyring.client.radosgw.rgw$rgw
+ $SUDO $CEPH_ADM auth get-or-create client.radosgw.rgw$rgw osd 'allow rwx' mon 'allow r' -o $CEPH_OUT_DIR/keyring.client.radosgw.rgw$rgw
#akey=`echo $$ | md5sum | cut -c 1-20`
#skey=`dd if=/dev/urandom of=/tmp/random.$$ bs=1 count=40 2>/dev/null ; base64 < /tmp/random.$$ ; rm /tmp/random.$$`
@@ -554,12 +562,12 @@ EOF
skey='h7GhxuBLTrlhVUyxSPUKUV8r/2EI4ngqJxD7iBdBYLhwluN30JaT3Q=='
echo access key $akey
echo secret key $skey
- $CEPH_BIN/radosgw-admin user create --uid tester --access-key $akey --secret $skey --display-name 'M. Tester' --email tester@ceph.com
+ $CEPH_BIN/radosgw-admin user create --uid tester --access-key $akey --secret $skey --display-name 'M. Tester' --email tester@ceph.com -c $conf
fi
fi
echo start rgw$rgw on http://localhost:$rgwport
run 'rgw' $SUDO $CEPH_BIN/radosgw -n client.radosgw.rgw$rgw $ARGS
- run 'apache2' $SUDO apache2 -f $PWD/out/apache.conf
+ run 'apache2' $SUDO apache2 -f $CEPH_OUT_DIR/apache.conf
done
fi