summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Tommi Virtanen <tv@inktank.com> 2012-10-05 10:57:42 -0700
committer Sage Weil <sage@inktank.com> 2012-10-16 18:15:25 -0700
commit b0164d9902c834c95187f6b277f58a30d26cc4d3 (patch)
tree bb81485710a8ccd04115a2d06b925829172604e2
parent 5a3076fd51706e32bc55b734099037520299003b (diff)
download ceph-b0164d9902c834c95187f6b277f58a30d26cc4d3.tar.gz
ceph-disk-prepare, debian/control: Support external journals.
Previously, ceph-disk-* would only let you use a journal that was a file inside the OSD data directory. With this, you can run `ceph-disk-prepare /dev/sdb /dev/sdb` to put the journal in a second partition on the same disk as the OSD data (which might save some file system overhead), or, more interestingly, `ceph-disk-prepare /dev/sdb /dev/sdc`, which creates a new partition on /dev/sdc to use as the journal. The size of the partition is decided by $osd_journal_size. /dev/sdc must be a GPT-format disk. Multiple OSDs may share the same journal disk (using separate partitions); this way, a single fast SSD can serve as the journal for multiple spinning disks. The second use case currently requires parted, so a Recommends: for parted has been added to the Debian packaging. Closes: #3078 Closes: #3079 Signed-off-by: Tommi Virtanen <tv@inktank.com>
-rw-r--r-- debian/control 2
-rwxr-xr-x src/ceph-disk-prepare 175
2 files changed, 175 insertions, 2 deletions
diff --git a/debian/control b/debian/control
index 7bfb1a4bf09..b03fe89ed01 100644
--- a/debian/control
+++ b/debian/control
@@ -12,7 +12,7 @@ Standards-Version: 3.9.3
Package: ceph
Architecture: linux-any
Depends: ${shlibs:Depends}, ${misc:Depends}, sdparm | hdparm, binutils, ceph-common, uuid-runtime, python, xfsprogs
-Recommends: ceph-mds, librados2, librbd1, btrfs-tools, gdisk
+Recommends: ceph-mds, librados2, librbd1, btrfs-tools, gdisk, parted
Description: distributed storage and file system
Ceph is a distributed storage system designed to provide excellent
performance, reliability, and scalability.
diff --git a/src/ceph-disk-prepare b/src/ceph-disk-prepare
index b69f21e4bf3..ec3dd8250f3 100755
--- a/src/ceph-disk-prepare
+++ b/src/ceph-disk-prepare
@@ -54,6 +54,23 @@ def write_one_line(parent, name, text):
CEPH_OSD_ONDISK_MAGIC = 'ceph osd volume v026'
+# GPT partition type GUID assigned to journal partitions; prepare()
+# passes it to sgdisk --typecode when it creates the journal partition
+JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-b4b80ceff106'
+
+
+# TODO depend on python2.7
+def _check_output(*args, **kwargs):
+    """
+    Run a command and return what it wrote to stdout.
+
+    Stand-in for subprocess.check_output, which only exists from
+    Python 2.7 on.  All arguments are handed to subprocess.Popen;
+    stdout is always captured through a pipe.
+
+    Raises subprocess.CalledProcessError if the command exits with a
+    non-zero status; the captured stdout is available as the
+    ``output`` attribute of the exception.
+    """
+    process = subprocess.Popen(
+        stdout=subprocess.PIPE,
+        *args, **kwargs)
+    out, _ = process.communicate()
+    # communicate() has already reaped the child, so the exit status
+    # is available without another wait()
+    ret = process.returncode
+    if ret:
+        cmd = kwargs.get("args")
+        if cmd is None:
+            cmd = args[0]
+        # CalledProcessError only accepts output= from Python 2.7 on;
+        # set the attribute by hand so the failure path also works on
+        # the older interpreters this helper exists for
+        error = subprocess.CalledProcessError(ret, cmd)
+        error.output = out
+        raise error
+    return out
+
def get_conf(cluster, variable):
try:
@@ -86,6 +103,36 @@ def get_conf(cluster, variable):
return value
+def get_conf_with_default(cluster, variable):
+    """
+    Ask ceph-osd for the value of a config option it knows about.
+
+    The lookup is done by running ``ceph-osd --show-config-value``
+    for the given cluster; only the first line of the output is
+    returned, as a string.
+
+    This will fail for variables that are not defined in the common
+    config options.  Raises PrepareError if ceph-osd exits non-zero.
+    """
+    command = [
+        'ceph-osd',
+        '--cluster={cluster}'.format(cluster=cluster),
+        '--show-config-value={variable}'.format(variable=variable),
+    ]
+    try:
+        out = _check_output(args=command, close_fds=True)
+    except subprocess.CalledProcessError as e:
+        raise PrepareError(
+            'getting variable from configuration failed',
+            e,
+        )
+
+    # keep only what precedes the first newline
+    first_line, _, _ = out.partition('\n')
+    return first_line
+
+
def get_fsid(cluster):
fsid = get_conf(cluster=cluster, variable='fsid')
if fsid is None:
@@ -168,8 +215,48 @@ def unmount(
os.rmdir(path)
+def get_free_partition_index(dev):
+    """
+    Return the lowest partition number not yet in use on ``dev``.
+
+    The partition table is read by running
+    ``parted --machine -- DEV print``.  The first output line must be
+    one of the unit markers checked below, the second must describe
+    the disk (it starts with ``/dev/``), and every following line is
+    expected to start with ``<partition number>:``.
+
+    Raises PrepareError if parted fails or if its output does not
+    have that shape.
+    """
+    try:
+        lines = _check_output(
+            args=[
+                'parted',
+                '--machine',
+                '--',
+                dev,
+                'print',
+            ],
+        )
+    except subprocess.CalledProcessError as e:
+        raise PrepareError('cannot read partition index', e)
+
+    if not lines:
+        raise PrepareError('parted failed to output anything')
+    lines = lines.splitlines(True)
+
+    if lines[0] not in ['CHS;\n', 'CYL;\n', 'BYT;\n']:
+        raise PrepareError('weird parted units', lines[0])
+    del lines[0]
+
+    # output that stops after the units line must not escape as a
+    # bare IndexError
+    if not lines:
+        raise PrepareError('weird parted disk entry', '')
+    if not lines[0].startswith('/dev/'):
+        raise PrepareError('weird parted disk entry', lines[0])
+    del lines[0]
+
+    seen = set()
+    for line in lines:
+        if not line.strip():
+            # tolerate blank lines in the listing
+            continue
+        try:
+            idx, _ = line.split(':', 1)
+            idx = int(idx)
+        except ValueError:
+            # no colon, or a non-numeric partition number
+            raise PrepareError('weird parted partition entry', line)
+        seen.add(idx)
+
+    num = 1
+    while num in seen:
+        num += 1
+    return num
+
+
def prepare(
disk,
+ journal,
+ journal_size,
fstype,
mkfs_args,
mount_options,
@@ -184,15 +271,78 @@ def prepare(
WARNING: This will unconditionally overwrite anything given to
it.
"""
- osd_uuid = str(uuid.uuid4())
try:
+ # this kills the crab
subprocess.check_call(
args=[
'sgdisk',
'--zap-all',
'--clear',
'--mbrtogpt',
+ '--',
+ disk,
+ ],
+ )
+ except subprocess.CalledProcessError as e:
+ raise PrepareError(e)
+
+ osd_uuid = str(uuid.uuid4())
+
+ # store the partition uuid iff using external journal
+ journal_uuid = None
+
+ if journal is not None:
+ journal_uuid = str(uuid.uuid4())
+
+ if journal == disk:
+ # we're sharing the disk between osd data and journal;
+ # make journal be partition number 2, so it's pretty; put
+ # journal at end of free space so partitioning tools don't
+ # reorder them suddenly
+ num = 2
+ journal_part = '{num}:-{size}M:0'.format(
+ num=num,
+ size=journal_size,
+ )
+ else:
+ # sgdisk has no way for me to say "whatever is the next
+ # free index number" when setting type guids etc, so we
+ # need to awkwardly look up the next free number, and then
+ # fix that in the call -- and hope nobody races with us;
+ # then again nothing guards the partition table from races
+ # anyway
+ num = get_free_partition_index(dev=journal)
+ journal_part = '{num}:0:{size}M'.format(
+ num=num,
+ size=journal_size,
+ )
+
+ try:
+ subprocess.check_call(
+ args=[
+ 'sgdisk',
+ '--new={part}'.format(part=journal_part),
+ '--change-name={num}:ceph journal'.format(num=num),
+ '--partition-guid={num}:{journal_uuid}'.format(
+ num=num,
+ journal_uuid=journal_uuid,
+ ),
+ '--typecode={num}:{uuid}'.format(
+ num=num,
+ uuid=JOURNAL_UUID,
+ ),
+ '--',
+ journal,
+ ],
+ )
+ except subprocess.CalledProcessError as e:
+ raise PrepareError(e)
+
+ try:
+ subprocess.check_call(
+ args=[
+ 'sgdisk',
'--largest-new=1',
'--change-name=1:ceph data',
'--partition-guid=1:{osd_uuid}'.format(
@@ -226,6 +376,14 @@ def prepare(
path = mount(dev=dev, fstype=fstype, options=mount_options)
try:
+ if journal_uuid is not None:
+ # we're using an external journal; point to it here
+ os.symlink(
+ '/dev/disk/by-partuuid/{journal_uuid}'.format(
+ journal_uuid=journal_uuid,
+ ),
+ os.path.join(path, 'journal'),
+ )
write_one_line(path, 'ceph_fsid', cluster_uuid)
write_one_line(path, 'fsid', osd_uuid)
write_one_line(path, 'magic', CEPH_OSD_ONDISK_MAGIC)
@@ -273,6 +431,13 @@ def parse_args():
metavar='DISK',
help='path to OSD data disk block device',
)
+ parser.add_argument(
+ 'journal',
+ metavar='JOURNAL',
+ nargs='?',
+ help=('path to OSD journal disk block device;'
+ + ' leave out to store journal in file'),
+ )
parser.set_defaults(
# we want to hold on to this, for later
prog=parser.prog,
@@ -323,8 +488,16 @@ def main():
),
)
+ journal_size = get_conf_with_default(
+ cluster=args.cluster,
+ variable='osd_journal_size',
+ )
+ journal_size = int(journal_size)
+
prepare(
disk=args.disk,
+ journal=args.journal,
+ journal_size=journal_size,
fstype=args.fs_type,
mkfs_args=mkfs_args,
mount_options=mount_options,