summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSage Weil <sage@inktank.com>2013-08-15 21:47:18 -0700
committerSage Weil <sage@inktank.com>2013-08-26 13:10:06 -0700
commit0e2bfe71965eeef29b47e8032637ea820a7ce49c (patch)
tree06a023c41adbf34e1ff41644872fb3d908fc3fb4
parent6b51c960715971a0351e8203d4896cb0c4138a3f (diff)
downloadceph-0e2bfe71965eeef29b47e8032637ea820a7ce49c.tar.gz
osdc/ObjectCacher: match reads with their original rx buffers
Consider a sequence like: 1- start read on 100~200 100~200 state rx 2- truncate to 200 100~100 state rx 3- start read on 200~200 100~100 state rx 200~200 state rx 4- get 100~200 read result Currently this makes us crash on osdc/ObjectCacher.cc: 738: FAILED assert(bh->length() <= start+(loff_t)length-opos) when processing the second 200~200 bufferhead (it is too big). The larger issue, though, is that we should not be looking at this data at all; it has been truncated away. Fix this by marking each rx buffer with the read request that is sent to fill it, and only fill it from that read request. Then the first reply will fill the first 100~100 extend but not touch the other extent; the second read will do that. Signed-off-by: Sage Weil <sage@inktank.com> (cherry picked from commit b59f930ae147767eb4c9ff18c3821f6936a83227)
-rw-r--r--src/osdc/ObjectCacher.cc27
-rw-r--r--src/osdc/ObjectCacher.h18
2 files changed, 33 insertions, 12 deletions
diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc
index 51fad699555..8dd9de26cf6 100644
--- a/src/osdc/ObjectCacher.cc
+++ b/src/osdc/ObjectCacher.cc
@@ -30,6 +30,7 @@ ObjectCacher::BufferHead *ObjectCacher::Object::split(BufferHead *left, loff_t o
// split off right
ObjectCacher::BufferHead *right = new BufferHead(this);
right->last_write_tid = left->last_write_tid;
+ right->last_read_tid = left->last_read_tid;
right->set_state(left->get_state());
right->snapc = left->snapc;
@@ -500,6 +501,7 @@ ObjectCacher::ObjectCacher(CephContext *cct_, string name, WritebackHandler& wb,
max_size(max_bytes), max_objects(max_objects),
block_writes_upfront(block_writes_upfront),
flush_set_callback(flush_callback), flush_set_callback_arg(flush_callback_arg),
+ last_read_tid(0),
flusher_stop(false), flusher_thread(this), finisher(cct),
stat_clean(0), stat_zero(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_missing(0),
stat_error(0), stat_dirty_waiting(0), reads_outstanding(0)
@@ -603,25 +605,29 @@ void ObjectCacher::bh_read(BufferHead *bh)
<< reads_outstanding << dendl;
mark_rx(bh);
+ bh->last_read_tid = ++last_read_tid;
// finisher
- C_ReadFinish *onfinish = new C_ReadFinish(this, bh->ob,
+ C_ReadFinish *onfinish = new C_ReadFinish(this, bh->ob, bh->last_read_tid,
bh->start(), bh->length());
// go
writeback_handler.read(bh->ob->get_oid(), bh->ob->get_oloc(),
bh->start(), bh->length(), bh->ob->get_snap(),
&onfinish->bl, bh->ob->truncate_size, bh->ob->truncate_seq,
onfinish);
+
++reads_outstanding;
}
-void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid, loff_t start,
- uint64_t length, bufferlist &bl, int r,
+void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid, tid_t tid,
+ loff_t start, uint64_t length,
+ bufferlist &bl, int r,
bool trust_enoent)
{
assert(lock.is_locked());
ldout(cct, 7) << "bh_read_finish "
<< oid
+ << " tid " << tid
<< " " << start << "~" << length
<< " (bl is " << bl.length() << ")"
<< " returned " << r
@@ -711,7 +717,7 @@ void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid, loff_t start,
BufferHead *bh = p->second;
ldout(cct, 20) << "checking bh " << *bh << dendl;
-
+
// finishers?
for (map<loff_t, list<Context*> >::iterator it = bh->waitfor_read.begin();
it != bh->waitfor_read.end();
@@ -720,9 +726,9 @@ void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid, loff_t start,
bh->waitfor_read.clear();
if (bh->start() > opos) {
- ldout(cct, 1) << "weirdness: gap when applying read results, "
- << opos << "~" << bh->start() - opos
- << dendl;
+ ldout(cct, 1) << "bh_read_finish skipping gap "
+ << opos << "~" << bh->start() - opos
+ << dendl;
opos = bh->start();
continue;
}
@@ -733,6 +739,13 @@ void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid, loff_t start,
continue;
}
+ if (bh->last_read_tid != tid) {
+ ldout(cct, 10) << "bh_read_finish bh->last_read_tid " << bh->last_read_tid
+ << " != tid " << tid << ", skipping" << dendl;
+ opos = bh->end();
+ continue;
+ }
+
assert(opos >= bh->start());
assert(bh->start() == opos); // we don't merge rx bh's... yet!
assert(bh->length() <= start+(loff_t)length-opos);
diff --git a/src/osdc/ObjectCacher.h b/src/osdc/ObjectCacher.h
index ac2833b9ea6..9b045db5da9 100644
--- a/src/osdc/ObjectCacher.h
+++ b/src/osdc/ObjectCacher.h
@@ -104,6 +104,7 @@ class ObjectCacher {
Object *ob;
bufferlist bl;
tid_t last_write_tid; // version of bh (if non-zero)
+ tid_t last_read_tid; // tid of last read op (if any)
utime_t last_write;
SnapContext snapc;
int error; // holds return value for failed reads
@@ -116,6 +117,7 @@ class ObjectCacher {
ref(0),
ob(o),
last_write_tid(0),
+ last_read_tid(0),
error(0) {
ex.start = ex.length = 0;
}
@@ -339,6 +341,8 @@ class ObjectCacher {
vector<hash_map<sobject_t, Object*> > objects; // indexed by pool_id
+ tid_t last_read_tid;
+
set<BufferHead*> dirty_bh;
LRU bh_lru_dirty, bh_lru_rest;
LRU ob_lru;
@@ -455,8 +459,9 @@ class ObjectCacher {
bool external_call);
public:
- void bh_read_finish(int64_t poolid, sobject_t oid, loff_t offset,
- uint64_t length, bufferlist &bl, int r,
+ void bh_read_finish(int64_t poolid, sobject_t oid, tid_t tid,
+ loff_t offset, uint64_t length,
+ bufferlist &bl, int r,
bool trust_enoent);
void bh_write_commit(int64_t poolid, sobject_t oid, loff_t offset,
uint64_t length, tid_t t, int r);
@@ -469,17 +474,20 @@ class ObjectCacher {
uint64_t length;
xlist<C_ReadFinish*>::item set_item;
bool trust_enoent;
+ tid_t tid;
public:
bufferlist bl;
- C_ReadFinish(ObjectCacher *c, Object *ob, loff_t s, uint64_t l) :
+ C_ReadFinish(ObjectCacher *c, Object *ob, tid_t t, loff_t s, uint64_t l) :
oc(c), poolid(ob->oloc.pool), oid(ob->get_soid()), start(s), length(l),
- set_item(this), trust_enoent(true) {
+ set_item(this), trust_enoent(true),
+ tid(t) {
ob->reads.push_back(&set_item);
}
void finish(int r) {
- oc->bh_read_finish(poolid, oid, start, length, bl, r, trust_enoent);
+ oc->bh_read_finish(poolid, oid, tid, start, length, bl, r, trust_enoent);
+
// object destructor clears the list
if (set_item.is_on_list())
set_item.remove_myself();