summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSage Weil <sage@inktank.com>2013-09-23 10:26:48 -0700
committerSage Weil <sage@inktank.com>2013-09-23 10:26:48 -0700
commit10f291838e906dfe3816a6df1883386ddbf8c357 (patch)
tree89288a03060e49cc4e86b0da125bd249310a77e1
parentb683005c219e3d6da8fb241a684d348440ff8270 (diff)
parent89e8b8be2fbfea065defba39ee2d2a620b21a11f (diff)
downloadceph-10f291838e906dfe3816a6df1883386ddbf8c357.tar.gz
Merge pull request #603 from dachary/wip-erasure-code-example
erasure code example cleanup
-rw-r--r--src/osd/ErasureCodeInterface.h88
-rw-r--r--src/test/osd/ErasureCodeExample.h85
-rw-r--r--src/test/osd/TestErasureCodeExample.cc46
3 files changed, 175 insertions, 44 deletions
diff --git a/src/osd/ErasureCodeInterface.h b/src/osd/ErasureCodeInterface.h
index 5ce2842d562..656ee91987e 100644
--- a/src/osd/ErasureCodeInterface.h
+++ b/src/osd/ErasureCodeInterface.h
@@ -25,15 +25,15 @@
are systematic (i.e. the data is not mangled and can be
reconstructed by concatenating chunks ).
- All methods returns **0** on success and a negative value on
+ All methods return **0** on success and a negative value on
error. If the value returned on error is not explained in
**ErasureCodeInterface**, the sources or the documentation of the
- interface implementer must be read to figure out what it means. It
- is recommended that each error code matches an *errno* value that
- relates to the cause of the error.
+ interface implementer (i.e. the plugin ) must be read to figure
+ out what it means. It is recommended that each error code matches
+ an *errno* value that relates to the cause of the error.
Assuming the interface implementer provides three data chunks ( K
- = 3 ) and two coding chunks ( M = 2 ), a buffer can be encoded as
+ = 3 ) and two coding chunks ( M = 2 ), a buffer could be encoded as
follows:
~~~~~~~~~~~~~~~~{.c}
@@ -50,16 +50,20 @@
encoded[4] // coding chunk 1
~~~~~~~~~~~~~~~~
- If encoded[2] ( which contains **EF** ) is missing and accessing
- encoded[3] ( the first coding chunk ) is more expensive than
- accessing encoded[4] ( the second coding chunk ), the
- **minimum_to_decode_with_cost** method can be called as follows:
+ The **minimum_to_decode_with_cost** method can be used to minimize
+ the cost of fetching the chunks necessary to retrieve a given
+ content. For instance, if encoded[2] (contained **EF**) is missing
+ and accessing encoded[3] (the first coding chunk) is more
+ expensive than accessing encoded[4] (the second coding chunk),
+ **minimum_to_decode_with_cost** is expected to chose the first
+ coding chunk.
~~~~~~~~~~~~~~~~{.c}
set<int> want_to_read(2); // want the chunk containing "EF"
map<int,int> available(
0 => 1, // data chunk 0 : available and costs 1
1 => 1, // data chunk 1 : available and costs 1
+ // data chunk 2 : missing
3 => 9, // coding chunk 1 : available and costs 9
4 => 1, // coding chunk 2 : available and costs 1
);
@@ -67,14 +71,14 @@
minimum_to_decode_with_cost(want_to_read,
available,
&minimum);
- minimum == set<int>(0, 1, 4);
+ minimum == set<int>(0, 1, 4); // NOT set<int>(0, 1, 3);
~~~~~~~~~~~~~~~~
It sets **minimum** with three chunks to reconstruct the desired
data chunk and will pick the second coding chunk ( 4 ) because it
is less expensive ( 1 < 9 ) to retrieve than the first coding
chunk ( 3 ). The caller is responsible for retrieving the chunks
- and call **decode** to reconstruct the second data chunk content.
+ and call **decode** to reconstruct the second data chunk.
~~~~~~~~~~~~~~~~{.c}
map<int,bufferlist> chunks;
@@ -85,6 +89,10 @@
decoded[2] == "EF"
~~~~~~~~~~~~~~~~
+ The semantic of the cost value is defined by the caller and must
+ be known to the implementer. For instance, it may be more
+ expensive to retrieve two chunks with cost 1 + 9 = 10 than two
+ chunks with cost 6 + 6 = 12.
*/
#include <map>
@@ -113,7 +121,7 @@ namespace ceph {
*
* @param [in] want_to_read chunk indexes to be decoded
* @param [in] available chunk indexes containing valid data
- * @param [out] minimum chunk indexes to retrieve for decode
+ * @param [out] minimum chunk indexes to retrieve
* @return **0** on success or a negative errno on error.
*/
virtual int minimum_to_decode(const set<int> &want_to_read,
@@ -124,8 +132,8 @@ namespace ceph {
* Compute the smallest subset of **available** chunks that needs
* to be retrieved in order to successfully decode
* **want_to_read** chunks. If there are more than one possible
- * subset, select the subset that contains the chunks with the
- * lowest cost.
+ * subset, select the subset that minimizes the overall retrieval
+ * cost.
*
* The **available** parameter maps chunk indexes to their
* retrieval cost. The higher the cost value, the more costly it
@@ -141,7 +149,7 @@ namespace ceph {
* @param [in] want_to_read chunk indexes to be decoded
* @param [in] available map chunk indexes containing valid data
* to their retrieval cost
- * @param [out] minimum chunk indexes to retrieve for decode
+ * @param [out] minimum chunk indexes to retrieve
* @return **0** on success or a negative errno on error.
*/
virtual int minimum_to_decode_with_cost(const set<int> &want_to_read,
@@ -150,15 +158,31 @@ namespace ceph {
/**
* Encode the content of **in** and store the result in
- * **encoded**. The **encoded** map contains at least all
- * chunk indexes found in the **want_to_encode** set.
+ * **encoded**. All buffers pointed to by **encoded** have the
+ * same size. The **encoded** map contains at least all chunk
+ * indexes found in the **want_to_encode** set.
*
* The **encoded** map is expected to be a pointer to an empty
* map.
*
+ * Assuming the **in** parameter is **length** bytes long,
+ * the concatenation of the first **length** bytes of the
+ * **encoded** buffers is equal to the content of the **in**
+ * parameter.
+ *
* The **encoded** map may contain more chunks than required by
* **want_to_encode** and the caller is expected to permanently
- * store all of them, not just the chunks from **want_to_encode**.
+ * store all of them, not just the chunks listed in
+ * **want_to_encode**.
+ *
+ * The **encoded** map may contain pointers to data stored in
+ * the **in** parameter. If the caller modifies the content of
+ * **in** after calling the encode method, it may have a side
+ * effect on the content of **encoded**.
+ *
+ * The **encoded** map may contain pointers to buffers allocated
+ * by the encode method. They will be freed when **encoded** is
+ * freed. The allocation method is not specified.
*
* Returns 0 on success.
*
@@ -172,24 +196,30 @@ namespace ceph {
map<int, bufferlist> *encoded) = 0;
/**
- * Decode the **chunks** and store at least **want_to_read** chunks
- * in **decoded**.
+ * Decode the **chunks** and store at least **want_to_read**
+ * chunks in **decoded**.
+ *
+ * The **decoded** map must be a pointer to an empty map.
*
* There must be enough **chunks** ( as returned by
* **minimum_to_decode** or **minimum_to_decode_with_cost** ) to
- * perform a successfull decoding of all chunks found in
+ * perform a successful decoding of all chunks listed in
* **want_to_read**.
*
- * The **decoded** map is expected to be a pointer to an empty
- * map.
+ * All buffers pointed by **in** must have the same size.
+ *
+ * On success, the **decoded** map may contain more chunks than
+ * required by **want_to_read** and they can safely be used by the
+ * caller.
*
- * The **decoded** map may contain more chunks than required by
- * **want_to_read** and they can safely be used by the caller.
+ * If a chunk is listed in **want_to_read** and there is a
+ * corresponding **bufferlist** in **chunks**, it will be
+ * referenced in **decoded**. If not it will be reconstructed from
+ * the existing chunks.
*
- * If a chunk is listed in **want_to_read** and there is
- * corresponding **bufferlist** in **chunks**, it will be copied
- * verbatim into **decoded**. If not it will be reconstructed from
- * the existing chunks.
+ * Because **decoded** may contain pointers to data found in
+ * **chunks**, modifying the content of **chunks** after calling
+ * decode may have a side effect on the content of **decoded**.
*
* Returns 0 on success.
*
diff --git a/src/test/osd/ErasureCodeExample.h b/src/test/osd/ErasureCodeExample.h
index 896e614c6b5..95d79feb923 100644
--- a/src/test/osd/ErasureCodeExample.h
+++ b/src/test/osd/ErasureCodeExample.h
@@ -19,12 +19,19 @@
#include <unistd.h>
#include <errno.h>
+#include <algorithm>
#include <sstream>
#include "osd/ErasureCodeInterface.h"
+#define FIRST_DATA_CHUNK 0
+#define SECOND_DATA_CHUNK 1
#define DATA_CHUNKS 2u
+
+#define CODING_CHUNK 2
#define CODING_CHUNKS 1u
+#define MINIMUM_TO_RECOVER 2u
+
class ErasureCodeExample : public ErasureCodeInterface {
public:
useconds_t delay;
@@ -43,21 +50,43 @@ public:
virtual int minimum_to_decode(const set<int> &want_to_read,
const set<int> &available_chunks,
set<int> *minimum) {
- if (available_chunks.size() < DATA_CHUNKS)
+ if (includes(available_chunks.begin(), available_chunks.end(),
+ want_to_read.begin(), want_to_read.end())) {
+ *minimum = want_to_read;
+ return 0;
+ } else if (available_chunks.size() >= MINIMUM_TO_RECOVER) {
+ *minimum = available_chunks;
+ return 0;
+ } else {
return -EIO;
- set<int>::iterator i;
- unsigned j;
- for (i = available_chunks.begin(), j = 0; j < DATA_CHUNKS; i++, j++)
- minimum->insert(*i);
- return 0;
+ }
}
virtual int minimum_to_decode_with_cost(const set<int> &want_to_read,
const map<int, int> &available,
set<int> *minimum) {
+ //
+ // If one chunk is more expensive to fetch than the others,
+ // recover it instead. For instance, if the cost reflects the
+ // time it takes for a chunk to be retrieved from a remote
+ // OSD and if CPU is cheap, it could make sense to recover
+ // instead of fetching the chunk.
+ //
+ map<int, int> c2c(available);
+ if (c2c.size() > DATA_CHUNKS) {
+ if (c2c[FIRST_DATA_CHUNK] > c2c[SECOND_DATA_CHUNK] &&
+ c2c[FIRST_DATA_CHUNK] > c2c[CODING_CHUNK])
+ c2c.erase(FIRST_DATA_CHUNK);
+ else if(c2c[SECOND_DATA_CHUNK] > c2c[FIRST_DATA_CHUNK] &&
+ c2c[SECOND_DATA_CHUNK] > c2c[CODING_CHUNK])
+ c2c.erase(SECOND_DATA_CHUNK);
+ else if(c2c[CODING_CHUNK] > c2c[FIRST_DATA_CHUNK] &&
+ c2c[CODING_CHUNK] > c2c[SECOND_DATA_CHUNK])
+ c2c.erase(CODING_CHUNK);
+ }
set <int> available_chunks;
- for (map<int, int>::const_iterator i = available.begin();
- i != available.end();
+ for (map<int, int>::const_iterator i = c2c.begin();
+ i != c2c.end();
i++)
available_chunks.insert(i->first);
return minimum_to_decode(want_to_read, available_chunks, minimum);
@@ -66,16 +95,28 @@ public:
virtual int encode(const set<int> &want_to_encode,
const bufferlist &in,
map<int, bufferlist> *encoded) {
+ //
+ // make sure all data chunks have the same length, allocating
+ // padding if necessary.
+ //
unsigned chunk_length = ( in.length() / DATA_CHUNKS ) + 1;
unsigned length = chunk_length * ( DATA_CHUNKS + CODING_CHUNKS );
bufferlist out(in);
bufferptr pad(length - in.length());
pad.zero(0, DATA_CHUNKS);
out.push_back(pad);
+ //
+ // compute the coding chunk with first chunk ^ second chunk
+ //
char *p = out.c_str();
- for (unsigned i = 0; i < chunk_length * DATA_CHUNKS; i++)
- p[i + 2 * chunk_length] =
- p[i + 0 * chunk_length] ^ p[i + 1 * chunk_length];
+ for (unsigned i = 0; i < chunk_length; i++)
+ p[i + CODING_CHUNK * chunk_length] =
+ p[i + FIRST_DATA_CHUNK * chunk_length] ^
+ p[i + SECOND_DATA_CHUNK * chunk_length];
+ //
+ // populate the bufferlist with bufferptr pointing
+ // to chunk boundaries
+ //
const bufferptr ptr = out.buffers().front();
for (set<int>::iterator j = want_to_encode.begin();
j != want_to_encode.end();
@@ -89,14 +130,30 @@ public:
virtual int decode(const set<int> &want_to_read,
const map<int, bufferlist> &chunks,
map<int, bufferlist> *decoded) {
-
+ //
+ // All chunks have the same size
+ //
unsigned chunk_length = (*chunks.begin()).second.length();
for (set<int>::iterator i = want_to_read.begin();
i != want_to_read.end();
i++) {
- if (chunks.find(*i) != chunks.end())
+ if (chunks.find(*i) != chunks.end()) {
+ //
+ // If the chunk is available, just copy the bufferptr pointer
+ // to the decoded argument.
+ //
(*decoded)[*i] = chunks.find(*i)->second;
- else {
+ } else if(chunks.size() != 2) {
+ //
+ // If a chunk is missing and there are not enough chunks
+ // to recover, abort.
+ //
+ return -ERANGE;
+ } else {
+ //
+ // No matter what the missing chunk is, XOR of the other
+ // two recovers it.
+ //
bufferptr chunk(chunk_length);
map<int, bufferlist>::const_iterator k = chunks.begin();
const char *a = k->second.buffers().front().c_str();
diff --git a/src/test/osd/TestErasureCodeExample.cc b/src/test/osd/TestErasureCodeExample.cc
index 66f521d7863..6866dfdbb9f 100644
--- a/src/test/osd/TestErasureCodeExample.cc
+++ b/src/test/osd/TestErasureCodeExample.cc
@@ -65,10 +65,54 @@ TEST(ErasureCodeExample, minimum_to_decode)
EXPECT_EQ(0, example.minimum_to_decode(want_to_read,
available_chunks,
&minimum));
+ EXPECT_EQ(1u, minimum.size());
+ EXPECT_EQ(1u, minimum.count(1));
+ }
+}
+
+TEST(ErasureCodeExample, minimum_to_decode_with_cost)
+{
+ map<std::string,std::string> parameters;
+ ErasureCodeExample example(parameters);
+ map<int,int> available;
+ set<int> want_to_read;
+ want_to_read.insert(1);
+ {
+ set<int> minimum;
+ EXPECT_EQ(-EIO, example.minimum_to_decode_with_cost(want_to_read,
+ available,
+ &minimum));
+ }
+ available[0] = 1;
+ available[2] = 1;
+ {
+ set<int> minimum;
+ EXPECT_EQ(0, example.minimum_to_decode_with_cost(want_to_read,
+ available,
+ &minimum));
EXPECT_EQ(2u, minimum.size());
EXPECT_EQ(1u, minimum.count(0));
+ EXPECT_EQ(1u, minimum.count(2));
+ }
+ {
+ set<int> minimum;
+ available[1] = 1;
+ EXPECT_EQ(0, example.minimum_to_decode_with_cost(want_to_read,
+ available,
+ &minimum));
+ EXPECT_EQ(1u, minimum.size());
EXPECT_EQ(1u, minimum.count(1));
}
+ {
+ set<int> minimum;
+ available[1] = 2;
+ EXPECT_EQ(0, example.minimum_to_decode_with_cost(want_to_read,
+ available,
+ &minimum));
+ EXPECT_EQ(2u, minimum.size());
+ EXPECT_EQ(1u, minimum.count(0));
+ EXPECT_EQ(1u, minimum.count(2));
+ }
}
TEST(ErasureCodeExample, encode_decode)
@@ -142,5 +186,5 @@ int main(int argc, char **argv) {
}
// Local Variables:
-// compile-command: "cd ../.. ; make -j4 && make unittest_erasure_code_example && ./unittest_erasure_code_example --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
+// compile-command: "cd ../.. ; make -j4 && make unittest_erasure_code_example && valgrind --leak-check=full --tool=memcheck ./unittest_erasure_code_example --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
// End: