diff options
author | Sage Weil <sage@inktank.com> | 2013-09-23 10:26:48 -0700 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2013-09-23 10:26:48 -0700 |
commit | 10f291838e906dfe3816a6df1883386ddbf8c357 (patch) | |
tree | 89288a03060e49cc4e86b0da125bd249310a77e1 | |
parent | b683005c219e3d6da8fb241a684d348440ff8270 (diff) | |
parent | 89e8b8be2fbfea065defba39ee2d2a620b21a11f (diff) | |
download | ceph-10f291838e906dfe3816a6df1883386ddbf8c357.tar.gz |
Merge pull request #603 from dachary/wip-erasure-code-example
erasure code example cleanup
-rw-r--r-- | src/osd/ErasureCodeInterface.h | 88 | ||||
-rw-r--r-- | src/test/osd/ErasureCodeExample.h | 85 | ||||
-rw-r--r-- | src/test/osd/TestErasureCodeExample.cc | 46 |
3 files changed, 175 insertions, 44 deletions
diff --git a/src/osd/ErasureCodeInterface.h b/src/osd/ErasureCodeInterface.h index 5ce2842d562..656ee91987e 100644 --- a/src/osd/ErasureCodeInterface.h +++ b/src/osd/ErasureCodeInterface.h @@ -25,15 +25,15 @@ are systematic (i.e. the data is not mangled and can be reconstructed by concatenating chunks ). - All methods returns **0** on success and a negative value on + All methods return **0** on success and a negative value on error. If the value returned on error is not explained in **ErasureCodeInterface**, the sources or the documentation of the - interface implementer must be read to figure out what it means. It - is recommended that each error code matches an *errno* value that - relates to the cause of the error. + interface implementer (i.e. the plugin ) must be read to figure + out what it means. It is recommended that each error code matches + an *errno* value that relates to the cause of the error. Assuming the interface implementer provides three data chunks ( K - = 3 ) and two coding chunks ( M = 2 ), a buffer can be encoded as + = 3 ) and two coding chunks ( M = 2 ), a buffer could be encoded as follows: ~~~~~~~~~~~~~~~~{.c} @@ -50,16 +50,20 @@ encoded[4] // coding chunk 1 ~~~~~~~~~~~~~~~~ - If encoded[2] ( which contains **EF** ) is missing and accessing - encoded[3] ( the first coding chunk ) is more expensive than - accessing encoded[4] ( the second coding chunk ), the - **minimum_to_decode_with_cost** method can be called as follows: + The **minimum_to_decode_with_cost** method can be used to minimize + the cost of fetching the chunks necessary to retrieve a given + content. For instance, if encoded[2] (contained **EF**) is missing + and accessing encoded[3] (the first coding chunk) is more + expensive than accessing encoded[4] (the second coding chunk), + **minimum_to_decode_with_cost** is expected to chose the first + coding chunk. ~~~~~~~~~~~~~~~~{.c} set<int> want_to_read(2); // want the chunk containing "EF" map<int,int> available( 0 => 1, // data chunk 0 : available and costs 1 1 => 1, // data chunk 1 : available and costs 1 + // data chunk 2 : missing 3 => 9, // coding chunk 1 : available and costs 9 4 => 1, // coding chunk 2 : available and costs 1 ); @@ -67,14 +71,14 @@ minimum_to_decode_with_cost(want_to_read, available, &minimum); - minimum == set<int>(0, 1, 4); + minimum == set<int>(0, 1, 4); // NOT set<int>(0, 1, 3); ~~~~~~~~~~~~~~~~ It sets **minimum** with three chunks to reconstruct the desired data chunk and will pick the second coding chunk ( 4 ) because it is less expensive ( 1 < 9 ) to retrieve than the first coding chunk ( 3 ). The caller is responsible for retrieving the chunks - and call **decode** to reconstruct the second data chunk content. + and call **decode** to reconstruct the second data chunk. ~~~~~~~~~~~~~~~~{.c} map<int,bufferlist> chunks; @@ -85,6 +89,10 @@ decoded[2] == "EF" ~~~~~~~~~~~~~~~~ + The semantic of the cost value is defined by the caller and must + be known to the implementer. For instance, it may be more + expensive to retrieve two chunks with cost 1 + 9 = 10 than two + chunks with cost 6 + 6 = 12. */ #include <map> @@ -113,7 +121,7 @@ namespace ceph { * * @param [in] want_to_read chunk indexes to be decoded * @param [in] available chunk indexes containing valid data - * @param [out] minimum chunk indexes to retrieve for decode + * @param [out] minimum chunk indexes to retrieve * @return **0** on success or a negative errno on error. */ virtual int minimum_to_decode(const set<int> &want_to_read, @@ -124,8 +132,8 @@ namespace ceph { * Compute the smallest subset of **available** chunks that needs * to be retrieved in order to successfully decode * **want_to_read** chunks. If there are more than one possible - * subset, select the subset that contains the chunks with the - * lowest cost. + * subset, select the subset that minimizes the overall retrieval + * cost. * * The **available** parameter maps chunk indexes to their * retrieval cost. The higher the cost value, the more costly it @@ -141,7 +149,7 @@ namespace ceph { * @param [in] want_to_read chunk indexes to be decoded * @param [in] available map chunk indexes containing valid data * to their retrieval cost - * @param [out] minimum chunk indexes to retrieve for decode + * @param [out] minimum chunk indexes to retrieve * @return **0** on success or a negative errno on error. */ virtual int minimum_to_decode_with_cost(const set<int> &want_to_read, @@ -150,15 +158,31 @@ namespace ceph { /** * Encode the content of **in** and store the result in - * **encoded**. The **encoded** map contains at least all - * chunk indexes found in the **want_to_encode** set. + * **encoded**. All buffers pointed to by **encoded** have the + * same size. The **encoded** map contains at least all chunk + * indexes found in the **want_to_encode** set. * * The **encoded** map is expected to be a pointer to an empty * map. * + * Assuming the **in** parameter is **length** bytes long, + * the concatenation of the first **length** bytes of the + * **encoded** buffers is equal to the content of the **in** + * parameter. + * * The **encoded** map may contain more chunks than required by * **want_to_encode** and the caller is expected to permanently - * store all of them, not just the chunks from **want_to_encode**. + * store all of them, not just the chunks listed in + * **want_to_encode**. + * + * The **encoded** map may contain pointers to data stored in + * the **in** parameter. If the caller modifies the content of + * **in** after calling the encode method, it may have a side + * effect on the content of **encoded**. + * + * The **encoded** map may contain pointers to buffers allocated + * by the encode method. They will be freed when **encoded** is + * freed. The allocation method is not specified. * * Returns 0 on success. * @@ -172,24 +196,30 @@ namespace ceph { map<int, bufferlist> *encoded) = 0; /** - * Decode the **chunks** and store at least **want_to_read** chunks - * in **decoded**. + * Decode the **chunks** and store at least **want_to_read** + * chunks in **decoded**. + * + * The **decoded** map must be a pointer to an empty map. * * There must be enough **chunks** ( as returned by * **minimum_to_decode** or **minimum_to_decode_with_cost** ) to - * perform a successfull decoding of all chunks found in + * perform a successful decoding of all chunks listed in * **want_to_read**. * - * The **decoded** map is expected to be a pointer to an empty - * map. + * All buffers pointed by **in** must have the same size. + * + * On success, the **decoded** map may contain more chunks than + * required by **want_to_read** and they can safely be used by the + * caller. * - * The **decoded** map may contain more chunks than required by - * **want_to_read** and they can safely be used by the caller. + * If a chunk is listed in **want_to_read** and there is a + * corresponding **bufferlist** in **chunks**, it will be + * referenced in **decoded**. If not it will be reconstructed from + * the existing chunks. * - * If a chunk is listed in **want_to_read** and there is - * corresponding **bufferlist** in **chunks**, it will be copied - * verbatim into **decoded**. If not it will be reconstructed from - * the existing chunks. + * Because **decoded** may contain pointers to data found in + * **chunks**, modifying the content of **chunks** after calling + * decode may have a side effect on the content of **decoded**. * * Returns 0 on success. * diff --git a/src/test/osd/ErasureCodeExample.h b/src/test/osd/ErasureCodeExample.h index 896e614c6b5..95d79feb923 100644 --- a/src/test/osd/ErasureCodeExample.h +++ b/src/test/osd/ErasureCodeExample.h @@ -19,12 +19,19 @@ #include <unistd.h> #include <errno.h> +#include <algorithm> #include <sstream> #include "osd/ErasureCodeInterface.h" +#define FIRST_DATA_CHUNK 0 +#define SECOND_DATA_CHUNK 1 #define DATA_CHUNKS 2u + +#define CODING_CHUNK 2 #define CODING_CHUNKS 1u +#define MINIMUM_TO_RECOVER 2u + class ErasureCodeExample : public ErasureCodeInterface { public: useconds_t delay; @@ -43,21 +50,43 @@ public: virtual int minimum_to_decode(const set<int> &want_to_read, const set<int> &available_chunks, set<int> *minimum) { - if (available_chunks.size() < DATA_CHUNKS) + if (includes(available_chunks.begin(), available_chunks.end(), + want_to_read.begin(), want_to_read.end())) { + *minimum = want_to_read; + return 0; + } else if (available_chunks.size() >= MINIMUM_TO_RECOVER) { + *minimum = available_chunks; + return 0; + } else { return -EIO; - set<int>::iterator i; - unsigned j; - for (i = available_chunks.begin(), j = 0; j < DATA_CHUNKS; i++, j++) - minimum->insert(*i); - return 0; + } } virtual int minimum_to_decode_with_cost(const set<int> &want_to_read, const map<int, int> &available, set<int> *minimum) { + // + // If one chunk is more expensive to fetch than the others, + // recover it instead. For instance, if the cost reflects the + // time it takes for a chunk to be retrieved from a remote + // OSD and if CPU is cheap, it could make sense to recover + // instead of fetching the chunk. + // + map<int, int> c2c(available); + if (c2c.size() > DATA_CHUNKS) { + if (c2c[FIRST_DATA_CHUNK] > c2c[SECOND_DATA_CHUNK] && + c2c[FIRST_DATA_CHUNK] > c2c[CODING_CHUNK]) + c2c.erase(FIRST_DATA_CHUNK); + else if(c2c[SECOND_DATA_CHUNK] > c2c[FIRST_DATA_CHUNK] && + c2c[SECOND_DATA_CHUNK] > c2c[CODING_CHUNK]) + c2c.erase(SECOND_DATA_CHUNK); + else if(c2c[CODING_CHUNK] > c2c[FIRST_DATA_CHUNK] && + c2c[CODING_CHUNK] > c2c[SECOND_DATA_CHUNK]) + c2c.erase(CODING_CHUNK); + } set <int> available_chunks; - for (map<int, int>::const_iterator i = available.begin(); - i != available.end(); + for (map<int, int>::const_iterator i = c2c.begin(); + i != c2c.end(); i++) available_chunks.insert(i->first); return minimum_to_decode(want_to_read, available_chunks, minimum); @@ -66,16 +95,28 @@ public: virtual int encode(const set<int> &want_to_encode, const bufferlist &in, map<int, bufferlist> *encoded) { + // + // make sure all data chunks have the same length, allocating + // padding if necessary. + // unsigned chunk_length = ( in.length() / DATA_CHUNKS ) + 1; unsigned length = chunk_length * ( DATA_CHUNKS + CODING_CHUNKS ); bufferlist out(in); bufferptr pad(length - in.length()); pad.zero(0, DATA_CHUNKS); out.push_back(pad); + // + // compute the coding chunk with first chunk ^ second chunk + // char *p = out.c_str(); - for (unsigned i = 0; i < chunk_length * DATA_CHUNKS; i++) - p[i + 2 * chunk_length] = - p[i + 0 * chunk_length] ^ p[i + 1 * chunk_length]; + for (unsigned i = 0; i < chunk_length; i++) + p[i + CODING_CHUNK * chunk_length] = + p[i + FIRST_DATA_CHUNK * chunk_length] ^ + p[i + SECOND_DATA_CHUNK * chunk_length]; + // + // populate the bufferlist with bufferptr pointing + // to chunk boundaries + // const bufferptr ptr = out.buffers().front(); for (set<int>::iterator j = want_to_encode.begin(); j != want_to_encode.end(); @@ -89,14 +130,30 @@ public: virtual int decode(const set<int> &want_to_read, const map<int, bufferlist> &chunks, map<int, bufferlist> *decoded) { - + // + // All chunks have the same size + // unsigned chunk_length = (*chunks.begin()).second.length(); for (set<int>::iterator i = want_to_read.begin(); i != want_to_read.end(); i++) { - if (chunks.find(*i) != chunks.end()) + if (chunks.find(*i) != chunks.end()) { + // + // If the chunk is available, just copy the bufferptr pointer + // to the decoded argument. + // (*decoded)[*i] = chunks.find(*i)->second; - else { + } else if(chunks.size() != 2) { + // + // If a chunk is missing and there are not enough chunks + // to recover, abort. + // + return -ERANGE; + } else { + // + // No matter what the missing chunk is, XOR of the other + // two recovers it. + // bufferptr chunk(chunk_length); map<int, bufferlist>::const_iterator k = chunks.begin(); const char *a = k->second.buffers().front().c_str(); diff --git a/src/test/osd/TestErasureCodeExample.cc b/src/test/osd/TestErasureCodeExample.cc index 66f521d7863..6866dfdbb9f 100644 --- a/src/test/osd/TestErasureCodeExample.cc +++ b/src/test/osd/TestErasureCodeExample.cc @@ -65,10 +65,54 @@ TEST(ErasureCodeExample, minimum_to_decode) EXPECT_EQ(0, example.minimum_to_decode(want_to_read, available_chunks, &minimum)); + EXPECT_EQ(1u, minimum.size()); + EXPECT_EQ(1u, minimum.count(1)); + } +} + +TEST(ErasureCodeExample, minimum_to_decode_with_cost) +{ + map<std::string,std::string> parameters; + ErasureCodeExample example(parameters); + map<int,int> available; + set<int> want_to_read; + want_to_read.insert(1); + { + set<int> minimum; + EXPECT_EQ(-EIO, example.minimum_to_decode_with_cost(want_to_read, + available, + &minimum)); + } + available[0] = 1; + available[2] = 1; + { + set<int> minimum; + EXPECT_EQ(0, example.minimum_to_decode_with_cost(want_to_read, + available, + &minimum)); EXPECT_EQ(2u, minimum.size()); EXPECT_EQ(1u, minimum.count(0)); + EXPECT_EQ(1u, minimum.count(2)); + } + { + set<int> minimum; + available[1] = 1; + EXPECT_EQ(0, example.minimum_to_decode_with_cost(want_to_read, + available, + &minimum)); + EXPECT_EQ(1u, minimum.size()); EXPECT_EQ(1u, minimum.count(1)); } + { + set<int> minimum; + available[1] = 2; + EXPECT_EQ(0, example.minimum_to_decode_with_cost(want_to_read, + available, + &minimum)); + EXPECT_EQ(2u, minimum.size()); + EXPECT_EQ(1u, minimum.count(0)); + EXPECT_EQ(1u, minimum.count(2)); + } } TEST(ErasureCodeExample, encode_decode) @@ -142,5 +186,5 @@ int main(int argc, char **argv) { } // Local Variables: -// compile-command: "cd ../.. ; make -j4 && make unittest_erasure_code_example && ./unittest_erasure_code_example --gtest_filter=*.* --log-to-stderr=true --debug-osd=20" +// compile-command: "cd ../.. ; make -j4 && make unittest_erasure_code_example && valgrind --leak-check=full --tool=memcheck ./unittest_erasure_code_example --gtest_filter=*.* --log-to-stderr=true --debug-osd=20" // End: |