Merge pull request #603 from dachary/wip-erasure-code-example

erasure code example cleanup
author: Sage Weil <sage@inktank.com> 2013-09-23 10:26:48 -0700
committer: Sage Weil <sage@inktank.com> 2013-09-23 10:26:48 -0700
commit: 10f291838e906dfe3816a6df1883386ddbf8c357 (patch)
tree: 89288a03060e49cc4e86b0da125bd249310a77e1
parent: b683005c219e3d6da8fb241a684d348440ff8270 (diff)
parent: 89e8b8be2fbfea065defba39ee2d2a620b21a11f (diff)
download: ceph-10f291838e906dfe3816a6df1883386ddbf8c357.tar.gz
3 files changed, 175 insertions, 44 deletions
diff --git a/src/osd/ErasureCodeInterface.h b/src/osd/ErasureCodeInterface.h
index 5ce2842d562..656ee91987e 100644
--- a/src/osd/ErasureCodeInterface.h
+++ b/src/osd/ErasureCodeInterface.h
@@ -25,15 +25,15 @@
     are systematic (i.e. the data is not mangled and can be
     reconstructed by concatenating chunks ).
     
-    All methods returns **0** on success and a negative value on
+    All methods return **0** on success and a negative value on
     error. If the value returned on error is not explained in
     **ErasureCodeInterface**, the sources or the documentation of the
-    interface implementer must be read to figure out what it means. It
-    is recommended that each error code matches an *errno* value that
-    relates to the cause of the error.
+    interface implementer (i.e. the plugin ) must be read to figure
+    out what it means. It is recommended that each error code matches
+    an *errno* value that relates to the cause of the error.
 
     Assuming the interface implementer provides three data chunks ( K
-    = 3 ) and two coding chunks ( M = 2 ), a buffer can be encoded as
+    = 3 ) and two coding chunks ( M = 2 ), a buffer could be encoded as
     follows:
     
     ~~~~~~~~~~~~~~~~{.c}
@@ -50,16 +50,20 @@
     encoded[4]         // coding chunk 1
     ~~~~~~~~~~~~~~~~
 
-    If encoded[2] ( which contains **EF** ) is missing and accessing
-    encoded[3] ( the first coding chunk ) is more expensive than
-    accessing encoded[4] ( the second coding chunk ), the
-    **minimum_to_decode_with_cost** method can be called as follows:
+    The **minimum_to_decode_with_cost** method can be used to minimize
+    the cost of fetching the chunks necessary to retrieve a given
+    content. For instance, if encoded[2] (contained **EF**) is missing
+    and accessing encoded[3] (the first coding chunk) is more
+    expensive than accessing encoded[4] (the second coding chunk),
+    **minimum_to_decode_with_cost** is expected to chose the first
+    coding chunk.
 
     ~~~~~~~~~~~~~~~~{.c}
     set<int> want_to_read(2); // want the chunk containing "EF"
     map<int,int> available(
           0 => 1,  // data chunk 0 : available and costs 1
           1 => 1,  // data chunk 1 : available and costs 1
+                   // data chunk 2 : missing
           3 => 9,  // coding chunk 1 : available and costs 9
           4 => 1,  // coding chunk 2 : available and costs 1
     );
@@ -67,14 +71,14 @@
     minimum_to_decode_with_cost(want_to_read,
                                 available,
                                 &minimum);
-    minimum == set<int>(0, 1, 4);
+    minimum == set<int>(0, 1, 4); // NOT set<int>(0, 1, 3);
     ~~~~~~~~~~~~~~~~
     
     It sets **minimum** with three chunks to reconstruct the desired
     data chunk and will pick the second coding chunk ( 4 ) because it
     is less expensive ( 1 < 9 ) to retrieve than the first coding
     chunk ( 3 ). The caller is responsible for retrieving the chunks
-    and call **decode** to reconstruct the second data chunk content.
+    and call **decode** to reconstruct the second data chunk.
     
     ~~~~~~~~~~~~~~~~{.c}
     map<int,bufferlist> chunks;
@@ -85,6 +89,10 @@
     decoded[2] == "EF"
     ~~~~~~~~~~~~~~~~
 
+    The semantic of the cost value is defined by the caller and must
+    be known to the implementer. For instance, it may be more
+    expensive to retrieve two chunks with cost 1 + 9 = 10 than two
+    chunks with cost 6 + 6 = 12. 
  */ 
 
 #include <map>
@@ -113,7 +121,7 @@ namespace ceph {
      *
      * @param [in] want_to_read chunk indexes to be decoded
      * @param [in] available chunk indexes containing valid data
-     * @param [out] minimum chunk indexes to retrieve for decode
+     * @param [out] minimum chunk indexes to retrieve 
      * @return **0** on success or a negative errno on error.
      */
     virtual int minimum_to_decode(const set<int> &want_to_read,
@@ -124,8 +132,8 @@ namespace ceph {
      * Compute the smallest subset of **available** chunks that needs
      * to be retrieved in order to successfully decode
      * **want_to_read** chunks. If there are more than one possible
-     * subset, select the subset that contains the chunks with the
-     * lowest cost.
+     * subset, select the subset that minimizes the overall retrieval
+     * cost.
      *
      * The **available** parameter maps chunk indexes to their
      * retrieval cost. The higher the cost value, the more costly it
@@ -141,7 +149,7 @@ namespace ceph {
      * @param [in] want_to_read chunk indexes to be decoded
      * @param [in] available map chunk indexes containing valid data 
      *             to their retrieval cost
-     * @param [out] minimum chunk indexes to retrieve for decode
+     * @param [out] minimum chunk indexes to retrieve 
      * @return **0** on success or a negative errno on error.
      */
     virtual int minimum_to_decode_with_cost(const set<int> &want_to_read,
@@ -150,15 +158,31 @@ namespace ceph {
 
     /**
      * Encode the content of **in** and store the result in
-     * **encoded**. The **encoded** map contains at least all
-     * chunk indexes found in the **want_to_encode** set. 
+     * **encoded**. All buffers pointed to by **encoded** have the
+     * same size. The **encoded** map contains at least all chunk
+     * indexes found in the **want_to_encode** set.
      *
      * The **encoded** map is expected to be a pointer to an empty
      * map.
      *
+     * Assuming the **in** parameter is **length** bytes long, 
+     * the concatenation of the first **length** bytes of the
+     * **encoded** buffers is equal to the content of the **in**
+     * parameter.
+     *
      * The **encoded** map may contain more chunks than required by
      * **want_to_encode** and the caller is expected to permanently
-     * store all of them, not just the chunks from **want_to_encode**.
+     * store all of them, not just the chunks listed in
+     * **want_to_encode**.
+     *
+     * The **encoded** map may contain pointers to data stored in
+     * the **in** parameter. If the caller modifies the content of
+     * **in** after calling the encode method, it may have a side
+     * effect on the content of **encoded**. 
+     *
+     * The **encoded** map may contain pointers to buffers allocated
+     * by the encode method. They will be freed when **encoded** is
+     * freed. The allocation method is not specified.
      *
      * Returns 0 on success.
      *
@@ -172,24 +196,30 @@ namespace ceph {
                        map<int, bufferlist> *encoded) = 0;
 
     /**
-     * Decode the **chunks** and store at least **want_to_read** chunks
-     * in **decoded**. 
+     * Decode the **chunks** and store at least **want_to_read**
+     * chunks in **decoded**.
+     *
+     * The **decoded** map must be a pointer to an empty map.
      *
      * There must be enough **chunks** ( as returned by
      * **minimum_to_decode** or **minimum_to_decode_with_cost** ) to
-     * perform a successfull decoding of all chunks found in
+     * perform a successful decoding of all chunks listed in
      * **want_to_read**.
      *
-     * The **decoded** map is expected to be a pointer to an empty
-     * map.
+     * All buffers pointed by **in** must have the same size.
+     *
+     * On success, the **decoded** map may contain more chunks than
+     * required by **want_to_read** and they can safely be used by the
+     * caller.
      *
-     * The **decoded** map may contain more chunks than required by
-     * **want_to_read** and they can safely be used by the caller.
+     * If a chunk is listed in **want_to_read** and there is a
+     * corresponding **bufferlist** in **chunks**, it will be
+     * referenced in **decoded**. If not it will be reconstructed from
+     * the existing chunks. 
      *
-     * If a chunk is listed in **want_to_read** and there is
-     * corresponding **bufferlist** in **chunks**, it will be copied
-     * verbatim into **decoded**. If not it will be reconstructed from
-     * the existing chunks.
+     * Because **decoded** may contain pointers to data found in
+     * **chunks**, modifying the content of **chunks** after calling
+     * decode may have a side effect on the content of **decoded**.
      *
      * Returns 0 on success.
      *
diff --git a/src/test/osd/ErasureCodeExample.h b/src/test/osd/ErasureCodeExample.h
index 896e614c6b5..95d79feb923 100644
--- a/src/test/osd/ErasureCodeExample.h
+++ b/src/test/osd/ErasureCodeExample.h
@@ -19,12 +19,19 @@
 
 #include <unistd.h>
 #include <errno.h>
+#include <algorithm>
 #include <sstream>
 #include "osd/ErasureCodeInterface.h"
 
+#define FIRST_DATA_CHUNK 0
+#define SECOND_DATA_CHUNK 1
 #define DATA_CHUNKS 2u
+
+#define CODING_CHUNK 2
 #define CODING_CHUNKS 1u
 
+#define MINIMUM_TO_RECOVER 2u
+
 class ErasureCodeExample : public ErasureCodeInterface {
 public:
   useconds_t delay;
@@ -43,21 +50,43 @@ public:
   virtual int minimum_to_decode(const set<int> &want_to_read,
                                 const set<int> &available_chunks,
                                 set<int> *minimum) {
-    if (available_chunks.size() < DATA_CHUNKS)
+    if (includes(available_chunks.begin(), available_chunks.end(),
+		 want_to_read.begin(), want_to_read.end())) {
+      *minimum = want_to_read;
+      return 0;
+    } else if (available_chunks.size() >= MINIMUM_TO_RECOVER) {
+      *minimum = available_chunks;
+      return 0;
+    } else {
       return -EIO;
-    set<int>::iterator i;
-    unsigned j;
-    for (i = available_chunks.begin(), j = 0; j < DATA_CHUNKS; i++, j++)
-      minimum->insert(*i);
-    return 0;
+    }
   }
 
   virtual int minimum_to_decode_with_cost(const set<int> &want_to_read,
                                           const map<int, int> &available,
                                           set<int> *minimum) {
+    //
+    // If one chunk is more expensive to fetch than the others,
+    // recover it instead. For instance, if the cost reflects the
+    // time it takes for a chunk to be retrieved from a remote
+    // OSD and if CPU is cheap, it could make sense to recover
+    // instead of fetching the chunk.
+    //
+    map<int, int> c2c(available);
+    if (c2c.size() > DATA_CHUNKS) {
+      if (c2c[FIRST_DATA_CHUNK] > c2c[SECOND_DATA_CHUNK] &&
+	  c2c[FIRST_DATA_CHUNK] > c2c[CODING_CHUNK])
+	c2c.erase(FIRST_DATA_CHUNK);
+      else if(c2c[SECOND_DATA_CHUNK] > c2c[FIRST_DATA_CHUNK] &&
+	      c2c[SECOND_DATA_CHUNK] > c2c[CODING_CHUNK])
+	c2c.erase(SECOND_DATA_CHUNK);
+      else if(c2c[CODING_CHUNK] > c2c[FIRST_DATA_CHUNK] &&
+	      c2c[CODING_CHUNK] > c2c[SECOND_DATA_CHUNK])
+	c2c.erase(CODING_CHUNK);
+    }
     set <int> available_chunks;
-    for (map<int, int>::const_iterator i = available.begin();
-	 i != available.end();
+    for (map<int, int>::const_iterator i = c2c.begin();
+	 i != c2c.end();
 	 i++)
       available_chunks.insert(i->first);
     return minimum_to_decode(want_to_read, available_chunks, minimum);
@@ -66,16 +95,28 @@ public:
   virtual int encode(const set<int> &want_to_encode,
                      const bufferlist &in,
                      map<int, bufferlist> *encoded) {
+    //
+    // make sure all data chunks have the same length, allocating
+    // padding if necessary.
+    //
     unsigned chunk_length = ( in.length() / DATA_CHUNKS ) + 1;
     unsigned length = chunk_length * ( DATA_CHUNKS + CODING_CHUNKS );
     bufferlist out(in);
     bufferptr pad(length - in.length());
     pad.zero(0, DATA_CHUNKS);
     out.push_back(pad);
+    //
+    // compute the coding chunk with first chunk ^ second chunk
+    //
     char *p = out.c_str();
-    for (unsigned i = 0; i < chunk_length * DATA_CHUNKS; i++)
-      p[i + 2 * chunk_length] =
-        p[i + 0 * chunk_length] ^ p[i + 1 * chunk_length];
+    for (unsigned i = 0; i < chunk_length; i++)
+      p[i + CODING_CHUNK * chunk_length] =
+        p[i + FIRST_DATA_CHUNK * chunk_length] ^
+	p[i + SECOND_DATA_CHUNK * chunk_length];
+    //
+    // populate the bufferlist with bufferptr pointing
+    // to chunk boundaries
+    //
     const bufferptr ptr = out.buffers().front();
     for (set<int>::iterator j = want_to_encode.begin();
          j != want_to_encode.end();
@@ -89,14 +130,30 @@ public:
   virtual int decode(const set<int> &want_to_read,
                      const map<int, bufferlist> &chunks,
                      map<int, bufferlist> *decoded) {
-    
+    //
+    // All chunks have the same size
+    //
     unsigned chunk_length = (*chunks.begin()).second.length();
     for (set<int>::iterator i = want_to_read.begin();
          i != want_to_read.end();
          i++) {
-      if (chunks.find(*i) != chunks.end())
+      if (chunks.find(*i) != chunks.end()) {
+	//
+	// If the chunk is available, just copy the bufferptr pointer
+	// to the decoded argument.
+	//
         (*decoded)[*i] = chunks.find(*i)->second;
-      else {
+      } else if(chunks.size() != 2) {
+	//
+	// If a chunk is missing and there are not enough chunks
+	// to recover, abort.
+	//
+	return -ERANGE;
+      } else {
+	//
+	// No matter what the missing chunk is, XOR of the other
+	// two recovers it.
+	//
         bufferptr chunk(chunk_length);
         map<int, bufferlist>::const_iterator k = chunks.begin();
         const char *a = k->second.buffers().front().c_str();
diff --git a/src/test/osd/TestErasureCodeExample.cc b/src/test/osd/TestErasureCodeExample.cc
index 66f521d7863..6866dfdbb9f 100644
--- a/src/test/osd/TestErasureCodeExample.cc
+++ b/src/test/osd/TestErasureCodeExample.cc
@@ -65,10 +65,54 @@ TEST(ErasureCodeExample, minimum_to_decode)
     EXPECT_EQ(0, example.minimum_to_decode(want_to_read,
                                            available_chunks,
                                            &minimum));
+    EXPECT_EQ(1u, minimum.size());
+    EXPECT_EQ(1u, minimum.count(1));
+  }
+}
+
+TEST(ErasureCodeExample, minimum_to_decode_with_cost)
+{
+  map<std::string,std::string> parameters;
+  ErasureCodeExample example(parameters);
+  map<int,int> available;
+  set<int> want_to_read;
+  want_to_read.insert(1);
+  {
+    set<int> minimum;
+    EXPECT_EQ(-EIO, example.minimum_to_decode_with_cost(want_to_read,
+							available,
+							&minimum));
+  }
+  available[0] = 1;
+  available[2] = 1;
+  {
+    set<int> minimum;
+    EXPECT_EQ(0, example.minimum_to_decode_with_cost(want_to_read,
+						     available,
+						     &minimum));
     EXPECT_EQ(2u, minimum.size());
     EXPECT_EQ(1u, minimum.count(0));
+    EXPECT_EQ(1u, minimum.count(2));
+  }
+  {
+    set<int> minimum;
+    available[1] = 1;
+    EXPECT_EQ(0, example.minimum_to_decode_with_cost(want_to_read,
+						     available,
+						     &minimum));
+    EXPECT_EQ(1u, minimum.size());
     EXPECT_EQ(1u, minimum.count(1));
   }
+  {
+    set<int> minimum;
+    available[1] = 2;
+    EXPECT_EQ(0, example.minimum_to_decode_with_cost(want_to_read,
+						     available,
+						     &minimum));
+    EXPECT_EQ(2u, minimum.size());
+    EXPECT_EQ(1u, minimum.count(0));
+    EXPECT_EQ(1u, minimum.count(2));
+  }
 }
 
 TEST(ErasureCodeExample, encode_decode)
@@ -142,5 +186,5 @@ int main(int argc, char **argv) {
 }
 
 // Local Variables:
-// compile-command: "cd ../.. ; make -j4 && make unittest_erasure_code_example && ./unittest_erasure_code_example --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
+// compile-command: "cd ../.. ; make -j4 && make unittest_erasure_code_example && valgrind  --leak-check=full --tool=memcheck ./unittest_erasure_code_example --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
 // End:
author	Sage Weil <sage@inktank.com>	2013-09-23 10:26:48 -0700
committer	Sage Weil <sage@inktank.com>	2013-09-23 10:26:48 -0700
commit	10f291838e906dfe3816a6df1883386ddbf8c357 (patch)
tree	89288a03060e49cc4e86b0da125bd249310a77e1
parent	b683005c219e3d6da8fb241a684d348440ff8270 (diff)
parent	89e8b8be2fbfea065defba39ee2d2a620b21a11f (diff)
download	ceph-10f291838e906dfe3816a6df1883386ddbf8c357.tar.gz