1 files changed, 396 insertions, 0 deletions
diff --git a/db/repair.cc b/db/repair.cc
new file mode 100644
index 0000000..014e00e
--- /dev/null
+++ b/db/repair.cc
@@ -0,0 +1,396 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// We recover the contents of the descriptor from the other files we find.
+// (1) Any log files are first converted to tables
+// (2) We scan every table to compute
+//     (a) smallest/largest for the table
+//     (b) large value refs from the table
+//     (c) largest sequence number in the table
+// (3) We generate descriptor contents:
+//      - log number is set to zero
+//      - next-file-number is set to 1 + largest file number we found
+//      - last-sequence-number is set to largest sequence# found across
+//        all tables (see 2c)
+//      - compaction pointers are cleared
+//      - every table file is added at level 0
+//
+// Possible optimization 1:
+//   (a) Compute total size and use to pick appropriate max-level M
+//   (b) Sort tables by largest sequence# in the table
+//   (c) For each table: if it overlaps earlier table, place in level-0,
+//       else place in level-M.
+// Possible optimization 2:
+//   Store per-table metadata (smallest, largest, largest-seq#,
+//   large-value-refs, ...) in the table's meta section to speed up
+//   ScanTable.
+
+#include "db/builder.h"
+#include "db/db_impl.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "db/write_batch_internal.h"
+#include "leveldb/comparator.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+
+namespace leveldb {
+
+namespace {
+
+class Repairer {
+ public:
+  Repairer(const std::string& dbname, const Options& options)
+      : dbname_(dbname),
+        env_(options.env),
+        icmp_(options.comparator),
+        options_(SanitizeOptions(dbname, &icmp_, options)),
+        owns_info_log_(options_.info_log != options.info_log),
+        next_file_number_(1) {
+    // TableCache can be small since we expect each table to be opened once.
+    table_cache_ = new TableCache(dbname_, &options_, 10);
+  }
+
+  ~Repairer() {
+    delete table_cache_;
+    if (owns_info_log_) {
+      delete options_.info_log;
+    }
+  }
+
+  Status Run() {
+    Status status = FindFiles();
+    if (status.ok()) {
+      ConvertLogFilesToTables();
+      ExtractMetaData();
+      status = WriteDescriptor();
+    }
+    if (status.ok()) {
+      unsigned long long bytes = 0;
+      for (int i = 0; i < tables_.size(); i++) {
+        bytes += tables_[i].meta.file_size;
+      }
+      Log(env_, options_.info_log,
+          "**** Repaired leveldb %s; "
+          "recovered %d files; %llu bytes. "
+          "Some data may have been lost. "
+          "****",
+          dbname_.c_str(),
+          static_cast<int>(tables_.size()),
+          bytes);
+    }
+    return status;
+  }
+
+ private:
+  struct TableInfo {
+    FileMetaData meta;
+    SequenceNumber max_sequence;
+  };
+
+  std::string const dbname_;
+  Env* const env_;
+  InternalKeyComparator const icmp_;
+  Options const options_;
+  bool owns_info_log_;
+  TableCache* table_cache_;
+  VersionEdit edit_;
+
+  std::vector<std::string> manifests_;
+  std::vector<uint64_t> table_numbers_;
+  std::vector<uint64_t> logs_;
+  std::vector<TableInfo> tables_;
+  uint64_t next_file_number_;
+
+  Status FindFiles() {
+    std::vector<std::string> filenames;
+    Status status = env_->GetChildren(dbname_, &filenames);
+    if (!status.ok()) {
+      return status;
+    }
+    if (filenames.empty()) {
+      return Status::IOError(dbname_, "repair found no files");
+    }
+
+    uint64_t number;
+    LargeValueRef large_ref;
+    FileType type;
+    for (int i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &large_ref, &type)) {
+        if (type == kLargeValueFile) {
+          // Will be picked up when we process a Table that points to it
+        } else if (type == kDescriptorFile) {
+          manifests_.push_back(filenames[i]);
+        } else {
+          if (number + 1 > next_file_number_) {
+            next_file_number_ = number + 1;
+          }
+          if (type == kLogFile) {
+            logs_.push_back(number);
+          } else if (type == kTableFile) {
+            table_numbers_.push_back(number);
+          } else {
+            // Ignore other files
+          }
+        }
+      }
+    }
+    return status;
+  }
+
+  void ConvertLogFilesToTables() {
+    for (int i = 0; i < logs_.size(); i++) {
+      std::string logname = LogFileName(dbname_, logs_[i]);
+      Status status = ConvertLogToTable(logs_[i]);
+      if (!status.ok()) {
+        Log(env_, options_.info_log, "Log #%llu: ignoring conversion error: %s",
+            (unsigned long long) logs_[i],
+            status.ToString().c_str());
+      }
+      ArchiveFile(logname);
+    }
+  }
+
+  Status ConvertLogToTable(uint64_t log) {
+    struct LogReporter : public log::Reader::Reporter {
+      Env* env;
+      WritableFile* info_log;
+      uint64_t lognum;
+      virtual void Corruption(size_t bytes, const Status& s) {
+        // We print error messages for corruption, but continue repairing.
+        Log(env, info_log, "Log #%llu: dropping %d bytes; %s",
+            (unsigned long long) lognum,
+            static_cast<int>(bytes),
+            s.ToString().c_str());
+      }
+    };
+
+    // Open the log file
+    std::string logname = LogFileName(dbname_, log);
+    SequentialFile* lfile;
+    Status status = env_->NewSequentialFile(logname, &lfile);
+    if (!status.ok()) {
+      return status;
+    }
+
+    // Create the log reader.
+    LogReporter reporter;
+    reporter.env = env_;
+    reporter.info_log = options_.info_log;
+    reporter.lognum = log;
+    // We intentially make log::Reader do checksumming so that
+    // corruptions cause entire commits to be skipped instead of
+    // propagating bad information (like overly large sequence
+    // numbers).
+    log::Reader reader(lfile, &reporter, false/*do not checksum*/);
+
+    // Read all the records and add to a memtable
+    std::string scratch;
+    Slice record;
+    WriteBatch batch;
+    MemTable mem(icmp_);
+    int counter = 0;
+    while (reader.ReadRecord(&record, &scratch)) {
+      if (record.size() < 12) {
+        reporter.Corruption(
+            record.size(), Status::Corruption("log record too small"));
+        continue;
+      }
+      WriteBatchInternal::SetContents(&batch, record);
+      status = WriteBatchInternal::InsertInto(&batch, &mem);
+      if (status.ok()) {
+        counter += WriteBatchInternal::Count(&batch);
+      } else {
+        Log(env_, options_.info_log, "Log #%llu: ignoring %s",
+            (unsigned long long) log,
+            status.ToString().c_str());
+        status = Status::OK();  // Keep going with rest of file
+      }
+    }
+    delete lfile;
+
+    // We ignore any version edits generated by the conversion to a Table
+    // since ExtractMetaData() will also generate edits.
+    VersionEdit skipped;
+    FileMetaData meta;
+    meta.number = next_file_number_++;
+    Iterator* iter = mem.NewIterator();
+    status = BuildTable(dbname_, env_, options_, table_cache_, iter,
+                        &meta, &skipped);
+    delete iter;
+    if (status.ok()) {
+      if (meta.file_size > 0) {
+        table_numbers_.push_back(meta.number);
+      }
+    }
+    Log(env_, options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
+        (unsigned long long) log,
+        counter,
+        (unsigned long long) meta.number,
+        status.ToString().c_str());
+    return status;
+  }
+
+  void ExtractMetaData() {
+    std::vector<TableInfo> kept;
+    for (int i = 0; i < table_numbers_.size(); i++) {
+      TableInfo t;
+      t.meta.number = table_numbers_[i];
+      Status status = ScanTable(&t);
+      if (!status.ok()) {
+        std::string fname = TableFileName(dbname_, table_numbers_[i]);
+        Log(env_, options_.info_log, "Table #%llu: ignoring %s",
+            (unsigned long long) table_numbers_[i],
+            status.ToString().c_str());
+        ArchiveFile(fname);
+      } else {
+        tables_.push_back(t);
+      }
+    }
+  }
+
+  Status ScanTable(TableInfo* t) {
+    std::string fname = TableFileName(dbname_, t->meta.number);
+    int counter = 0;
+    Status status = env_->GetFileSize(fname, &t->meta.file_size);
+    if (status.ok()) {
+      Iterator* iter = table_cache_->NewIterator(
+          ReadOptions(), t->meta.number, t->meta.file_size);
+      bool empty = true;
+      ParsedInternalKey parsed;
+      t->max_sequence = 0;
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+        Slice key = iter->key();
+        if (!ParseInternalKey(key, &parsed)) {
+          Log(env_, options_.info_log, "Table #%llu: unparsable key %s",
+              (unsigned long long) t->meta.number,
+              EscapeString(key).c_str());
+          continue;
+        }
+
+        counter++;
+        if (empty) {
+          empty = false;
+          t->meta.smallest.DecodeFrom(key);
+        }
+        t->meta.largest.DecodeFrom(key);
+        if (parsed.sequence > t->max_sequence) {
+          t->max_sequence = parsed.sequence;
+        }
+
+        if (ExtractValueType(key) == kTypeLargeValueRef) {
+          if (iter->value().size() != LargeValueRef::ByteSize()) {
+            Log(env_, options_.info_log, "Table #%llu: bad large value ref",
+                (unsigned long long) t->meta.number);
+          } else {
+            edit_.AddLargeValueRef(LargeValueRef::FromRef(iter->value()),
+                                   t->meta.number,
+                                   key);
+          }
+        }
+      }
+      if (!iter->status().ok()) {
+        status = iter->status();
+      }
+      delete iter;
+    }
+    Log(env_, options_.info_log, "Table #%llu: %d entries %s",
+        (unsigned long long) t->meta.number,
+        counter,
+        status.ToString().c_str());
+    return status;
+  }
+
+  Status WriteDescriptor() {
+    std::string tmp = TempFileName(dbname_, 1);
+    WritableFile* file;
+    Status status = env_->NewWritableFile(tmp, &file);
+    if (!status.ok()) {
+      return status;
+    }
+
+    SequenceNumber max_sequence = 0;
+    for (int i = 0; i < tables_.size(); i++) {
+      if (max_sequence < tables_[i].max_sequence) {
+        max_sequence = tables_[i].max_sequence;
+      }
+    }
+
+    edit_.SetComparatorName(icmp_.user_comparator()->Name());
+    edit_.SetLogNumber(0);
+    edit_.SetNextFile(next_file_number_);
+    edit_.SetLastSequence(max_sequence);
+
+    for (int i = 0; i < tables_.size(); i++) {
+      // TODO(opt): separate out into multiple levels
+      const TableInfo& t = tables_[i];
+      edit_.AddFile(0, t.meta.number, t.meta.file_size,
+                    t.meta.smallest, t.meta.largest);
+    }
+
+    //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
+    {
+      log::Writer log(file);
+      std::string record;
+      edit_.EncodeTo(&record);
+      status = log.AddRecord(record);
+    }
+    if (status.ok()) {
+      status = file->Close();
+    }
+    delete file;
+    file = NULL;
+
+    if (!status.ok()) {
+      env_->DeleteFile(tmp);
+    } else {
+      // Discard older manifests
+      for (int i = 0; i < manifests_.size(); i++) {
+        ArchiveFile(dbname_ + "/" + manifests_[i]);
+      }
+
+      // Install new manifest
+      status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
+      if (status.ok()) {
+        status = SetCurrentFile(env_, dbname_, 1);
+      } else {
+        env_->DeleteFile(tmp);
+      }
+    }
+    return status;
+  }
+
+  void ArchiveFile(const std::string& fname) {
+    // Move into another directory.  E.g., for
+    //    dir/foo
+    // rename to
+    //    dir/lost/foo
+    const char* slash = strrchr(fname.c_str(), '/');
+    std::string new_dir;
+    if (slash != NULL) {
+      new_dir.assign(fname.data(), slash - fname.data());
+    }
+    new_dir.append("/lost");
+    env_->CreateDir(new_dir);  // Ignore error
+    std::string new_file = new_dir;
+    new_file.append("/");
+    new_file.append((slash == NULL) ? fname.c_str() : slash + 1);
+    Status s = env_->RenameFile(fname, new_file);
+    Log(env_, options_.info_log, "Archiving %s: %s\n",
+        fname.c_str(), s.ToString().c_str());
+  }
+};
+}
+
+Status RepairDB(const std::string& dbname, const Options& options) {
+  Repairer repairer(dbname, options);
+  return repairer.Run();
+}
+
+}