diff options
author | Michael Cahill <michael.cahill@wiredtiger.com> | 2014-05-16 20:13:51 +1000 |
---|---|---|
committer | Michael Cahill <michael.cahill@wiredtiger.com> | 2014-05-16 20:13:51 +1000 |
commit | 379875bc04a606b95edc091b1033d2087d8be1f2 (patch) | |
tree | 9ae7c1d1e1b54877ff9e4ec3f13f6d7ee66ff634 /api | |
parent | b4c98612cbc759b6b1a33314f0f6ae39530c1701 (diff) | |
download | mongo-379875bc04a606b95edc091b1033d2087d8be1f2.tar.gz |
Implement the LevelDB API, backed by WiredTiger.
Diffstat (limited to 'api')
-rw-r--r-- | api/leveldb/Makefile.am | 18 | ||||
-rw-r--r-- | api/leveldb/db/dbformat.h | 223 | ||||
-rw-r--r-- | api/leveldb/db/skiplist.h | 379 | ||||
-rw-r--r-- | api/leveldb/db/write_batch.cc | 109 | ||||
-rw-r--r-- | api/leveldb/db/write_batch_internal.h | 39 | ||||
-rw-r--r-- | api/leveldb/leveldb_test.cc | 36 | ||||
-rw-r--r-- | api/leveldb/leveldb_wt.cc | 673 | ||||
-rw-r--r-- | api/leveldb/leveldb_wt.h | 23 | ||||
-rw-r--r-- | api/leveldb/port/port.h | 11 | ||||
-rw-r--r-- | api/leveldb/util/arena.h | 68 | ||||
-rw-r--r-- | api/leveldb/util/coding.cc | 194 | ||||
-rw-r--r-- | api/leveldb/util/coding.h | 104 | ||||
-rw-r--r-- | api/leveldb/util/comparator.cc | 80 | ||||
-rw-r--r-- | api/leveldb/util/env.cc | 96 | ||||
-rw-r--r-- | api/leveldb/util/env_posix.cc | 617 | ||||
-rw-r--r-- | api/leveldb/util/logging.cc | 80 | ||||
-rw-r--r-- | api/leveldb/util/logging.h | 47 | ||||
-rw-r--r-- | api/leveldb/util/options.cc | 29 | ||||
-rw-r--r-- | api/leveldb/util/posix_logger.h | 98 | ||||
-rw-r--r-- | api/leveldb/util/random.h | 72 | ||||
-rw-r--r-- | api/leveldb/util/status.cc | 74 |
21 files changed, 3070 insertions, 0 deletions
diff --git a/api/leveldb/Makefile.am b/api/leveldb/Makefile.am new file mode 100644 index 00000000000..740dfb16456 --- /dev/null +++ b/api/leveldb/Makefile.am @@ -0,0 +1,18 @@ +AM_CPPFLAGS = -I$(top_builddir) + +lib_LTLIBRARIES = libwiredtiger_leveldb.la +LDADD = $(lib_LTLIBRARIES) $(top_builddir)/libwiredtiger.la + +noinst_PROGRAMS = leveldb_test + +libwiredtiger_leveldb_la_LDFLAGS = -release @VERSION@ +libwiredtiger_leveldb_la_SOURCES = \ + leveldb_wt.cc \ + db/write_batch.cc \ + util/coding.cc util/comparator.cc util/env.cc util/env_posix.cc \ + util/logging.cc util/options.cc util/status.cc + +leveldb_test_SOURCES = leveldb_test.cc +#leveldb_test_LDADD = $(top_builddir)/libwiredtiger.la + +TESTS = $(noinst_PROGRAMS) diff --git a/api/leveldb/db/dbformat.h b/api/leveldb/db/dbformat.h new file mode 100644 index 00000000000..37c21ed95e7 --- /dev/null +++ b/api/leveldb/db/dbformat.h @@ -0,0 +1,223 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_FORMAT_H_ +#define STORAGE_LEVELDB_DB_FORMAT_H_ + +#include <stdio.h> +#include "leveldb_wt.h" +#include "util/coding.h" +#include "util/logging.h" + +namespace leveldb { + +// Grouping of constants. We may want to make some of these +// parameters set via options. +namespace config { +static const int kNumLevels = 7; + +// Level-0 compaction is started when we hit this many files. +static const int kL0_CompactionTrigger = 4; + +// Soft limit on number of level-0 files. We slow down writes at this point. +static const int kL0_SlowdownWritesTrigger = 8; + +// Maximum number of level-0 files. We stop writes at this point. +static const int kL0_StopWritesTrigger = 12; + +// Maximum level to which a new compacted memtable is pushed if it +// does not create overlap. We try to push to level 2 to avoid the +// relatively expensive level 0=>1 compactions and to avoid some +// expensive manifest file operations. We do not push all the way to +// the largest level since that can generate a lot of wasted disk +// space if the same key space is being repeatedly overwritten. +static const int kMaxMemCompactLevel = 2; + +} // namespace config + +class InternalKey; + +// Value types encoded as the last component of internal keys. +// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk +// data structures. +enum ValueType { + kTypeDeletion = 0x0, + kTypeValue = 0x1 +}; +// kValueTypeForSeek defines the ValueType that should be passed when +// constructing a ParsedInternalKey object for seeking to a particular +// sequence number (since we sort sequence numbers in decreasing order +// and the value type is embedded as the low 8 bits in the sequence +// number in internal keys, we need to use the highest-numbered +// ValueType, not the lowest). +static const ValueType kValueTypeForSeek = kTypeValue; + +typedef uint64_t SequenceNumber; + +// We leave eight bits empty at the bottom so a type and sequence# +// can be packed together into 64-bits. +static const SequenceNumber kMaxSequenceNumber = + ((0x1ull << 56) - 1); + +struct ParsedInternalKey { + Slice user_key; + SequenceNumber sequence; + ValueType type; + + ParsedInternalKey() { } // Intentionally left uninitialized (for speed) + ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t) + : user_key(u), sequence(seq), type(t) { } + std::string DebugString() const; +}; + +// Return the length of the encoding of "key". +inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) { + return key.user_key.size() + 8; +} + +// Append the serialization of "key" to *result. +extern void AppendInternalKey(std::string* result, + const ParsedInternalKey& key); + +// Attempt to parse an internal key from "internal_key". On success, +// stores the parsed data in "*result", and returns true. +// +// On error, returns false, leaves "*result" in an undefined state. +extern bool ParseInternalKey(const Slice& internal_key, + ParsedInternalKey* result); + +// Returns the user key portion of an internal key. +inline Slice ExtractUserKey(const Slice& internal_key) { + assert(internal_key.size() >= 8); + return Slice(internal_key.data(), internal_key.size() - 8); +} + +inline ValueType ExtractValueType(const Slice& internal_key) { + assert(internal_key.size() >= 8); + const size_t n = internal_key.size(); + uint64_t num = DecodeFixed64(internal_key.data() + n - 8); + unsigned char c = num & 0xff; + return static_cast<ValueType>(c); +} + +// A comparator for internal keys that uses a specified comparator for +// the user key portion and breaks ties by decreasing sequence number. +class InternalKeyComparator : public Comparator { + private: + const Comparator* user_comparator_; + public: + explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) { } + virtual const char* Name() const; + virtual int Compare(const Slice& a, const Slice& b) const; + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const; + virtual void FindShortSuccessor(std::string* key) const; + + const Comparator* user_comparator() const { return user_comparator_; } + + int Compare(const InternalKey& a, const InternalKey& b) const; +}; + +// Filter policy wrapper that converts from internal keys to user keys +class InternalFilterPolicy : public FilterPolicy { + private: + const FilterPolicy* const user_policy_; + public: + explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { } + virtual const char* Name() const; + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const; + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const; +}; + +// Modules in this directory should keep internal keys wrapped inside +// the following class instead of plain strings so that we do not +// incorrectly use string comparisons instead of an InternalKeyComparator. +class InternalKey { + private: + std::string rep_; + public: + InternalKey() { } // Leave rep_ as empty to indicate it is invalid + InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) { + AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t)); + } + + void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); } + Slice Encode() const { + assert(!rep_.empty()); + return rep_; + } + + Slice user_key() const { return ExtractUserKey(rep_); } + + void SetFrom(const ParsedInternalKey& p) { + rep_.clear(); + AppendInternalKey(&rep_, p); + } + + void Clear() { rep_.clear(); } + + std::string DebugString() const; +}; + +inline int InternalKeyComparator::Compare( + const InternalKey& a, const InternalKey& b) const { + return Compare(a.Encode(), b.Encode()); +} + +inline bool ParseInternalKey(const Slice& internal_key, + ParsedInternalKey* result) { + const size_t n = internal_key.size(); + if (n < 8) return false; + uint64_t num = DecodeFixed64(internal_key.data() + n - 8); + unsigned char c = num & 0xff; + result->sequence = num >> 8; + result->type = static_cast<ValueType>(c); + result->user_key = Slice(internal_key.data(), n - 8); + return (c <= static_cast<unsigned char>(kTypeValue)); +} + +// A helper class useful for DBImpl::Get() +class LookupKey { + public: + // Initialize *this for looking up user_key at a snapshot with + // the specified sequence number. + LookupKey(const Slice& user_key, SequenceNumber sequence); + + ~LookupKey(); + + // Return a key suitable for lookup in a MemTable. + Slice memtable_key() const { return Slice(start_, end_ - start_); } + + // Return an internal key (suitable for passing to an internal iterator) + Slice internal_key() const { return Slice(kstart_, end_ - kstart_); } + + // Return the user key + Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); } + + private: + // We construct a char array of the form: + // klength varint32 <-- start_ + // userkey char[klength] <-- kstart_ + // tag uint64 + // <-- end_ + // The array is a suitable MemTable key. + // The suffix starting with "userkey" can be used as an InternalKey. + const char* start_; + const char* kstart_; + const char* end_; + char space_[200]; // Avoid allocation for short keys + + // No copying allowed + LookupKey(const LookupKey&); + void operator=(const LookupKey&); +}; + +inline LookupKey::~LookupKey() { + if (start_ != space_) delete[] start_; +} + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_DB_FORMAT_H_ diff --git a/api/leveldb/db/skiplist.h b/api/leveldb/db/skiplist.h new file mode 100644 index 00000000000..af85be6d016 --- /dev/null +++ b/api/leveldb/db/skiplist.h @@ -0,0 +1,379 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Thread safety +// ------------- +// +// Writes require external synchronization, most likely a mutex. +// Reads require a guarantee that the SkipList will not be destroyed +// while the read is in progress. Apart from that, reads progress +// without any internal locking or synchronization. +// +// Invariants: +// +// (1) Allocated nodes are never deleted until the SkipList is +// destroyed. This is trivially guaranteed by the code since we +// never delete any skip list nodes. +// +// (2) The contents of a Node except for the next/prev pointers are +// immutable after the Node has been linked into the SkipList. +// Only Insert() modifies the list, and it is careful to initialize +// a node and use release-stores to publish the nodes in one or +// more lists. +// +// ... prev vs. next pointer ordering ... + +#include <assert.h> +#include <stdlib.h> +#include "port/port.h" +#include "util/arena.h" +#include "util/random.h" + +namespace leveldb { + +class Arena; + +template<typename Key, class Comparator> +class SkipList { + private: + struct Node; + + public: + // Create a new SkipList object that will use "cmp" for comparing keys, + // and will allocate memory using "*arena". Objects allocated in the arena + // must remain allocated for the lifetime of the skiplist object. + explicit SkipList(Comparator cmp, Arena* arena); + + // Insert key into the list. + // REQUIRES: nothing that compares equal to key is currently in the list. + void Insert(const Key& key); + + // Returns true iff an entry that compares equal to key is in the list. + bool Contains(const Key& key) const; + + // Iteration over the contents of a skip list + class Iterator { + public: + // Initialize an iterator over the specified list. + // The returned iterator is not valid. + explicit Iterator(const SkipList* list); + + // Returns true iff the iterator is positioned at a valid node. + bool Valid() const; + + // Returns the key at the current position. + // REQUIRES: Valid() + const Key& key() const; + + // Advances to the next position. + // REQUIRES: Valid() + void Next(); + + // Advances to the previous position. + // REQUIRES: Valid() + void Prev(); + + // Advance to the first entry with a key >= target + void Seek(const Key& target); + + // Position at the first entry in list. + // Final state of iterator is Valid() iff list is not empty. + void SeekToFirst(); + + // Position at the last entry in list. + // Final state of iterator is Valid() iff list is not empty. + void SeekToLast(); + + private: + const SkipList* list_; + Node* node_; + // Intentionally copyable + }; + + private: + enum { kMaxHeight = 12 }; + + // Immutable after construction + Comparator const compare_; + Arena* const arena_; // Arena used for allocations of nodes + + Node* const head_; + + // Modified only by Insert(). Read racily by readers, but stale + // values are ok. + port::AtomicPointer max_height_; // Height of the entire list + + inline int GetMaxHeight() const { + return static_cast<int>( + reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load())); + } + + // Read/written only by Insert(). + Random rnd_; + + Node* NewNode(const Key& key, int height); + int RandomHeight(); + bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); } + + // Return true if key is greater than the data stored in "n" + bool KeyIsAfterNode(const Key& key, Node* n) const; + + // Return the earliest node that comes at or after key. + // Return NULL if there is no such node. + // + // If prev is non-NULL, fills prev[level] with pointer to previous + // node at "level" for every level in [0..max_height_-1]. + Node* FindGreaterOrEqual(const Key& key, Node** prev) const; + + // Return the latest node with a key < key. + // Return head_ if there is no such node. + Node* FindLessThan(const Key& key) const; + + // Return the last node in the list. + // Return head_ if list is empty. + Node* FindLast() const; + + // No copying allowed + SkipList(const SkipList&); + void operator=(const SkipList&); +}; + +// Implementation details follow +template<typename Key, class Comparator> +struct SkipList<Key,Comparator>::Node { + explicit Node(const Key& k) : key(k) { } + + Key const key; + + // Accessors/mutators for links. Wrapped in methods so we can + // add the appropriate barriers as necessary. + Node* Next(int n) { + assert(n >= 0); + // Use an 'acquire load' so that we observe a fully initialized + // version of the returned Node. + return reinterpret_cast<Node*>(next_[n].Acquire_Load()); + } + void SetNext(int n, Node* x) { + assert(n >= 0); + // Use a 'release store' so that anybody who reads through this + // pointer observes a fully initialized version of the inserted node. + next_[n].Release_Store(x); + } + + // No-barrier variants that can be safely used in a few locations. + Node* NoBarrier_Next(int n) { + assert(n >= 0); + return reinterpret_cast<Node*>(next_[n].NoBarrier_Load()); + } + void NoBarrier_SetNext(int n, Node* x) { + assert(n >= 0); + next_[n].NoBarrier_Store(x); + } + + private: + // Array of length equal to the node height. next_[0] is lowest level link. + port::AtomicPointer next_[1]; +}; + +template<typename Key, class Comparator> +typename SkipList<Key,Comparator>::Node* +SkipList<Key,Comparator>::NewNode(const Key& key, int height) { + char* mem = arena_->AllocateAligned( + sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1)); + return new (mem) Node(key); +} + +template<typename Key, class Comparator> +inline SkipList<Key,Comparator>::Iterator::Iterator(const SkipList* list) { + list_ = list; + node_ = NULL; +} + +template<typename Key, class Comparator> +inline bool SkipList<Key,Comparator>::Iterator::Valid() const { + return node_ != NULL; +} + +template<typename Key, class Comparator> +inline const Key& SkipList<Key,Comparator>::Iterator::key() const { + assert(Valid()); + return node_->key; +} + +template<typename Key, class Comparator> +inline void SkipList<Key,Comparator>::Iterator::Next() { + assert(Valid()); + node_ = node_->Next(0); +} + +template<typename Key, class Comparator> +inline void SkipList<Key,Comparator>::Iterator::Prev() { + // Instead of using explicit "prev" links, we just search for the + // last node that falls before key. + assert(Valid()); + node_ = list_->FindLessThan(node_->key); + if (node_ == list_->head_) { + node_ = NULL; + } +} + +template<typename Key, class Comparator> +inline void SkipList<Key,Comparator>::Iterator::Seek(const Key& target) { + node_ = list_->FindGreaterOrEqual(target, NULL); +} + +template<typename Key, class Comparator> +inline void SkipList<Key,Comparator>::Iterator::SeekToFirst() { + node_ = list_->head_->Next(0); +} + +template<typename Key, class Comparator> +inline void SkipList<Key,Comparator>::Iterator::SeekToLast() { + node_ = list_->FindLast(); + if (node_ == list_->head_) { + node_ = NULL; + } +} + +template<typename Key, class Comparator> +int SkipList<Key,Comparator>::RandomHeight() { + // Increase height with probability 1 in kBranching + static const unsigned int kBranching = 4; + int height = 1; + while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) { + height++; + } + assert(height > 0); + assert(height <= kMaxHeight); + return height; +} + +template<typename Key, class Comparator> +bool SkipList<Key,Comparator>::KeyIsAfterNode(const Key& key, Node* n) const { + // NULL n is considered infinite + return (n != NULL) && (compare_(n->key, key) < 0); +} + +template<typename Key, class Comparator> +typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindGreaterOrEqual(const Key& key, Node** prev) + const { + Node* x = head_; + int level = GetMaxHeight() - 1; + while (true) { + Node* next = x->Next(level); + if (KeyIsAfterNode(key, next)) { + // Keep searching in this list + x = next; + } else { + if (prev != NULL) prev[level] = x; + if (level == 0) { + return next; + } else { + // Switch to next list + level--; + } + } + } +} + +template<typename Key, class Comparator> +typename SkipList<Key,Comparator>::Node* +SkipList<Key,Comparator>::FindLessThan(const Key& key) const { + Node* x = head_; + int level = GetMaxHeight() - 1; + while (true) { + assert(x == head_ || compare_(x->key, key) < 0); + Node* next = x->Next(level); + if (next == NULL || compare_(next->key, key) >= 0) { + if (level == 0) { + return x; + } else { + // Switch to next list + level--; + } + } else { + x = next; + } + } +} + +template<typename Key, class Comparator> +typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindLast() + const { + Node* x = head_; + int level = GetMaxHeight() - 1; + while (true) { + Node* next = x->Next(level); + if (next == NULL) { + if (level == 0) { + return x; + } else { + // Switch to next list + level--; + } + } else { + x = next; + } + } +} + +template<typename Key, class Comparator> +SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena) + : compare_(cmp), + arena_(arena), + head_(NewNode(0 /* any key will do */, kMaxHeight)), + max_height_(reinterpret_cast<void*>(1)), + rnd_(0xdeadbeef) { + for (int i = 0; i < kMaxHeight; i++) { + head_->SetNext(i, NULL); + } +} + +template<typename Key, class Comparator> +void SkipList<Key,Comparator>::Insert(const Key& key) { + // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual() + // here since Insert() is externally synchronized. + Node* prev[kMaxHeight]; + Node* x = FindGreaterOrEqual(key, prev); + + // Our data structure does not allow duplicate insertion + assert(x == NULL || !Equal(key, x->key)); + + int height = RandomHeight(); + if (height > GetMaxHeight()) { + for (int i = GetMaxHeight(); i < height; i++) { + prev[i] = head_; + } + //fprintf(stderr, "Change height from %d to %d\n", max_height_, height); + + // It is ok to mutate max_height_ without any synchronization + // with concurrent readers. A concurrent reader that observes + // the new value of max_height_ will see either the old value of + // new level pointers from head_ (NULL), or a new value set in + // the loop below. In the former case the reader will + // immediately drop to the next level since NULL sorts after all + // keys. In the latter case the reader will use the new node. + max_height_.NoBarrier_Store(reinterpret_cast<void*>(height)); + } + + x = NewNode(key, height); + for (int i = 0; i < height; i++) { + // NoBarrier_SetNext() suffices since we will add a barrier when + // we publish a pointer to "x" in prev[i]. + x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i)); + prev[i]->SetNext(i, x); + } +} + +template<typename Key, class Comparator> +bool SkipList<Key,Comparator>::Contains(const Key& key) const { + Node* x = FindGreaterOrEqual(key, NULL); + if (x != NULL && Equal(key, x->key)) { + return true; + } else { + return false; + } +} + +} // namespace leveldb diff --git a/api/leveldb/db/write_batch.cc b/api/leveldb/db/write_batch.cc new file mode 100644 index 00000000000..6a488a83e3e --- /dev/null +++ b/api/leveldb/db/write_batch.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// WriteBatch::rep_ := +// sequence: fixed64 +// count: fixed32 +// data: record[count] +// record := +// kTypeValue varstring varstring | +// kTypeDeletion varstring +// varstring := +// len: varint32 +// data: uint8[len] + +#include "leveldb_wt.h" +#include "db/write_batch_internal.h" + +namespace leveldb { + +// WriteBatch header has an 8-byte sequence number followed by a 4-byte count. +static const size_t kHeader = 12; + +WriteBatch::WriteBatch() { + Clear(); +} + +WriteBatch::~WriteBatch() { } + +WriteBatch::Handler::~Handler() { } + +void WriteBatch::Clear() { + rep_.clear(); + rep_.resize(kHeader); +} + +Status WriteBatch::Iterate(Handler* handler) const { + Slice input(rep_); + if (input.size() < kHeader) { + return Status::Corruption("malformed WriteBatch (too small)"); + } + + input.remove_prefix(kHeader); + Slice key, value; + int found = 0; + while (!input.empty()) { + found++; + char tag = input[0]; + input.remove_prefix(1); + switch (tag) { + case kTypeValue: + if (GetLengthPrefixedSlice(&input, &key) && + GetLengthPrefixedSlice(&input, &value)) { + handler->Put(key, value); + } else { + return Status::Corruption("bad WriteBatch Put"); + } + break; + case kTypeDeletion: + if (GetLengthPrefixedSlice(&input, &key)) { + handler->Delete(key); + } else { + return Status::Corruption("bad WriteBatch Delete"); + } + break; + default: + return Status::Corruption("unknown WriteBatch tag"); + } + } + if (found != WriteBatchInternal::Count(this)) { + return Status::Corruption("WriteBatch has wrong count"); + } else { + return Status::OK(); + } +} + +int WriteBatchInternal::Count(const WriteBatch* b) { + return DecodeFixed32(b->rep_.data() + 8); +} + +void WriteBatchInternal::SetCount(WriteBatch* b, int n) { + EncodeFixed32(&b->rep_[8], n); +} + +void WriteBatch::Put(const Slice& key, const Slice& value) { + WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); + rep_.push_back(static_cast<char>(kTypeValue)); + PutLengthPrefixedSlice(&rep_, key); + PutLengthPrefixedSlice(&rep_, value); +} + +void WriteBatch::Delete(const Slice& key) { + WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); + rep_.push_back(static_cast<char>(kTypeDeletion)); + PutLengthPrefixedSlice(&rep_, key); +} + +void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { + assert(contents.size() >= kHeader); + b->rep_.assign(contents.data(), contents.size()); +} + +void WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src) { + SetCount(dst, Count(dst) + Count(src)); + assert(src->rep_.size() >= kHeader); + dst->rep_.append(src->rep_.data() + kHeader, src->rep_.size() - kHeader); +} + +} // namespace leveldb diff --git a/api/leveldb/db/write_batch_internal.h b/api/leveldb/db/write_batch_internal.h new file mode 100644 index 00000000000..b4295004e13 --- /dev/null +++ b/api/leveldb/db/write_batch_internal.h @@ -0,0 +1,39 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ +#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ + +#include "leveldb_wt.h" +#include "db/dbformat.h" + +namespace leveldb { + +// WriteBatchInternal provides static methods for manipulating a +// WriteBatch that we don't want in the public WriteBatch interface. +class WriteBatchInternal { + public: + // Return the number of entries in the batch. + static int Count(const WriteBatch* batch); + + // Set the count for the number of entries in the batch. + static void SetCount(WriteBatch* batch, int n); + + static Slice Contents(const WriteBatch* batch) { + return Slice(batch->rep_); + } + + static size_t ByteSize(const WriteBatch* batch) { + return batch->rep_.size(); + } + + static void SetContents(WriteBatch* batch, const Slice& contents); + + static void Append(WriteBatch* dst, const WriteBatch* src); +}; + +} // namespace leveldb + + +#endif // STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ diff --git a/api/leveldb/leveldb_test.cc b/api/leveldb/leveldb_test.cc new file mode 100644 index 00000000000..d3b78b4eb4b --- /dev/null +++ b/api/leveldb/leveldb_test.cc @@ -0,0 +1,36 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ +#include <assert.h> +#include <iostream> +#include <hyperleveldb/db.h> + +using namespace std; + +extern "C" int main() { + leveldb::DB* db; + leveldb::Options options; + options.create_if_missing = true; + leveldb::Status s = leveldb::DB::Open(options, "WTLDB_HOME", &db); + assert(s.ok()); + + s = db->Put(leveldb::WriteOptions(), "key", "value"); + assert(s.ok()); + + leveldb::ReadOptions read_options; + read_options.snapshot = db->GetSnapshot(); + leveldb::Iterator* iter = db->NewIterator(read_options); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + cout << iter->key().ToString() << ": " << iter->value().ToString() << endl; + } + + delete iter; + db->ReleaseSnapshot(read_options.snapshot); + + delete db; + + return (0); +} diff --git a/api/leveldb/leveldb_wt.cc b/api/leveldb/leveldb_wt.cc new file mode 100644 index 00000000000..defa8b73556 --- /dev/null +++ b/api/leveldb/leveldb_wt.cc @@ -0,0 +1,673 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ +#include "leveldb_wt.h" + +using leveldb::FilterPolicy; +using leveldb::Iterator; +using leveldb::Options; +using leveldb::ReadOptions; +using leveldb::WriteBatch; +using leveldb::WriteOptions; +using leveldb::Range; +using leveldb::Slice; +using leveldb::Snapshot; +using leveldb::Status; +#ifndef WITHOUT_HYPERLEVELDB +namespace leveldb { +class ReplayIterator; +} +#else +using leveldb::Value; +#endif + +#define WT_URI "table:data" +#define WT_CONFIG "type=lsm,leaf_page_max=4KB,leaf_item_max=1KB" + +/* Destructors required for interfaces. */ +leveldb::DB::~DB() {} +Snapshot::~Snapshot() {} + +/* Iterators, from leveldb/table/iterator.cc */ +Iterator::Iterator() { + cleanup_.function = NULL; + cleanup_.next = NULL; +} + +Iterator::~Iterator() { + if (cleanup_.function != NULL) { + (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2); + for (Cleanup* c = cleanup_.next; c != NULL; ) { + (*c->function)(c->arg1, c->arg2); + Cleanup* next = c->next; + delete c; + c = next; + } + } +} + +void Iterator::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) { + assert(func != NULL); + Cleanup* c; + if (cleanup_.function == NULL) { + c = &cleanup_; + } else { + c = new Cleanup; + c->next = cleanup_.next; + cleanup_.next = c; + } + c->function = func; + c->arg1 = arg1; + c->arg2 = arg2; +} + +namespace { +class EmptyIterator : public Iterator { + public: + EmptyIterator(const Status& s) : status_(s) { } + virtual bool Valid() const { return false; } + virtual void Seek(const Slice& target) { } + virtual void SeekToFirst() { } + virtual void SeekToLast() { } + virtual void Next() { assert(false); } + virtual void Prev() { assert(false); } + Slice key() const { assert(false); return Slice(); } + Slice value() const { assert(false); return Slice(); } + virtual Status status() const { return status_; } + private: + Status status_; +}; +} // namespace + +Iterator* NewEmptyIterator() { + return new EmptyIterator(Status::OK()); +} + +Iterator* NewErrorIterator(const Status& status) { + return new EmptyIterator(status); +} + +namespace leveldb { +const FilterPolicy *NewBloomFilterPolicy(int bits_per_key) { + return 0; +} + +Cache *NewLRUCache(size_t capacity) { + return 0; +} + +Status DestroyDB(const std::string& name, const Options& options) { + fprintf(stderr, "DestroyDB %s", name.c_str()); + return Status::NotSupported("sorry!"); +} + +Status RepairDB(const std::string& dbname, const Options& options) { + return Status::NotSupported("sorry!"); +} +} + +/* POSIX thread-local storage */ +template <class T> +class ThreadLocal { +public: + static void cleanup(void *val) { + delete (T *)val; + } + + ThreadLocal() { + int ret = pthread_key_create(&key_, cleanup); + assert(ret == 0); + } + + ~ThreadLocal() { + int ret = pthread_key_delete(key_); + assert(ret == 0); + } + + T *get() { + return (T *)(pthread_getspecific(key_)); + } + + void set(T *value) { + int ret = pthread_setspecific(key_, value); + assert(ret == 0); + } + +private: + pthread_key_t key_; +}; + +/* WiredTiger implementations. */ +class DbImpl; + +/* Context for operations (including snapshots, write batches, transactions) */ +class OperationContext { +public: + OperationContext(WT_CONNECTION *conn) { + int ret = conn->open_session(conn, NULL, NULL, &session_); + assert(ret == 0); + ret = session_->open_cursor( + session_, WT_URI, NULL, NULL, &cursor_); + assert(ret == 0); + } + + ~OperationContext() { + int ret = session_->close(session_, NULL); + assert(ret == 0); + } + + WT_CURSOR *getCursor() { return cursor_; } + +private: + WT_SESSION *session_; + WT_CURSOR *cursor_; +}; + +class IteratorImpl : public Iterator { +public: + IteratorImpl(WT_CURSOR *cursor, DbImpl *db, const ReadOptions &options) : cursor_(cursor), valid_(false) {} + virtual ~IteratorImpl() {} + + // An iterator is either positioned at a key/value pair, or + // not valid. This method returns true iff the iterator is valid. + virtual bool Valid() const { return valid_; } + + virtual void SeekToFirst(); + + virtual void SeekToLast(); + + virtual void Seek(const Slice& target); + + virtual void Next(); + + virtual void Prev(); + + virtual Slice key() const { + return key_; + } + + virtual Slice value() const { + return value_; + } + + virtual Status status() const { + return status_; + } + +private: + WT_CURSOR *cursor_; + Slice key_, value_; + Status status_; + bool valid_; + + // No copying allowed + IteratorImpl(const IteratorImpl&); + void operator=(const IteratorImpl&); +}; + +class SnapshotImpl : public Snapshot { +public: + SnapshotImpl(DbImpl *db) : Snapshot() {} + virtual ~SnapshotImpl() {} +}; + +class DbImpl : public leveldb::DB { +public: + DbImpl(WT_CONNECTION *conn) : DB(), conn_(conn), context_(new ThreadLocal<OperationContext>) {} + virtual ~DbImpl() { + delete context_; + int ret = conn_->close(conn_, NULL); + assert(ret == 0); + } + + virtual Status Put(const WriteOptions& options, + const Slice& key, + const Slice& value); + + virtual Status Delete(const WriteOptions& options, const Slice& key); + + virtual Status Write(const WriteOptions& options, WriteBatch* updates); + + virtual Status Get(const ReadOptions& options, + const Slice& key, std::string* value); + +#ifndef WITHOUT_HYPERLEVELDB + virtual Status LiveBackup(const Slice& name) { return Status::NotSupported("sorry!"); } + virtual void GetReplayTimestamp(std::string* timestamp) {} + virtual void AllowGarbageCollectBeforeTimestamp(const std::string& timestamp) {} + virtual bool ValidateTimestamp(const std::string& timestamp) {} + virtual int CompareTimestamps(const std::string& lhs, const std::string& rhs) {} + virtual Status GetReplayIterator(const std::string& timestamp, + leveldb::ReplayIterator** iter) { return Status::NotSupported("sorry!"); } + virtual void ReleaseReplayIterator(leveldb::ReplayIterator* iter) {} +#else + virtual Status Get(const ReadOptions& options, + const Slice& key, Value* value); +#endif + + virtual Iterator* NewIterator(const ReadOptions& options); + + virtual const Snapshot* GetSnapshot(); + + virtual void ReleaseSnapshot(const Snapshot* snapshot); + + virtual bool GetProperty(const Slice& property, std::string* value); + + virtual void GetApproximateSizes(const Range* range, int n, + uint64_t* sizes); + + virtual void CompactRange(const Slice* begin, const Slice* end); + + virtual void SuspendCompactions(); + + virtual void ResumeCompactions(); + +private: + WT_CONNECTION *conn_; + ThreadLocal<OperationContext> *context_; + + OperationContext *getContext() { + OperationContext *ctx = context_->get(); + if (ctx == NULL) { + ctx = new OperationContext(conn_); + context_->set(ctx); + } + return (ctx); + } + + WT_CURSOR *getCursor() { return getContext()->getCursor(); } + + // No copying allowed + DbImpl(const DbImpl&); + void operator=(const DbImpl&); +}; + +Status +leveldb::DB::Open(const Options &options, const std::string &name, leveldb::DB **dbptr) +{ + WT_CONNECTION *conn; + int ret = ::wiredtiger_open(name.c_str(), NULL, "create", &conn); + assert(ret == 0); + + WT_SESSION *session; + ret = conn->open_session(conn, NULL, NULL, &session); + assert(ret == 0); + ret = session->create(session, WT_URI, WT_CONFIG); + assert(ret == 0); + ret = session->close(session, NULL); + assert(ret == 0); + + *dbptr = new DbImpl(conn); + return Status::OK(); +} + +// Set the database entry for "key" to "value". Returns OK on success, +// and a non-OK status on error. +// Note: consider setting options.sync = true. +Status +DbImpl::Put(const WriteOptions& options, + const Slice& key, const Slice& value) +{ + WT_CURSOR *cursor = getCursor(); + WT_ITEM item; + + item.data = key.data(); + item.size = key.size(); + cursor->set_key(cursor, &item); + item.data = value.data(); + item.size = value.size(); + cursor->set_value(cursor, &item); + int ret = cursor->insert(cursor); + assert(ret == 0); + return Status::OK(); +} + +// Remove the database entry (if any) for "key". Returns OK on +// success, and a non-OK status on error. It is not an error if "key" +// did not exist in the database. +// Note: consider setting options.sync = true. +Status +DbImpl::Delete(const WriteOptions& options, const Slice& key) +{ + WT_CURSOR *cursor = getCursor(); + WT_ITEM item; + + item.data = key.data(); + item.size = key.size(); + cursor->set_key(cursor, &item); + int ret = cursor->remove(cursor); + assert(ret == 0); + return Status::OK(); +} + +// Implement WriteBatch::Handler +class WriteBatchHandler : public WriteBatch::Handler { +public: + WriteBatchHandler(WT_CURSOR *cursor) : cursor_(cursor), status_(0) {} + virtual ~WriteBatchHandler() {} + int getStatus() { return status_; } + + virtual void Put(const Slice& key, const Slice& value) { + WT_ITEM item; + + item.data = key.data(); + item.size = key.size(); + cursor_->set_key(cursor_, &item); + item.data = value.data(); + item.size = value.size(); + cursor_->set_value(cursor_, &item); + int ret = cursor_->insert(cursor_); + if (ret != 0 && status_ == 0) + status_ = ret; + } + virtual void Delete(const Slice& key) { + WT_ITEM item; + + item.data = key.data(); + item.size = key.size(); + cursor_->set_key(cursor_, &item); + int ret = cursor_->remove(cursor_); + if (ret != 0 && status_ == 0) + status_ = ret; + } + +private: + WT_CURSOR *cursor_; + int status_; +}; + +// Apply the specified updates to the database. +// Returns OK on success, non-OK on failure. +// Note: consider setting options.sync = true. +Status +DbImpl::Write(const WriteOptions& options, WriteBatch* updates) +{ + WT_CURSOR *cursor = getCursor(); + WriteBatchHandler handler(cursor); + Status status = updates->Iterate(&handler); + assert(handler.getStatus() == 0); + return status; +} + +// If the database contains an entry for "key" store the +// corresponding value in *value and return OK. +// +// If there is no entry for "key" leave *value unchanged and return +// a status for which Status::IsNotFound() returns true. +// +// May return some other Status on an error. +Status +DbImpl::Get(const ReadOptions& options, + const Slice& key, std::string* value) +{ + WT_CURSOR *cursor = getCursor(); + WT_ITEM item; + + item.data = key.data(); + item.size = key.size(); + cursor->set_key(cursor, &item); + int ret = cursor->search(cursor); + if (ret == WT_NOTFOUND) + return Status::NotFound("DB::Get key not found"); + assert(ret == 0); + ret = cursor->get_value(cursor, &item); + assert(ret == 0); + *value = std::string((const char *)item.data, item.size); + return Status::OK(); +} + +#ifdef WITHOUT_HYPERLEVELDB +Status +DbImpl::Get(const ReadOptions& options, + const Slice& key, Value* value) +{ + WT_CURSOR *cursor = getCursor(); + WT_ITEM item; + + item.data = key.data(); + item.size = key.size(); + cursor->set_key(cursor, &item); + int ret = cursor->search(cursor); + if (ret == WT_NOTFOUND) + return Status::NotFound("DB::Get key not found"); + assert(ret == 0); + ret = cursor->get_value(cursor, &item); + assert(ret == 0); + value->assign((const char *)item.data, item.size); + return Status::OK(); +} +#endif + +// Return a heap-allocated iterator over the contents of the database. +// The result of NewIterator() is initially invalid (caller must +// call one of the Seek methods on the iterator before using it). +// +// Caller should delete the iterator when it is no longer needed. +// The returned iterator should be deleted before this db is deleted. +Iterator * +DbImpl::NewIterator(const ReadOptions& options) +{ + return new IteratorImpl(getCursor(), this, options); +} + +// Return a handle to the current DB state. Iterators created with +// this handle will all observe a stable snapshot of the current DB +// state. The caller must call ReleaseSnapshot(result) when the +// snapshot is no longer needed. +const Snapshot * +DbImpl::GetSnapshot() +{ + return new SnapshotImpl(this); +} + +// Release a previously acquired snapshot. The caller must not +// use "snapshot" after this call. +void +DbImpl::ReleaseSnapshot(const Snapshot* snapshot) +{ + delete (SnapshotImpl *)snapshot; +} + +// DB implementations can export properties about their state +// via this method. If "property" is a valid property understood by this +// DB implementation, fills "*value" with its current value and returns +// true. Otherwise returns false. +// +// +// Valid property names include: +// +// "leveldb.num-files-at-level<N>" - return the number of files at level <N>, +// where <N> is an ASCII representation of a level number (e.g. "0"). +// "leveldb.stats" - returns a multi-line string that describes statistics +// about the internal operation of the DB. +// "leveldb.sstables" - returns a multi-line string that describes all +// of the sstables that make up the db contents. +bool +DbImpl::GetProperty(const Slice& property, std::string* value) +{ + /* Not supported */ + return false; +} + +// For each i in [0,n-1], store in "sizes[i]", the approximate +// file system space used by keys in "[range[i].start .. range[i].limit)". +// +// Note that the returned sizes measure file system space usage, so +// if the user data compresses by a factor of ten, the returned +// sizes will be one-tenth the size of the corresponding user data size. +// +// The results may not include the sizes of recently written data. +void +DbImpl::GetApproximateSizes(const Range* range, int n, + uint64_t* sizes) +{ + int i; + + /* XXX Not supported */ + for (i = 0; i < n; i++) + sizes[i] = 1; +} + +// Compact the underlying storage for the key range [*begin,*end]. +// In particular, deleted and overwritten versions are discarded, +// and the data is rearranged to reduce the cost of operations +// needed to access the data. This operation should typically only +// be invoked by users who understand the underlying implementation. +// +// begin==NULL is treated as a key before all keys in the database. +// end==NULL is treated as a key after all keys in the database. +// Therefore the following call will compact the entire database: +// db->CompactRange(NULL, NULL); +void +DbImpl::CompactRange(const Slice* begin, const Slice* end) +{ + /* Not supported */ + assert(0); +} + +// Suspends the background compaction thread. This methods +// returns once suspended. +void +DbImpl::SuspendCompactions() +{ + /* Not supported */ + assert(0); +} + +// Resumes a suspended background compation thread. +void +DbImpl::ResumeCompactions() +{ + /* Not supported */ + assert(0); +} + +// Position at the first key in the source. The iterator is Valid() +// after this call iff the source is not empty. +void +IteratorImpl::SeekToFirst() +{ + WT_ITEM item; + + int ret = cursor_->reset(cursor_); + assert(ret == 0); + ret = cursor_->next(cursor_); + if (ret != 0) { + valid_ = false; + return; + } + ret = cursor_->get_key(cursor_, &item); + assert(ret == 0); + key_ = Slice((const char *)item.data, item.size); + ret = cursor_->get_value(cursor_, &item); + assert(ret == 0); + value_ = Slice((const char *)item.data, item.size); + valid_ = true; +} + +// Position at the last key in the source. The iterator is +// Valid() after this call iff the source is not empty. +void +IteratorImpl::SeekToLast() +{ + WT_ITEM item; + + int ret = cursor_->reset(cursor_); + assert(ret == 0); + ret = cursor_->prev(cursor_); + if (ret != 0) { + valid_ = false; + return; + } + ret = cursor_->get_key(cursor_, &item); + assert(ret == 0); + key_ = Slice((const char *)item.data, item.size); + ret = cursor_->get_value(cursor_, &item); + assert(ret == 0); + value_ = Slice((const char *)item.data, item.size); + valid_ = true; +} + +// Position at the first key in the source that at or past target +// The iterator is Valid() after this call iff the source contains +// an entry that comes at or past target. +void +IteratorImpl::Seek(const Slice& target) +{ + WT_ITEM item; + + item.data = target.data(); + item.size = target.size(); + cursor_->set_key(cursor_, &item); + int cmp, ret = cursor_->search_near(cursor_, &cmp); + if (ret == 0 && cmp < 0) + ret = cursor_->next(cursor_); + if (ret == WT_NOTFOUND) + status_ = Status::NotFound("Iterator::Seek key not found"); + if (ret != 0) { + valid_ = false; + return; + } + ret = cursor_->get_key(cursor_, &item); + assert(ret == 0); + key_ = Slice((const char *)item.data, item.size); + ret = cursor_->get_value(cursor_, &item); + assert(ret == 0); + value_ = Slice((const char *)item.data, item.size); + valid_ = true; +} + +// Moves to the next entry in the source. After this call, Valid() is +// true iff the iterator was not positioned at the last entry in the source. +// REQUIRES: Valid() +void +IteratorImpl::Next() +{ + WT_ITEM item; + + assert(valid_); + + int ret = cursor_->next(cursor_); + if (ret == WT_NOTFOUND) + status_ = Status::NotFound("Iterator::Next no more records"); + if (ret != 0) { + valid_ = false; + return; + } + ret = cursor_->get_key(cursor_, &item); + assert(ret == 0); + key_ = Slice((const char *)item.data, item.size); + ret = cursor_->get_value(cursor_, &item); + assert(ret == 0); + value_ = Slice((const char *)item.data, item.size); + valid_ = true; +} + +// Moves to the previous entry in the source. After this call, Valid() is +// true iff the iterator was not positioned at the first entry in source. +// REQUIRES: Valid() +void +IteratorImpl::Prev() +{ + WT_ITEM item; + + assert(valid_); + + int ret = cursor_->prev(cursor_); + if (ret == WT_NOTFOUND) + status_ = Status::NotFound("Iterator::Prev no more records"); + if (ret != 0) { + valid_ = false; + return; + } + ret = cursor_->get_key(cursor_, &item); + assert(ret == 0); + key_ = Slice((const char *)item.data, item.size); + ret = cursor_->get_value(cursor_, &item); + assert(ret == 0); + value_ = Slice((const char *)item.data, item.size); + valid_ = true; +} diff --git a/api/leveldb/leveldb_wt.h b/api/leveldb/leveldb_wt.h new file mode 100644 index 00000000000..0c46456a9ef --- /dev/null +++ b/api/leveldb/leveldb_wt.h @@ -0,0 +1,23 @@ +#ifndef WITHOUT_HYPERLEVELDB +#include <hyperleveldb/comparator.h> +#include <hyperleveldb/db.h> +#include <hyperleveldb/env.h> +#include <hyperleveldb/filter_policy.h> +#include <hyperleveldb/slice.h> +#include <hyperleveldb/slice.h> +#include <hyperleveldb/status.h> +#include <hyperleveldb/table_builder.h> +#include <hyperleveldb/write_batch.h> +#else +#include <leveldb/comparator.h> +#include <leveldb/db.h> +#include <leveldb/env.h> +#include <leveldb/filter_policy.h> +#include <leveldb/slice.h> +#include <leveldb/slice.h> +#include <leveldb/status.h> +#include <leveldb/table_builder.h> +#include <leveldb/write_batch.h> +#endif + +#include "wiredtiger.h" diff --git a/api/leveldb/port/port.h b/api/leveldb/port/port.h new file mode 100644 index 00000000000..b6f551182da --- /dev/null +++ b/api/leveldb/port/port.h @@ -0,0 +1,11 @@ +#ifndef _PORT_H_ +#define _PORT_H_ 1 +/* Stub portability header for imported LevelDB code. */ + +#include "wiredtiger.h" + +namespace port { + const int kLittleEndian = 1; +} + +#endif diff --git a/api/leveldb/util/arena.h b/api/leveldb/util/arena.h new file mode 100644 index 00000000000..8f7dde226c4 --- /dev/null +++ b/api/leveldb/util/arena.h @@ -0,0 +1,68 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_UTIL_ARENA_H_ +#define STORAGE_LEVELDB_UTIL_ARENA_H_ + +#include <cstddef> +#include <vector> +#include <assert.h> +#include <stdint.h> + +namespace leveldb { + +class Arena { + public: + Arena(); + ~Arena(); + + // Return a pointer to a newly allocated memory block of "bytes" bytes. + char* Allocate(size_t bytes); + + // Allocate memory with the normal alignment guarantees provided by malloc + char* AllocateAligned(size_t bytes); + + // Returns an estimate of the total memory usage of data allocated + // by the arena (including space allocated but not yet used for user + // allocations). + size_t MemoryUsage() const { + return blocks_memory_ + blocks_.capacity() * sizeof(char*); + } + + private: + char* AllocateFallback(size_t bytes); + char* AllocateNewBlock(size_t block_bytes); + + // Allocation state + char* alloc_ptr_; + size_t alloc_bytes_remaining_; + + // Array of new[] allocated memory blocks + std::vector<char*> blocks_; + + // Bytes of memory in blocks allocated so far + size_t blocks_memory_; + + // No copying allowed + Arena(const Arena&); + void operator=(const Arena&); +}; + +inline char* Arena::Allocate(size_t bytes) { + // The semantics of what to return are a bit messy if we allow + // 0-byte allocations, so we disallow them here (we don't need + // them for our internal use). + assert(bytes > 0); + if (bytes <= alloc_bytes_remaining_) { + char* result = alloc_ptr_; + alloc_ptr_ += bytes; + alloc_bytes_remaining_ -= bytes; + return result; + } + return AllocateFallback(bytes); +} + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_UTIL_ARENA_H_ diff --git a/api/leveldb/util/coding.cc b/api/leveldb/util/coding.cc new file mode 100644 index 00000000000..dbd7a6545c6 --- /dev/null +++ b/api/leveldb/util/coding.cc @@ -0,0 +1,194 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/coding.h" + +namespace leveldb { + +void EncodeFixed32(char* buf, uint32_t value) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; +#endif +} + +void EncodeFixed64(char* buf, uint64_t value) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; + buf[4] = (value >> 32) & 0xff; + buf[5] = (value >> 40) & 0xff; + buf[6] = (value >> 48) & 0xff; + buf[7] = (value >> 56) & 0xff; +#endif +} + +void PutFixed32(std::string* dst, uint32_t value) { + char buf[sizeof(value)]; + EncodeFixed32(buf, value); + dst->append(buf, sizeof(buf)); +} + +void PutFixed64(std::string* dst, uint64_t value) { + char buf[sizeof(value)]; + EncodeFixed64(buf, value); + dst->append(buf, sizeof(buf)); +} + +char* EncodeVarint32(char* dst, uint32_t v) { + // Operate on characters as unsigneds + unsigned char* ptr = reinterpret_cast<unsigned char*>(dst); + static const int B = 128; + if (v < (1<<7)) { + *(ptr++) = v; + } else if (v < (1<<14)) { + *(ptr++) = v | B; + *(ptr++) = v>>7; + } else if (v < (1<<21)) { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = v>>14; + } else if (v < (1<<28)) { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = (v>>14) | B; + *(ptr++) = v>>21; + } else { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = (v>>14) | B; + *(ptr++) = (v>>21) | B; + *(ptr++) = v>>28; + } + return reinterpret_cast<char*>(ptr); +} + +void PutVarint32(std::string* dst, uint32_t v) { + char buf[5]; + char* ptr = EncodeVarint32(buf, v); + dst->append(buf, ptr - buf); +} + +char* EncodeVarint64(char* dst, uint64_t v) { + static const int B = 128; + unsigned char* ptr = reinterpret_cast<unsigned char*>(dst); + while (v >= B) { + *(ptr++) = (v & (B-1)) | B; + v >>= 7; + } + *(ptr++) = static_cast<unsigned char>(v); + return reinterpret_cast<char*>(ptr); +} + +void PutVarint64(std::string* dst, uint64_t v) { + char buf[10]; + char* ptr = EncodeVarint64(buf, v); + dst->append(buf, ptr - buf); +} + +void PutLengthPrefixedSlice(std::string* dst, const Slice& value) { + PutVarint32(dst, value.size()); + dst->append(value.data(), value.size()); +} + +int VarintLength(uint64_t v) { + int len = 1; + while (v >= 128) { + v >>= 7; + len++; + } + return len; +} + +const char* GetVarint32PtrFallback(const char* p, + const char* limit, + uint32_t* value) { + uint32_t result = 0; + for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) { + uint32_t byte = *(reinterpret_cast<const unsigned char*>(p)); + p++; + if (byte & 128) { + // More bytes are present + result |= ((byte & 127) << shift); + } else { + result |= (byte << shift); + *value = result; + return reinterpret_cast<const char*>(p); + } + } + return NULL; +} + +bool GetVarint32(Slice* input, uint32_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint32Ptr(p, limit, value); + if (q == NULL) { + return false; + } else { + *input = Slice(q, limit - q); + return true; + } +} + +const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) { + uint64_t result = 0; + for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) { + uint64_t byte = *(reinterpret_cast<const unsigned char*>(p)); + p++; + if (byte & 128) { + // More bytes are present + result |= ((byte & 127) << shift); + } else { + result |= (byte << shift); + *value = result; + return reinterpret_cast<const char*>(p); + } + } + return NULL; +} + +bool GetVarint64(Slice* input, uint64_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint64Ptr(p, limit, value); + if (q == NULL) { + return false; + } else { + *input = Slice(q, limit - q); + return true; + } +} + +const char* GetLengthPrefixedSlice(const char* p, const char* limit, + Slice* result) { + uint32_t len; + p = GetVarint32Ptr(p, limit, &len); + if (p == NULL) return NULL; + if (p + len > limit) return NULL; + *result = Slice(p, len); + return p + len; +} + +bool GetLengthPrefixedSlice(Slice* input, Slice* result) { + uint32_t len; + if (GetVarint32(input, &len) && + input->size() >= len) { + *result = Slice(input->data(), len); + input->remove_prefix(len); + return true; + } else { + return false; + } +} + +} // namespace leveldb diff --git a/api/leveldb/util/coding.h b/api/leveldb/util/coding.h new file mode 100644 index 00000000000..cba97ec421a --- /dev/null +++ b/api/leveldb/util/coding.h @@ -0,0 +1,104 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Endian-neutral encoding: +// * Fixed-length numbers are encoded with least-significant byte first +// * In addition we support variable length "varint" encoding +// * Strings are encoded prefixed by their length in varint format + +#ifndef STORAGE_LEVELDB_UTIL_CODING_H_ +#define STORAGE_LEVELDB_UTIL_CODING_H_ + +#include <stdint.h> +#include <string.h> +#include <string> +#include "leveldb_wt.h" +#include "port/port.h" + +namespace leveldb { + +// Standard Put... routines append to a string +extern void PutFixed32(std::string* dst, uint32_t value); +extern void PutFixed64(std::string* dst, uint64_t value); +extern void PutVarint32(std::string* dst, uint32_t value); +extern void PutVarint64(std::string* dst, uint64_t value); +extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value); + +// Standard Get... routines parse a value from the beginning of a Slice +// and advance the slice past the parsed value. +extern bool GetVarint32(Slice* input, uint32_t* value); +extern bool GetVarint64(Slice* input, uint64_t* value); +extern bool GetLengthPrefixedSlice(Slice* input, Slice* result); + +// Pointer-based variants of GetVarint... These either store a value +// in *v and return a pointer just past the parsed value, or return +// NULL on error. These routines only look at bytes in the range +// [p..limit-1] +extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v); +extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v); + +// Returns the length of the varint32 or varint64 encoding of "v" +extern int VarintLength(uint64_t v); + +// Lower-level versions of Put... that write directly into a character buffer +// REQUIRES: dst has enough space for the value being written +extern void EncodeFixed32(char* dst, uint32_t value); +extern void EncodeFixed64(char* dst, uint64_t value); + +// Lower-level versions of Put... that write directly into a character buffer +// and return a pointer just past the last byte written. +// REQUIRES: dst has enough space for the value being written +extern char* EncodeVarint32(char* dst, uint32_t value); +extern char* EncodeVarint64(char* dst, uint64_t value); + +// Lower-level versions of Get... that read directly from a character buffer +// without any bounds checking. + +inline uint32_t DecodeFixed32(const char* ptr) { + if (port::kLittleEndian) { + // Load the raw bytes + uint32_t result; + memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + return result; + } else { + return ((static_cast<uint32_t>(static_cast<unsigned char>(ptr[0]))) + | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[1])) << 8) + | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[2])) << 16) + | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[3])) << 24)); + } +} + +inline uint64_t DecodeFixed64(const char* ptr) { + if (port::kLittleEndian) { + // Load the raw bytes + uint64_t result; + memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + return result; + } else { + uint64_t lo = DecodeFixed32(ptr); + uint64_t hi = DecodeFixed32(ptr + 4); + return (hi << 32) | lo; + } +} + +// Internal routine for use by fallback path of GetVarint32Ptr +extern const char* GetVarint32PtrFallback(const char* p, + const char* limit, + uint32_t* value); +inline const char* GetVarint32Ptr(const char* p, + const char* limit, + uint32_t* value) { + if (p < limit) { + uint32_t result = *(reinterpret_cast<const unsigned char*>(p)); + if ((result & 128) == 0) { + *value = result; + return p + 1; + } + } + return GetVarint32PtrFallback(p, limit, value); +} + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_UTIL_CODING_H_ diff --git a/api/leveldb/util/comparator.cc b/api/leveldb/util/comparator.cc new file mode 100644 index 00000000000..c6c9e17efee --- /dev/null +++ b/api/leveldb/util/comparator.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include <algorithm> +#include <stdint.h> +#include "leveldb_wt.h" +#include "port/port.h" +#include "util/logging.h" + +namespace leveldb { + +Comparator::~Comparator() { } + +#ifndef WITHOUT_HYPERLEVELDB +uint64_t Comparator::KeyNum(const Slice& key) const { + return 0; +} +#endif + +namespace { +class BytewiseComparatorImpl : public Comparator { + public: + BytewiseComparatorImpl() { } + + virtual const char* Name() const { + return "leveldb.BytewiseComparator"; + } + + virtual int Compare(const Slice& a, const Slice& b) const { + return a.compare(b); + } + + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const { + // Find length of common prefix + size_t min_length = std::min(start->size(), limit.size()); + size_t diff_index = 0; + while ((diff_index < min_length) && + ((*start)[diff_index] == limit[diff_index])) { + diff_index++; + } + + if (diff_index >= min_length) { + // Do not shorten if one string is a prefix of the other + } else { + uint8_t diff_byte = static_cast<uint8_t>((*start)[diff_index]); + if (diff_byte < static_cast<uint8_t>(0xff) && + diff_byte + 1 < static_cast<uint8_t>(limit[diff_index])) { + (*start)[diff_index]++; + start->resize(diff_index + 1); + assert(Compare(*start, limit) < 0); + } + } + } + + virtual void FindShortSuccessor(std::string* key) const { + // Find first character that can be incremented + size_t n = key->size(); + for (size_t i = 0; i < n; i++) { + const uint8_t byte = (*key)[i]; + if (byte != static_cast<uint8_t>(0xff)) { + (*key)[i] = byte + 1; + key->resize(i+1); + return; + } + } + // *key is a run of 0xffs. Leave it alone. + } +}; +} // namespace + +static const Comparator* bytewise = new BytewiseComparatorImpl; + +const Comparator* BytewiseComparator() { + return bytewise; +} + +} // namespace leveldb diff --git a/api/leveldb/util/env.cc b/api/leveldb/util/env.cc new file mode 100644 index 00000000000..33062fbdd1b --- /dev/null +++ b/api/leveldb/util/env.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "hyperleveldb/env.h" + +namespace leveldb { + +Env::~Env() { +} + +SequentialFile::~SequentialFile() { +} + +RandomAccessFile::~RandomAccessFile() { +} + +WritableFile::~WritableFile() { +} + +Logger::~Logger() { +} + +FileLock::~FileLock() { +} + +void Log(Logger* info_log, const char* format, ...) { + if (info_log != NULL) { + va_list ap; + va_start(ap, format); + info_log->Logv(format, ap); + va_end(ap); + } +} + +static Status DoWriteStringToFile(Env* env, const Slice& data, + const std::string& fname, + bool should_sync) { + WritableFile* file; + Status s = env->NewWritableFile(fname, &file); + if (!s.ok()) { + return s; + } + s = file->Append(data); + if (s.ok() && should_sync) { + s = file->Sync(); + } + if (s.ok()) { + s = file->Close(); + } + delete file; // Will auto-close if we did not close above + if (!s.ok()) { + env->DeleteFile(fname); + } + return s; +} + +Status WriteStringToFile(Env* env, const Slice& data, + const std::string& fname) { + return DoWriteStringToFile(env, data, fname, false); +} + +Status WriteStringToFileSync(Env* env, const Slice& data, + const std::string& fname) { + return DoWriteStringToFile(env, data, fname, true); +} + +Status ReadFileToString(Env* env, const std::string& fname, std::string* data) { + data->clear(); + SequentialFile* file; + Status s = env->NewSequentialFile(fname, &file); + if (!s.ok()) { + return s; + } + static const int kBufferSize = 8192; + char* space = new char[kBufferSize]; + while (true) { + Slice fragment; + s = file->Read(kBufferSize, &fragment, space); + if (!s.ok()) { + break; + } + data->append(fragment.data(), fragment.size()); + if (fragment.empty()) { + break; + } + } + delete[] space; + delete file; + return s; +} + +EnvWrapper::~EnvWrapper() { +} + +} // namespace leveldb diff --git a/api/leveldb/util/env_posix.cc b/api/leveldb/util/env_posix.cc new file mode 100644 index 00000000000..66dab413360 --- /dev/null +++ b/api/leveldb/util/env_posix.cc @@ -0,0 +1,617 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include <deque> +#include <dirent.h> +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/types.h> +#include <time.h> +#include <unistd.h> +#if defined(LEVELDB_PLATFORM_ANDROID) +#include <sys/stat.h> +#endif +#include "leveldb_wt.h" +#include "port/port.h" +#include "util/logging.h" +#include "util/posix_logger.h" + +namespace leveldb { + +namespace { + +static Status IOError(const std::string& context, int err_number) { + return Status::IOError(context, strerror(err_number)); +} + +class PosixSequentialFile: public SequentialFile { + private: + std::string filename_; + FILE* file_; + + public: + PosixSequentialFile(const std::string& fname, FILE* f) + : filename_(fname), file_(f) { } + virtual ~PosixSequentialFile() { fclose(file_); } + + virtual Status Read(size_t n, Slice* result, char* scratch) { + Status s; + size_t r = fread_unlocked(scratch, 1, n, file_); + *result = Slice(scratch, r); + if (r < n) { + if (feof(file_)) { + // We leave status as ok if we hit the end of the file + } else { + // A partial read with an error: return a non-ok status + s = IOError(filename_, errno); + } + } + return s; + } + + virtual Status Skip(uint64_t n) { + if (fseek(file_, n, SEEK_CUR)) { + return IOError(filename_, errno); + } + return Status::OK(); + } +}; + +// pread() based random-access +class PosixRandomAccessFile: public RandomAccessFile { + private: + std::string filename_; + int fd_; + + public: + PosixRandomAccessFile(const std::string& fname, int fd) + : filename_(fname), fd_(fd) { } + virtual ~PosixRandomAccessFile() { close(fd_); } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + ssize_t r = pread(fd_, scratch, n, static_cast<off_t>(offset)); + *result = Slice(scratch, (r < 0) ? 0 : r); + if (r < 0) { + // An error: return a non-ok status + s = IOError(filename_, errno); + } + return s; + } +}; + +// mmap() based random-access +class PosixMmapReadableFile: public RandomAccessFile { + private: + std::string filename_; + void* mmapped_region_; + size_t length_; + + public: + // base[0,length-1] contains the mmapped contents of the file. + PosixMmapReadableFile(const std::string& fname, void* base, size_t length) + : filename_(fname), mmapped_region_(base), length_(length) { } + virtual ~PosixMmapReadableFile() { munmap(mmapped_region_, length_); } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + if (offset + n > length_) { + *result = Slice(); + s = IOError(filename_, EINVAL); + } else { + *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n); + } + return s; + } +}; + +// We preallocate up to an extra megabyte and use memcpy to append new +// data to the file. This is safe since we either properly close the +// file before reading from it, or for log files, the reading code +// knows enough to skip zero suffixes. +class PosixMmapFile : public WritableFile { + private: + std::string filename_; + int fd_; + size_t page_size_; + size_t map_size_; // How much extra memory to map at a time + char* base_; // The mapped region + char* limit_; // Limit of the mapped region + char* dst_; // Where to write next (in range [base_,limit_]) + char* last_sync_; // Where have we synced up to + uint64_t file_offset_; // Offset of base_ in file + + // Have we done an munmap of unsynced data? + bool pending_sync_; + + // Roundup x to a multiple of y + static size_t Roundup(size_t x, size_t y) { + return ((x + y - 1) / y) * y; + } + + size_t TruncateToPageBoundary(size_t s) { + s -= (s & (page_size_ - 1)); + assert((s % page_size_) == 0); + return s; + } + + bool UnmapCurrentRegion() { + bool result = true; + if (base_ != NULL) { + if (last_sync_ < limit_) { + // Defer syncing this data until next Sync() call, if any + pending_sync_ = true; + } + if (munmap(base_, limit_ - base_) != 0) { + result = false; + } + file_offset_ += limit_ - base_; + base_ = NULL; + limit_ = NULL; + last_sync_ = NULL; + dst_ = NULL; + + // Increase the amount we map the next time, but capped at 1MB + if (map_size_ < (1<<20)) { + map_size_ *= 2; + } + } + return result; + } + + bool MapNewRegion() { + assert(base_ == NULL); + if (ftruncate(fd_, file_offset_ + map_size_) < 0) { + return false; + } + void* ptr = mmap(NULL, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, + fd_, file_offset_); + if (ptr == MAP_FAILED) { + return false; + } + base_ = reinterpret_cast<char*>(ptr); + limit_ = base_ + map_size_; + dst_ = base_; + last_sync_ = base_; + return true; + } + + public: + PosixMmapFile(const std::string& fname, int fd, size_t page_size) + : filename_(fname), + fd_(fd), + page_size_(page_size), + map_size_(Roundup(65536, page_size)), + base_(NULL), + limit_(NULL), + dst_(NULL), + last_sync_(NULL), + file_offset_(0), + pending_sync_(false) { + assert((page_size & (page_size - 1)) == 0); + } + + + ~PosixMmapFile() { + if (fd_ >= 0) { + PosixMmapFile::Close(); + } + } + + virtual Status Append(const Slice& data) { + const char* src = data.data(); + size_t left = data.size(); + while (left > 0) { + assert(base_ <= dst_); + assert(dst_ <= limit_); + size_t avail = limit_ - dst_; + if (avail == 0) { + if (!UnmapCurrentRegion() || + !MapNewRegion()) { + return IOError(filename_, errno); + } + } + + size_t n = (left <= avail) ? left : avail; + memcpy(dst_, src, n); + dst_ += n; + src += n; + left -= n; + } + return Status::OK(); + } + + virtual Status Close() { + Status s; + size_t unused = limit_ - dst_; + if (!UnmapCurrentRegion()) { + s = IOError(filename_, errno); + } else if (unused > 0) { + // Trim the extra space at the end of the file + if (ftruncate(fd_, file_offset_ - unused) < 0) { + s = IOError(filename_, errno); + } + } + + if (close(fd_) < 0) { + if (s.ok()) { + s = IOError(filename_, errno); + } + } + + fd_ = -1; + base_ = NULL; + limit_ = NULL; + return s; + } + + virtual Status Flush() { + return Status::OK(); + } + + virtual Status Sync() { + Status s; + + if (pending_sync_) { + // Some unmapped data was not synced + pending_sync_ = false; + if (fdatasync(fd_) < 0) { + s = IOError(filename_, errno); + } + } + + if (dst_ > last_sync_) { + // Find the beginnings of the pages that contain the first and last + // bytes to be synced. + size_t p1 = TruncateToPageBoundary(last_sync_ - base_); + size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); + last_sync_ = dst_; + if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { + s = IOError(filename_, errno); + } + } + + return s; + } + +#ifndef WITHOUT_HYPERLEVELDB + virtual Status WriteAt(uint64_t offset, const Slice& data) { return Status::NotSupported("sorry!"); } +#endif +}; + +static int LockOrUnlock(int fd, bool lock) { + errno = 0; + struct flock f; + memset(&f, 0, sizeof(f)); + f.l_type = (lock ? F_WRLCK : F_UNLCK); + f.l_whence = SEEK_SET; + f.l_start = 0; + f.l_len = 0; // Lock/unlock entire file + return fcntl(fd, F_SETLK, &f); +} + +class PosixFileLock : public FileLock { + public: + int fd_; +}; + +class PosixEnv : public Env { + public: + PosixEnv(); + virtual ~PosixEnv() { + fprintf(stderr, "Destroying Env::Default()\n"); + exit(1); + } + + virtual Status NewSequentialFile(const std::string& fname, + SequentialFile** result) { + FILE* f = fopen(fname.c_str(), "r"); + if (f == NULL) { + *result = NULL; + return IOError(fname, errno); + } else { + *result = new PosixSequentialFile(fname, f); + return Status::OK(); + } + } + + virtual Status NewRandomAccessFile(const std::string& fname, + RandomAccessFile** result) { + *result = NULL; + Status s; + int fd = open(fname.c_str(), O_RDONLY); + if (fd < 0) { + s = IOError(fname, errno); + } else if (sizeof(void*) >= 8) { + // Use mmap when virtual address-space is plentiful. + uint64_t size; + s = GetFileSize(fname, &size); + if (s.ok()) { + void* base = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); + if (base != MAP_FAILED) { + *result = new PosixMmapReadableFile(fname, base, size); + } else { + s = IOError(fname, errno); + } + } + close(fd); + } else { + *result = new PosixRandomAccessFile(fname, fd); + } + return s; + } + + virtual Status NewWritableFile(const std::string& fname, + WritableFile** result) { + Status s; + const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); + if (fd < 0) { + *result = NULL; + s = IOError(fname, errno); + } else { + *result = new PosixMmapFile(fname, fd, page_size_); + } + return s; + } + + virtual bool FileExists(const std::string& fname) { + return access(fname.c_str(), F_OK) == 0; + } + + virtual Status GetChildren(const std::string& dir, + std::vector<std::string>* result) { + result->clear(); + DIR* d = opendir(dir.c_str()); + if (d == NULL) { + return IOError(dir, errno); + } + struct dirent* entry; + while ((entry = readdir(d)) != NULL) { + result->push_back(entry->d_name); + } + closedir(d); + return Status::OK(); + } + + virtual Status DeleteFile(const std::string& fname) { + Status result; + if (unlink(fname.c_str()) != 0) { + result = IOError(fname, errno); + } + return result; + }; + + virtual Status CreateDir(const std::string& name) { + Status result; + if (mkdir(name.c_str(), 0755) != 0) { + result = IOError(name, errno); + } + return result; + }; + + virtual Status DeleteDir(const std::string& name) { + Status result; + if (rmdir(name.c_str()) != 0) { + result = IOError(name, errno); + } + return result; + }; + + virtual Status GetFileSize(const std::string& fname, uint64_t* size) { + Status s; + struct stat sbuf; + if (stat(fname.c_str(), &sbuf) != 0) { + *size = 0; + s = IOError(fname, errno); + } else { + *size = sbuf.st_size; + } + return s; + } + + virtual Status RenameFile(const std::string& src, const std::string& target) { + Status result; + if (rename(src.c_str(), target.c_str()) != 0) { + result = IOError(src, errno); + } + return result; + } + + virtual Status LockFile(const std::string& fname, FileLock** lock) { + *lock = NULL; + Status result; + int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644); + if (fd < 0) { + result = IOError(fname, errno); + } else if (LockOrUnlock(fd, true) == -1) { + result = IOError("lock " + fname, errno); + close(fd); + } else { + PosixFileLock* my_lock = new PosixFileLock; + my_lock->fd_ = fd; + *lock = my_lock; + } + return result; + } + + virtual Status UnlockFile(FileLock* lock) { + PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock); + Status result; + if (LockOrUnlock(my_lock->fd_, false) == -1) { + result = IOError("unlock", errno); + } + close(my_lock->fd_); + delete my_lock; + return result; + } + + virtual void Schedule(void (*function)(void*), void* arg); + + virtual void StartThread(void (*function)(void* arg), void* arg); + + virtual Status GetTestDirectory(std::string* result) { + const char* env = getenv("TEST_TMPDIR"); + if (env && env[0] != '\0') { + *result = env; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "/tmp/leveldbtest-%d", int(geteuid())); + *result = buf; + } + // Directory may already exist + CreateDir(*result); + return Status::OK(); + } + + static uint64_t gettid() { + pthread_t tid = pthread_self(); + uint64_t thread_id = 0; + memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); + return thread_id; + } + + virtual Status NewLogger(const std::string& fname, Logger** result) { + FILE* f = fopen(fname.c_str(), "w"); + if (f == NULL) { + *result = NULL; + return IOError(fname, errno); + } else { + *result = new PosixLogger(f, &PosixEnv::gettid); + return Status::OK(); + } + } + + virtual uint64_t NowMicros() { + struct timeval tv; + gettimeofday(&tv, NULL); + return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec; + } + + virtual void SleepForMicroseconds(int micros) { + usleep(micros); + } + +#ifndef WITHOUT_HYPERLEVELDB + virtual Status CopyFile(const std::string&, const std::string&) { return Status::NotSupported("sorry!"); } + virtual Status LinkFile(const std::string&, const std::string&) { return Status::NotSupported("sorry!"); } +#endif + + private: + void PthreadCall(const char* label, int result) { + if (result != 0) { + fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + exit(1); + } + } + + // BGThread() is the body of the background thread + void BGThread(); + static void* BGThreadWrapper(void* arg) { + reinterpret_cast<PosixEnv*>(arg)->BGThread(); + return NULL; + } + + size_t page_size_; + pthread_mutex_t mu_; + pthread_cond_t bgsignal_; + pthread_t bgthread_; + bool started_bgthread_; + + // Entry per Schedule() call + struct BGItem { void* arg; void (*function)(void*); }; + typedef std::deque<BGItem> BGQueue; + BGQueue queue_; +}; + +PosixEnv::PosixEnv() : page_size_(getpagesize()), + started_bgthread_(false) { + PthreadCall("mutex_init", pthread_mutex_init(&mu_, NULL)); + PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, NULL)); +} + +void PosixEnv::Schedule(void (*function)(void*), void* arg) { + PthreadCall("lock", pthread_mutex_lock(&mu_)); + + // Start background thread if necessary + if (!started_bgthread_) { + started_bgthread_ = true; + PthreadCall( + "create thread", + pthread_create(&bgthread_, NULL, &PosixEnv::BGThreadWrapper, this)); + } + + // If the queue is currently empty, the background thread may currently be + // waiting. + if (queue_.empty()) { + PthreadCall("signal", pthread_cond_signal(&bgsignal_)); + } + + // Add to priority queue + queue_.push_back(BGItem()); + queue_.back().function = function; + queue_.back().arg = arg; + + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); +} + +void PosixEnv::BGThread() { + while (true) { + // Wait until there is an item that is ready to run + PthreadCall("lock", pthread_mutex_lock(&mu_)); + while (queue_.empty()) { + PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_)); + } + + void (*function)(void*) = queue_.front().function; + void* arg = queue_.front().arg; + queue_.pop_front(); + + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); + (*function)(arg); + } +} + +namespace { +struct StartThreadState { + void (*user_function)(void*); + void* arg; +}; +} +static void* StartThreadWrapper(void* arg) { + StartThreadState* state = reinterpret_cast<StartThreadState*>(arg); + state->user_function(state->arg); + delete state; + return NULL; +} + +void PosixEnv::StartThread(void (*function)(void* arg), void* arg) { + pthread_t t; + StartThreadState* state = new StartThreadState; + state->user_function = function; + state->arg = arg; + PthreadCall("start thread", + pthread_create(&t, NULL, &StartThreadWrapper, state)); +} + +} // namespace + +static pthread_once_t once = PTHREAD_ONCE_INIT; +static Env* default_env; +static void InitDefaultEnv() { default_env = new PosixEnv; } + +Env* Env::Default() { + pthread_once(&once, InitDefaultEnv); + return default_env; +} + +} // namespace leveldb diff --git a/api/leveldb/util/logging.cc b/api/leveldb/util/logging.cc new file mode 100644 index 00000000000..1d60ce22701 --- /dev/null +++ b/api/leveldb/util/logging.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/logging.h" + +#include <errno.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include "leveldb_wt.h" + +namespace leveldb { + +void AppendNumberTo(std::string* str, uint64_t num) { + char buf[30]; + snprintf(buf, sizeof(buf), "%llu", (unsigned long long) num); + str->append(buf); +} + +void AppendEscapedStringTo(std::string* str, const Slice& value) { + for (size_t i = 0; i < value.size(); i++) { + char c = value[i]; + if (c >= ' ' && c <= '~') { + str->push_back(c); + } else { + char buf[10]; + snprintf(buf, sizeof(buf), "\\x%02x", + static_cast<unsigned int>(c) & 0xff); + str->append(buf); + } + } +} + +std::string NumberToString(uint64_t num) { + std::string r; + AppendNumberTo(&r, num); + return r; +} + +std::string EscapeString(const Slice& value) { + std::string r; + AppendEscapedStringTo(&r, value); + return r; +} + +bool ConsumeChar(Slice* in, char c) { + if (!in->empty() && (*in)[0] == c) { + in->remove_prefix(1); + return true; + } else { + return false; + } +} + +bool ConsumeDecimalNumber(Slice* in, uint64_t* val) { + uint64_t v = 0; + int digits = 0; + while (!in->empty()) { + char c = (*in)[0]; + if (c >= '0' && c <= '9') { + ++digits; + const int delta = (c - '0'); + static const uint64_t kMaxUint64 = ~static_cast<uint64_t>(0); + if (v > kMaxUint64/10 || + (v == kMaxUint64/10 && delta > kMaxUint64%10)) { + // Overflow + return false; + } + v = (v * 10) + delta; + in->remove_prefix(1); + } else { + break; + } + } + *val = v; + return (digits > 0); +} + +} // namespace leveldb diff --git a/api/leveldb/util/logging.h b/api/leveldb/util/logging.h new file mode 100644 index 00000000000..b0c5da813e8 --- /dev/null +++ b/api/leveldb/util/logging.h @@ -0,0 +1,47 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Must not be included from any .h files to avoid polluting the namespace +// with macros. + +#ifndef STORAGE_LEVELDB_UTIL_LOGGING_H_ +#define STORAGE_LEVELDB_UTIL_LOGGING_H_ + +#include <stdio.h> +#include <stdint.h> +#include <string> +#include "port/port.h" + +namespace leveldb { + +class Slice; +class WritableFile; + +// Append a human-readable printout of "num" to *str +extern void AppendNumberTo(std::string* str, uint64_t num); + +// Append a human-readable printout of "value" to *str. +// Escapes any non-printable characters found in "value". +extern void AppendEscapedStringTo(std::string* str, const Slice& value); + +// Return a human-readable printout of "num" +extern std::string NumberToString(uint64_t num); + +// Return a human-readable version of "value". +// Escapes any non-printable characters found in "value". +extern std::string EscapeString(const Slice& value); + +// If *in starts with "c", advances *in past the first character and +// returns true. Otherwise, returns false. +extern bool ConsumeChar(Slice* in, char c); + +// Parse a human-readable number from "*in" into *value. On success, +// advances "*in" past the consumed number and sets "*val" to the +// numeric value. Otherwise, returns false and leaves *in in an +// unspecified state. +extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val); + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_UTIL_LOGGING_H_ diff --git a/api/leveldb/util/options.cc b/api/leveldb/util/options.cc new file mode 100644 index 00000000000..6956ab6e739 --- /dev/null +++ b/api/leveldb/util/options.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "hyperleveldb/options.h" + +#include "hyperleveldb/comparator.h" +#include "hyperleveldb/env.h" + +namespace leveldb { + +Options::Options() + : comparator(BytewiseComparator()), + create_if_missing(false), + error_if_exists(false), + paranoid_checks(false), + env(Env::Default()), + info_log(NULL), + write_buffer_size(4<<20), + max_open_files(1000), + block_cache(NULL), + block_size(4096), + block_restart_interval(16), + compression(kSnappyCompression), + filter_policy(NULL) { +} + + +} // namespace leveldb diff --git a/api/leveldb/util/posix_logger.h b/api/leveldb/util/posix_logger.h new file mode 100644 index 00000000000..f15de45e05e --- /dev/null +++ b/api/leveldb/util/posix_logger.h @@ -0,0 +1,98 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Logger implementation that can be shared by all environments +// where enough posix functionality is available. + +#ifndef STORAGE_LEVELDB_UTIL_POSIX_LOGGER_H_ +#define STORAGE_LEVELDB_UTIL_POSIX_LOGGER_H_ + +#include <algorithm> +#include <stdio.h> +#include <sys/time.h> +#include <time.h> +#include "leveldb_wt.h" + +namespace leveldb { + +class PosixLogger : public Logger { + private: + FILE* file_; + uint64_t (*gettid_)(); // Return the thread id for the current thread + public: + PosixLogger(FILE* f, uint64_t (*gettid)()) : file_(f), gettid_(gettid) { } + virtual ~PosixLogger() { + fclose(file_); + } + virtual void Logv(const char* format, va_list ap) { + const uint64_t thread_id = (*gettid_)(); + + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. + char buffer[500]; + for (int iter = 0; iter < 2; iter++) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 30000; + base = new char[bufsize]; + } + char* p = base; + char* limit = base + bufsize; + + struct timeval now_tv; + gettimeofday(&now_tv, NULL); + const time_t seconds = now_tv.tv_sec; + struct tm t; + localtime_r(&seconds, &t); + p += snprintf(p, limit - p, + "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", + t.tm_year + 1900, + t.tm_mon + 1, + t.tm_mday, + t.tm_hour, + t.tm_min, + t.tm_sec, + static_cast<int>(now_tv.tv_usec), + static_cast<long long unsigned int>(thread_id)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + p += vsnprintf(p, limit - p, format, backup_ap); + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + fwrite(base, 1, p - base, file_); + fflush(file_); + if (base != buffer) { + delete[] base; + } + break; + } + } +}; + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_UTIL_POSIX_LOGGER_H_ diff --git a/api/leveldb/util/random.h b/api/leveldb/util/random.h new file mode 100644 index 00000000000..66e0c94e7cb --- /dev/null +++ b/api/leveldb/util/random.h @@ -0,0 +1,72 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_UTIL_RANDOM_H_ +#define STORAGE_LEVELDB_UTIL_RANDOM_H_ + +#include <stdint.h> + +namespace leveldb { + +// A very simple random number generator. Not especially good at +// generating truly random bits, but good enough for our needs in this +// package. +class Random { + private: + uint32_t seed_; + public: + explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { } + uint32_t Next() { + static const uint32_t M = 2147483647L; // 2^31-1 + static const uint64_t A = 16807; // bits 14, 8, 7, 5, 2, 1, 0 + // We are computing + // seed_ = (seed_ * A) % M, where M = 2^31-1 + // + // seed_ must not be zero or M, or else all subsequent computed values + // will be zero or M respectively. For all other values, seed_ will end + // up cycling through every number in [1,M-1] + uint64_t product = seed_ * A; + + // Compute (product % M) using the fact that ((x << 31) % M) == x. + seed_ = static_cast<uint32_t>((product >> 31) + (product & M)); + // The first reduction may overflow by 1 bit, so we may need to + // repeat. mod == M is not possible; using > allows the faster + // sign-bit-based test. + if (seed_ > M) { + seed_ -= M; + } + return seed_; + } + // Returns a uniformly distributed value in the range [0..n-1] + // REQUIRES: n > 0 + uint32_t Uniform(int n) { return Next() % n; } + + // Randomly returns true ~"1/n" of the time, and false otherwise. + // REQUIRES: n > 0 + bool OneIn(int n) { return (Next() % n) == 0; } + + // Skewed: pick "base" uniformly from range [0,max_log] and then + // return "base" random bits. The effect is to pick a number in the + // range [0,2^max_log-1] with exponential bias towards smaller numbers. + uint32_t Skewed(int max_log) { + return Uniform(1 << Uniform(max_log + 1)); + } + + // Shuffle the array into random order + void Shuffle(int *array, int n) { + if (n > 1) { + int i; + for (i=0; i<n-1; i++) { + int j = i + Next() / (2147483647 / (n-i) + 1); + int t = array[j]; + array[j] = array[i]; + array[i] = t; + } + } + } +}; + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_UTIL_RANDOM_H_ diff --git a/api/leveldb/util/status.cc b/api/leveldb/util/status.cc new file mode 100644 index 00000000000..51a8210b679 --- /dev/null +++ b/api/leveldb/util/status.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include <stdio.h> +#include "leveldb_wt.h" + +namespace leveldb { + +const char* Status::CopyState(const char* state) { + uint32_t size; + memcpy(&size, state, sizeof(size)); + char* result = new char[size + 5]; + memcpy(result, state, size + 5); + return result; +} + +Status::Status(Code code, const Slice& msg, const Slice& msg2) { + assert(code != kOk); + const uint32_t len1 = msg.size(); + const uint32_t len2 = msg2.size(); + const uint32_t size = len1 + (len2 ? (2 + len2) : 0); + char* result = new char[size + 5]; + memcpy(result, &size, sizeof(size)); + result[4] = static_cast<char>(code); + memcpy(result + 5, msg.data(), len1); + if (len2) { + result[5 + len1] = ':'; + result[6 + len1] = ' '; + memcpy(result + 7 + len1, msg2.data(), len2); + } + state_ = result; +} + +std::string Status::ToString() const { + if (state_ == NULL) { + return "OK"; + } else { + char tmp[30]; + const char* type; + switch (code()) { + case kOk: + type = "OK"; + break; + case kNotFound: + type = "NotFound: "; + break; + case kCorruption: + type = "Corruption: "; + break; + case kNotSupported: + type = "Not implemented: "; + break; + case kInvalidArgument: + type = "Invalid argument: "; + break; + case kIOError: + type = "IO error: "; + break; + default: + snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", + static_cast<int>(code())); + type = tmp; + break; + } + std::string result(type); + uint32_t length; + memcpy(&length, state_, sizeof(length)); + result.append(state_ + 5, length); + return result; + } +} + +} // namespace leveldb |