diff options
author | Ramon Fernandez <ramon@mongodb.com> | 2016-01-27 17:17:17 -0500 |
---|---|---|
committer | Ramon Fernandez <ramon@mongodb.com> | 2016-01-27 17:17:21 -0500 |
commit | 90118b147a6943b19dc929862a11071538db1438 (patch) | |
tree | d1e2c409595332174953e7dbe2db262935d89ae5 /src/third_party/wiredtiger/api/leveldb | |
parent | b8cad6a59cbce2831e69e6b94f9544d83d6e00b0 (diff) | |
download | mongo-90118b147a6943b19dc929862a11071538db1438.tar.gz |
Import wiredtiger-wiredtiger-2.7.0-505-g7fea169.tar.gz from wiredtiger branch mongodb-3.4
ref: 44463c5..7fea169
WT-2355 Fix minor scratch buffer usage in logging.
WT-2348 xargs -P isn't portable
WT-2347 Java: schema format edge cases
WT-2344 OS X compiler warning
WT-2342 Enhance wtperf to support background create and drop operations
WT-2340 Add logging guarantee assertions, whitespace
WT-2339 format post-rebalance verify failure (stress run #11586)
WT-2338 Disable using pre-allocated log files when backup cursor is open
WT-2335 NULL pointer crash in config_check_search with invalid configuration string
WT-2333 Add a flag so drop doesn't block
WT-2332 Bug in logging write-no-sync mode
WT-2331 Checking of search() result for reference cursors before join()
WT-2328 schema drop does direct unlink, it should use a block manager interface.
WT-2326 Change WTPERF to use new memory allocation functions instead of the standard
WT-2321 WT-2321: race between eviction and worker threads on the eviction queue
WT-2320 Only check copyright when cutting releases
WT-2316 stress test failure: WT_CURSOR.prev out-of-order returns
WT-2314 page-swap error handling is inconsistent
WT-2313 sweep-server: conn_dhandle.c, 610: dhandle != conn->cache->evict_file_next
WT-2312 re-creating a deleted column-store page can corrupt the in-memory tree
WT-2308 custom extractor for ref_cursors in join cursor
WT-2305 Fix coverity scan issues on 23/12/2015
WT-2296 New log algorithm needs improving for sync/flush settings
WT-2295 WT_SESSION.create does a full-scan of the main table
WT-2287 WT_SESSION.rebalance
WT-2275 broken DB after application crash
WT-2267 Improve wtperf throttling implementation to provide steady load
WT-2247 variable-length column-store in-memory page splits
WT-2242 WiredTiger treats dead trees the same as other trees in eviction
WT-2142 Connection cleanup in Python tests
WT-2073 metadata cleanups
WT-1801 Add a directory sync after rollback of a WT_SESSION::rename operation
WT-1517 schema format edge cases
SERVER-22064 Coverity analysis defect 77699: Unchecked return value
SERVER-21619 sys-perf: WT crash during core_workloads_WT execution
Diffstat (limited to 'src/third_party/wiredtiger/api/leveldb')
45 files changed, 7963 insertions, 0 deletions
diff --git a/src/third_party/wiredtiger/api/leveldb/Makefile.am b/src/third_party/wiredtiger/api/leveldb/Makefile.am new file mode 100644 index 00000000000..2cfd9d945a5 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/Makefile.am @@ -0,0 +1,81 @@ +AM_CPPFLAGS = -I$(top_builddir) -I$(srcdir)/leveldb -I$(srcdir)/leveldb/include + +lib_LTLIBRARIES = libwiredtiger_leveldb.la + +noinst_PROGRAMS = leveldb_test + +# Setup the LevelDB headers to be installed in a wiredtiger/leveldb +# subdirectory, so we don't interfere with other LevelDB installs. +if HAVE_HYPERLEVELDB +leveldbincludedir = $(includedir)/wiredtiger/hyperleveldb +else +if HAVE_ROCKSDB +leveldbincludedir = $(includedir)/wiredtiger/rocksdb +else +leveldbincludedir = $(includedir)/wiredtiger/leveldb +endif +endif +leveldbinclude_HEADERS = \ + leveldb_wt_config.h \ + leveldb/include/leveldb/cache.h \ + leveldb/include/leveldb/comparator.h\ + leveldb/include/leveldb/db.h \ + leveldb/include/leveldb/env.h \ + leveldb/include/leveldb/filter_policy.h \ + leveldb/include/leveldb/iterator.h \ + leveldb/include/leveldb/options.h \ + leveldb/include/leveldb/slice.h \ + leveldb/include/leveldb/status.h \ + leveldb/include/leveldb/write_batch.h + +if HAVE_BASHOLEVELDB +AM_CPPFLAGS += -I$(srcdir)/leveldb/include/leveldb -I$(srcdir)/basho +leveldbinclude_HEADERS += \ + basho/perf_count.h +endif +if HAVE_HYPERLEVELDB +AM_CPPFLAGS += -I$(srcdir)/leveldb/include/leveldb -I$(srcdir)/hyperleveldb +leveldbinclude_HEADERS += \ + hyperleveldb/replay_iterator.h +endif + +libwiredtiger_leveldb_la_LDFLAGS = -release @VERSION@ +libwiredtiger_leveldb_la_LIBADD = $(top_builddir)/libwiredtiger_static.la +libwiredtiger_leveldb_la_SOURCES = \ + leveldb_wt.cc \ + leveldb/util/coding.cc leveldb/util/comparator.cc leveldb/util/env.cc leveldb/util/env_posix.cc \ + leveldb/util/logging.cc leveldb/util/options.cc leveldb/util/status.cc + +if HAVE_BASHOLEVELDB +libwiredtiger_leveldb_la_SOURCES += basho/perf_count.cc +endif +if HAVE_HYPERLEVELDB +libwiredtiger_leveldb_la_SOURCES += hyper_wt.cc +endif +if HAVE_ROCKSDB +libwiredtiger_leveldb_la_SOURCES += rocks_wt.cc rocksdb/write_batch.cc +else +libwiredtiger_leveldb_la_SOURCES += leveldb/db/write_batch.cc +endif + +if HAVE_ROCKSDB +pkglib_LTLIBRARIES = librocksdb.la +else +pkglib_LTLIBRARIES = libleveldb.la +endif + +libleveldb_la_LDFLAGS = -release @VERSION@ +libleveldb_la_LIBADD = $(top_builddir)/libwiredtiger_static.la +libleveldb_la_SOURCES = $(libwiredtiger_leveldb_la_SOURCES) + +librocksdb_la_LDFLAGS = -release @VERSION@ +librocksdb_la_LIBADD = $(top_builddir)/libwiredtiger_static.la +librocksdb_la_SOURCES = $(libwiredtiger_leveldb_la_SOURCES) + +leveldb_test_SOURCES = leveldb_test.cc +leveldb_test_LDADD = libwiredtiger_leveldb.la + +TESTS = $(noinst_PROGRAMS) + +clean-local: + rm -rf WTLDB_HOME diff --git a/src/third_party/wiredtiger/api/leveldb/basho/perf_count.cc b/src/third_party/wiredtiger/api/leveldb/basho/perf_count.cc new file mode 100644 index 00000000000..0e666ac1dc0 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/basho/perf_count.cc @@ -0,0 +1,657 @@ +// ------------------------------------------------------------------- +// +// perf_count.cc: performance counters LevelDB +// +// Copyright (c) 2012-2013 Basho Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// ------------------------------------------------------------------- + +#include <limits.h> +#include <stdio.h> +#include <sys/ipc.h> +#include <sys/shm.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <syslog.h> +#include <memory.h> +#include <errno.h> + +#ifndef STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_ +#include "perf_count.h" +#endif + +#include "util/coding.h" + +#define __STDC_FORMAT_MACROS +#include <inttypes.h> + +#ifdef OS_SOLARIS +# include <atomic.h> +#endif + + +namespace leveldb +{ + +// always have something active in gPerfCounters, eliminates +// need to test for "is shared object attached yet" +static PerformanceCounters LocalStartupCounters; +PerformanceCounters * gPerfCounters(&LocalStartupCounters); + + + SstCounters::SstCounters() + : m_IsReadOnly(false), + m_Version(eSstCountVersion), + m_CounterSize(eSstCountEnumSize) + { + memset(m_Counter, 0, sizeof(m_Counter)); + + m_Counter[eSstCountKeySmallest]=ULLONG_MAX; + m_Counter[eSstCountValueSmallest]=ULLONG_MAX; + + return; + + }; // SstCounters::SstCounters + + + void + SstCounters::EncodeTo( + std::string & Dst) const + { + unsigned loop; + + PutVarint32(&Dst, m_Version); + PutVarint32(&Dst, m_CounterSize); + + for(loop=0; loop<eSstCountEnumSize; ++loop) + PutVarint64(&Dst, m_Counter[loop]); + } // SstCounters::EncodeTo + + + Status + SstCounters::DecodeFrom( + const Slice& src) + { + Status ret_status; + Slice cursor; + bool good; + int loop; + + cursor=src; + m_IsReadOnly=true; + good=GetVarint32(&cursor, &m_Version); + good=good && (m_Version<=eSstCountVersion); + + // all lesser number of stats to be read + good=good && GetVarint32(&cursor, &m_CounterSize); + if (good && eSstCountEnumSize < m_CounterSize) + m_CounterSize=eSstCountEnumSize; + + for (loop=0; good && loop<eSstCountEnumSize; ++loop) + { + good=GetVarint64(&cursor, &m_Counter[loop]); + } // for + + // if (!good) change ret_status to bad + + return(ret_status); + + } // SstCounters::DecodeFrom + + + uint64_t + SstCounters::Inc( + unsigned Index) + { + uint64_t ret_val; + + ret_val=0; + if (!m_IsReadOnly && Index<m_CounterSize) + { + ++m_Counter[Index]; + ret_val=m_Counter[Index]; + } // if + + return(ret_val); + } // SstCounters::Inc + + + uint64_t + SstCounters::Add( + unsigned Index, + uint64_t Amount) + { + uint64_t ret_val; + + ret_val=0; + if (!m_IsReadOnly && Index<m_CounterSize) + { + m_Counter[Index]+=Amount; + ret_val=m_Counter[Index]; + } // if + + return(ret_val); + } // SstCounters::Add + + + uint64_t + SstCounters::Value( + unsigned Index) const + { + uint64_t ret_val; + + ret_val=0; + if (Index<m_CounterSize) + { + ret_val=m_Counter[Index]; + } // if + + return(ret_val); + } // SstCounters::Value + + + void + SstCounters::Set( + unsigned Index, + uint64_t Value) + { + if (Index<m_CounterSize) + { + m_Counter[Index]=Value; + } // if + + return; + } // SstCounters::Set + + + void + SstCounters::Dump() const + { + unsigned loop; + + printf("SstCounters:\n"); + printf(" m_IsReadOnly: %u\n", m_IsReadOnly); + printf(" m_Version: %u\n", m_Version); + printf(" m_CounterSize: %u\n", m_CounterSize); + for (loop=0; loop<m_CounterSize; ++loop) + printf(" Counter[%2u]: %" PRIu64 "\n", loop, m_Counter[loop]); + + return; + + } // SstCounters::Dump + + + // only used for local static objects, not shared memory objects + PerformanceCounters::PerformanceCounters() + { + m_Version=ePerfVersion; + m_CounterSize=ePerfCountEnumSize; + // cast away "volatile" + memset((void*)m_Counter, 0, sizeof(m_Counter)); + + return; + + } // PerformanceCounters::PerformanceCounters + + + PerformanceCounters * + PerformanceCounters::Init( + bool IsReadOnly) + { + PerformanceCounters * ret_ptr; + bool should_create, good; + int ret_val, id; + struct shmid_ds shm_info; + size_t open_size; + + ret_ptr=NULL; + memset(&shm_info, 0, sizeof(shm_info)); + good=true; + open_size=sizeof(PerformanceCounters); + + // first id attempt, minimal request + id=shmget(ePerfKey, 0, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (-1!=id) + ret_val=shmctl(id, IPC_STAT, &shm_info); + else + ret_val=-1; + + // does the shared memory already exists (and of proper size if writing) + should_create=(0!=ret_val || (shm_info.shm_segsz < sizeof(PerformanceCounters))) && !IsReadOnly; + + // should old shared memory be deleted? + if (should_create && 0==ret_val) + { + ret_val=shmctl(id, IPC_RMID, &shm_info); + good=(0==ret_val); + if (0!=ret_val) + syslog(LOG_ERR, "shmctl IPC_RMID failed [%d, %m]", errno); + } // if + + // else open the size that exists + else if (0==ret_val) + { + open_size=shm_info.shm_segsz; + } // else if + + // attempt to attach/create to shared memory instance + if (good) + { + int flags; + + if (IsReadOnly) + flags = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; + else + flags = IPC_CREAT | S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; + + m_PerfSharedId=shmget(ePerfKey, open_size, flags); + good=(-1!=m_PerfSharedId); + } // if + + // map shared memory instance + if (good) + { + ret_ptr=(PerformanceCounters *)shmat(m_PerfSharedId, NULL, (IsReadOnly ? SHM_RDONLY : 0)); + if ((void*)-1 != ret_ptr) + { + // initialize? + if (should_create || ePerfVersion!=ret_ptr->m_Version) + { + if (!IsReadOnly) + { + memset(ret_ptr, 0, sizeof(PerformanceCounters)); + ret_ptr->m_Version=ePerfVersion; + ret_ptr->m_CounterSize=ePerfCountEnumSize; + } // if + + // bad version match to existing segment + else + { + good=false; + errno=EINVAL; + } // else + } // if + } // if + else + { + good=false; + syslog(LOG_ERR, "shmat failed [%d, %m]", errno); + } // else + + if (good) + { + // make this available process wide + gPerfCounters=ret_ptr; + } // if + else + { + ret_ptr=NULL; + m_LastError=errno; + } // else + } // if + else + { + m_LastError=errno; + ret_ptr=NULL; + } // else + + return(ret_ptr); + + }; // PerformanceCounters::Init + + + int + PerformanceCounters::Close( + PerformanceCounters * Counts) + { + int ret_val; + + if (NULL!=Counts && &LocalStartupCounters != Counts) + { + // keep gPerf valid + if (gPerfCounters==Counts) + gPerfCounters=&LocalStartupCounters; + + ret_val=shmdt(Counts); + if (0!=ret_val) + ret_val=errno; + } // if + else + { + ret_val=EINVAL; + } // else + + return(ret_val); + } // PerformanceCounters::Close + + + uint64_t + PerformanceCounters::Inc( + unsigned Index) + { + uint64_t ret_val; + + ret_val=0; + if (Index<m_CounterSize) + { + volatile uint64_t * val_ptr; + + val_ptr=&m_Counter[Index]; + +# if ULONG_MAX != 4294967295UL +#ifdef OS_SOLARIS + atomic_inc_64(val_ptr); +#else + __sync_add_and_fetch(val_ptr, 1); +#endif +#else + // hack fest for 64 bit semi-atomic on 32bit machine + uint32_t ret_32, * ptr_32; + + ptr_32=(uint32_t *)&val_ptr; + ret_32=__sync_add_and_fetch(ptr_32, 1); + if (0==ret_32) + { + ++ptr_32; + __sync_add_and_fetch(ptr_32, 1); + } // if +#endif + ret_val=*val_ptr; + } // if + + return(ret_val); + } // PerformanceCounters::Inc + + + uint64_t + PerformanceCounters::Dec( + unsigned Index) + { + uint64_t ret_val; + + ret_val=0; + if (Index<m_CounterSize) + { + volatile uint64_t * val_ptr; + + val_ptr=&m_Counter[Index]; + +# if ULONG_MAX != 4294967295UL +#ifdef OS_SOLARIS + atomic_dec_64(val_ptr); +#else + __sync_sub_and_fetch(val_ptr, 1); +#endif +#else + // hack fest for 64 bit semi-atomic on 32bit machine + uint32_t ret_32, * ptr_32; + + ptr_32=(uint32_t *)&val_ptr; + ret_32=__sync_sub_and_fetch(ptr_32, 1); + if (0xFFFFFFFF==ret_32) + { + ++ptr_32; + __sync_sub_and_fetch(ptr_32, 1); + } // if +#endif + ret_val=*val_ptr; + } // if + + return(ret_val); + } // PerformanceCounters::Dec + + + uint64_t + PerformanceCounters::Add( + unsigned Index, + uint64_t Amount) + { + uint64_t ret_val; + + ret_val=0; + if (Index<m_CounterSize) + { + volatile uint64_t * val_ptr; + + val_ptr=&m_Counter[Index]; + +# if ULONG_MAX != 4294967295UL +#ifdef OS_SOLARIS + ret_val=atomic_add_64_nv(val_ptr, Amount); +#else + ret_val=__sync_add_and_fetch(val_ptr, Amount); +#endif +#else + // hack fest for 64 bit semi-atomic on 32bit machine + uint32_t old_32, ret_32, * ptr_32; + + ptr_32=(uint32_t *)&val_ptr; + old_32=*ptr_32; + ret_32=__sync_add_and_fetch(ptr_32, Amount); + if (ret_32<old_32) + { + ++ptr_32; + __sync_add_and_fetch(ptr_32, 1); + } // if + + ret_val=*val_ptr; +#endif + } // if + + return(ret_val); + } // PerformanceCounters::Add + + + uint64_t + PerformanceCounters::Value( + unsigned Index) const + { + uint64_t ret_val; + + ret_val=0; + if (Index<m_CounterSize) + { + ret_val=m_Counter[Index]; + } // if + + return(ret_val); + } // SstCounters::Value + + + void + PerformanceCounters::Set( + unsigned Index, + uint64_t Amount) + { + if (Index<m_CounterSize) + { + volatile uint64_t * val_ptr; + + val_ptr=&m_Counter[Index]; + + *val_ptr=Amount; + } // if + + return; + } // PerformanceCounters::Set + + + volatile const uint64_t * + PerformanceCounters::GetPtr( + unsigned Index) const + { + const volatile uint64_t * ret_ptr; + + if (Index<m_CounterSize) + ret_ptr=&m_Counter[Index]; + else + ret_ptr=&m_BogusCounter; + + return(ret_ptr); + + } // PerformanceCounters::GetPtr + + + const char * + PerformanceCounters::GetNamePtr( + unsigned Index) + { + const char * ret_ptr; + + if (Index<ePerfCountEnumSize) + ret_ptr=m_PerfCounterNames[Index]; + else + ret_ptr="???"; + + return(ret_ptr); + + } // PerformanceCounters::GetPtr + + + int PerformanceCounters::m_PerfSharedId=-1; + int PerformanceCounters::m_LastError=0; + volatile uint64_t PerformanceCounters::m_BogusCounter=0; + const char * PerformanceCounters::m_PerfCounterNames[]= + { + "ROFileOpen", + "ROFileClose", + "ROFileUnmap", + "RWFileOpen", + "RWFileClose", + "RWFileUnmap", + "ApiOpen", + "ApiGet", + "ApiWrite", + "WriteSleep", + "WriteWaitImm", + "WriteWaitLevel0", + "WriteNewMem", + "WriteError", + "WriteNoWait", + "GetMem", + "GetImm", + "GetVersion", + "SearchLevel[0]", + "SearchLevel[1]", + "SearchLevel[2]", + "SearchLevel[3]", + "SearchLevel[4]", + "SearchLevel[5]", + "SearchLevel[6]", + "TableCached", + "TableOpened", + "TableGet", + "BGCloseUnmap", + "BGCompactImm", + "BGNormal", + "BGCompactLevel0", + "BlockFiltered", + "BlockFilterFalse", + "BlockCached", + "BlockRead", + "BlockFilterRead", + "BlockValidGet", + "Debug[0]", + "Debug[1]", + "Debug[2]", + "Debug[3]", + "Debug[4]", + "ReadBlockError", + "DBIterNew", + "DBIterNext", + "DBIterPrev", + "DBIterSeek", + "DBIterSeekFirst", + "DBIterSeekLast", + "DBIterDelete", + "eleveldbDirect", + "eleveldbQueued", + "eleveldbDequeued", + "elevelRefCreate", + "elevelRefDelete", + "ThrottleGauge", + "ThrottleCounter", + "ThrottleMicros0", + "ThrottleKeys0", + "ThrottleBacklog0", + "ThrottleCompacts0", + "ThrottleMicros1", + "ThrottleKeys1", + "ThrottleBacklog1", + "ThrottleCompacts1", + "BGWriteError", + "ThrottleWait", + "ThreadError", + "BGImmDirect", + "BGImmQueued", + "BGImmDequeued", + "BGImmWeighted", + "BGUnmapDirect", + "BGUnmapQueued", + "BGUnmapDequeued", + "BGUnmapWeighted", + "BGLevel0Direct", + "BGLevel0Queued", + "BGLevel0Dequeued", + "BGLevel0Weighted", + "BGCompactDirect", + "BGCompactQueued", + "BGCompactDequeued", + "BGCompactWeighted", + "FileCacheInsert", + "FileCacheRemove", + "BlockCacheInsert", + "BlockCacheRemove", + "ApiDelete" + }; + + + int + PerformanceCounters::LookupCounter( + const char * Name) + { + int index,loop; + + index=-1; + + if (NULL!=Name && '\0'!=*Name) + { + for (loop=0; loop<ePerfCountEnumSize && -1==index; ++loop) + { + if (0==strcmp(m_PerfCounterNames[loop], Name)) + index=loop; + } // loop + } // if + + return(index); + }; + + void + PerformanceCounters::Dump() + { + int loop; + + printf(" m_Version: %u\n", m_Version); + printf(" m_CounterSize: %u\n", m_CounterSize); + + for (loop=0; loop<ePerfCountEnumSize; ++loop) + { + printf(" %s: %" PRIu64 "\n", m_PerfCounterNames[loop], m_Counter[loop]); + } // loop + }; // Dump + +} // namespace leveldb diff --git a/src/third_party/wiredtiger/api/leveldb/basho/perf_count.h b/src/third_party/wiredtiger/api/leveldb/basho/perf_count.h new file mode 100644 index 00000000000..b0f4abf9b66 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/basho/perf_count.h @@ -0,0 +1,298 @@ +// ------------------------------------------------------------------- +// +// perf_count.h: performance counters LevelDB +// +// Copyright (c) 2012-2013 Basho Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// ------------------------------------------------------------------- + +#ifndef STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_ +#define STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_ + +#include "leveldb_wt_config.h" + +#include <stdint.h> +#include <string> +#include "status.h" + +namespace leveldb { + +enum SstCountEnum +{ + // + // array index values/names + // + eSstCountKeys=0, //!< how many keys in this sst + eSstCountBlocks=1, //!< how many blocks in this sst + eSstCountCompressAborted=2,//!< how many blocks attempted compression and aborted use + eSstCountKeySize=3, //!< byte count of all keys + eSstCountValueSize=4, //!< byte count of all values + eSstCountBlockSize=5, //!< byte count of all blocks (pre-compression) + eSstCountBlockWriteSize=6, //!< post-compression size, or BlockSize if no compression + eSstCountIndexKeys=7, //!< how many keys in the index block + eSstCountKeyLargest=8, //!< largest key in sst + eSstCountKeySmallest=9, //!< smallest key in sst + eSstCountValueLargest=10, //!< largest value in sst + eSstCountValueSmallest=11, //!< smallest value in sst + eSstCountDeleteKey=12, //!< tombstone count + eSstCountBlockSizeUsed=13, //!< Options::block_size used with this file + eSstCountUserDataSize=14, //!< post-compression size of non-metadata (user keys/values/block overhead) + + // must follow last index name to represent size of array + eSstCountEnumSize, //!< size of the array described by the enum values + + eSstCountVersion=1 + +}; // enum SstCountEnum + + +class SstCounters +{ +protected: + bool m_IsReadOnly; //!< set when data decoded from a file + uint32_t m_Version; //!< object revision identification + uint32_t m_CounterSize; //!< number of objects in m_Counter + + uint64_t m_Counter[eSstCountEnumSize]; + +public: + // constructors / destructor + SstCounters(); + + // Put data into disk form + void EncodeTo(std::string & Dst) const; + + // Populate member data from prior EncodeTo block + Status DecodeFrom(const Slice& src); + + // increment the counter + uint64_t Inc(unsigned Index); + + // add value to the counter + uint64_t Add(unsigned Index, uint64_t Amount); + + // return value of a counter + uint64_t Value(unsigned Index) const; + + // set a value + void Set(unsigned Index, uint64_t); + + // return number of counters + uint32_t Size() const {return(m_CounterSize);}; + + // printf all values + void Dump() const; + +}; // class SstCounters + + +extern struct PerformanceCounters * gPerfCounters; + + +enum PerformanceCountersEnum +{ + // + // array index values/names + // (enum explicitly numbered to allow future edits / moves / inserts) + // + ePerfROFileOpen=0, //!< PosixMmapReadableFile open + ePerfROFileClose=1, //!< closed + ePerfROFileUnmap=2, //!< unmap without close + + ePerfRWFileOpen=3, //!< PosixMmapFile open + ePerfRWFileClose=4, //!< closed + ePerfRWFileUnmap=5, //!< unmap without close + + ePerfApiOpen=6, //!< Count of DB::Open completions + ePerfApiGet=7, //!< Count of DBImpl::Get completions + ePerfApiWrite=8, //!< Count of DBImpl::Get completions + + ePerfWriteSleep=9, //!< DBImpl::MakeRoomForWrite called sleep + ePerfWriteWaitImm=10, //!< DBImpl::MakeRoomForWrite called Wait on Imm compact + ePerfWriteWaitLevel0=11,//!< DBImpl::MakeRoomForWrite called Wait on Level0 compact + ePerfWriteNewMem=12, //!< DBImpl::MakeRoomForWrite created new memory log + ePerfWriteError=13, //!< DBImpl::MakeRoomForWrite saw bg_error_ + ePerfWriteNoWait=14, //!< DBImpl::MakeRoomForWrite took no action + + ePerfGetMem=15, //!< DBImpl::Get read from memory log + ePerfGetImm=16, //!< DBImpl::Get read from previous memory log + ePerfGetVersion=17, //!< DBImpl::Get read from Version object + + // code ASSUMES the levels are in numerical order, + // i.e. based off of ePerfSearchLevel0 + ePerfSearchLevel0=18, //!< Version::Get read searched one or more files here + ePerfSearchLevel1=19, //!< Version::Get read searched one or more files here + ePerfSearchLevel2=20, //!< Version::Get read searched one or more files here + ePerfSearchLevel3=21, //!< Version::Get read searched one or more files here + ePerfSearchLevel4=22, //!< Version::Get read searched one or more files here + ePerfSearchLevel5=23, //!< Version::Get read searched one or more files here + ePerfSearchLevel6=24, //!< Version::Get read searched one or more files here + + ePerfTableCached=25, //!< TableCache::FindTable found table in cache + ePerfTableOpened=26, //!< TableCache::FindTable had to open table file + ePerfTableGet=27, //!< TableCache::Get used to retrieve a key + + ePerfBGCloseUnmap=28, //!< PosixEnv::BGThreaed started Unmap/Close job + ePerfBGCompactImm=29, //!< PosixEnv::BGThreaed started compaction of Imm + ePerfBGNormal=30, //!< PosixEnv::BGThreaed started normal compaction job + ePerfBGCompactLevel0=31,//!< PosixEnv::BGThreaed started compaction of Level0 + + ePerfBlockFiltered=32, //!< Table::BlockReader search stopped due to filter + ePerfBlockFilterFalse=33,//!< Table::BlockReader gave a false positive for match + ePerfBlockCached=34, //!< Table::BlockReader found block in cache + ePerfBlockRead=35, //!< Table::BlockReader read block from disk + ePerfBlockFilterRead=36,//!< Table::ReadMeta filter loaded from file + ePerfBlockValidGet=37, //!< Table::InternalGet has valid iterator + + ePerfDebug0=38, //!< Developer debug counters, moveable + ePerfDebug1=39, //!< Developer debug counters, moveable + ePerfDebug2=40, //!< Developer debug counters, moveable + ePerfDebug3=41, //!< Developer debug counters, moveable + ePerfDebug4=42, //!< Developer debug counters, moveable + + ePerfReadBlockError=43, //!< crc or compression error in ReadBlock (format.cc) + + ePerfIterNew=44, //!< Count of DBImpl::NewDBIterator calls + ePerfIterNext=45, //!< Count of DBIter::Next calls + ePerfIterPrev=46, //!< Count of DBIter::Prev calls + ePerfIterSeek=47, //!< Count of DBIter::Seek calls + ePerfIterSeekFirst=48, //!< Count of DBIter::SeekFirst calls + ePerfIterSeekLast=49, //!< Count of DBIter::SeekLast calls + ePerfIterDelete=50, //!< Count of DBIter::~DBIter + + ePerfElevelDirect=51, //!< eleveldb's FindWaitingThread went direct to thread + ePerfElevelQueued=52, //!< eleveldb's FindWaitingThread queued work item + ePerfElevelDequeued=53, //!< eleveldb's worker took item from backlog queue + + ePerfElevelRefCreate=54,//!< eleveldb RefObject constructed + ePerfElevelRefDelete=55,//!< eleveldb RefObject destructed + + ePerfThrottleGauge=56, //!< current throttle value + ePerfThrottleCounter=57,//!< running throttle by seconds + + ePerfThrottleMicros0=58,//!< level 0 micros spent compacting + ePerfThrottleKeys0=59, //!< level 0 keys processed + ePerfThrottleBacklog0=60,//!< backlog at time of posting (level0) + ePerfThrottleCompacts0=61,//!< number of level 0 compactions + + ePerfThrottleMicros1=62,//!< level 1+ micros spent compacting + ePerfThrottleKeys1=63, //!< level 1+ keys processed + ePerfThrottleBacklog1=64,//!< backlog at time of posting (level1+) + ePerfThrottleCompacts1=65,//!< number of level 1+ compactions + + ePerfBGWriteError=66, //!< error in write/close, see syslog + + ePerfThrottleWait=67, //!< milliseconds of throttle wait + ePerfThreadError=68, //!< system error on thread related call, no LOG access + + ePerfBGImmDirect=69, //!< count Imm compactions happened directly + ePerfBGImmQueued=70, //!< count Imm compactions placed on queue + ePerfBGImmDequeued=71, //!< count Imm compactions removed from queue + ePerfBGImmWeighted=72, //!< total microseconds item spent on queue + + ePerfBGUnmapDirect=73, //!< count Unmap operations happened directly + ePerfBGUnmapQueued=74, //!< count Unmap operations placed on queue + ePerfBGUnmapDequeued=75,//!< count Unmap operations removed from queue + ePerfBGUnmapWeighted=76,//!< total microseconds item spent on queue + + ePerfBGLevel0Direct=77, //!< count Level0 compactions happened directly + ePerfBGLevel0Queued=78, //!< count Level0 compactions placed on queue + ePerfBGLevel0Dequeued=79,//!< count Level0 compactions removed from queue + ePerfBGLevel0Weighted=80,//!< total microseconds item spent on queue + + ePerfBGCompactDirect=81, //!< count generic compactions happened directly + ePerfBGCompactQueued=82, //!< count generic compactions placed on queue + ePerfBGCompactDequeued=83,//!< count generic compactions removed from queue + ePerfBGCompactWeighted=84,//!< total microseconds item spent on queue + + ePerfFileCacheInsert=85, //!< total bytes inserted into file cache + ePerfFileCacheRemove=86, //!< total bytes removed from file cache + + ePerfBlockCacheInsert=87, //!< total bytes inserted into block cache + ePerfBlockCacheRemove=88, //!< total bytes removed from block cache + + ePerfApiDelete=89, //!< Count of DB::Delete + + // must follow last index name to represent size of array + // (ASSUMES previous enum is highest value) + ePerfCountEnumSize, //!< size of the array described by the enum values + + ePerfVersion=1, //!< structure versioning + ePerfKey=41207 //!< random number as shared memory identifier +}; + +// +// Do NOT use virtual functions. This structure will be aligned at different +// locations in multiple processes. Things can get messy with virtuals. + +struct PerformanceCounters +{ +public: + static int m_LastError; + +protected: + uint32_t m_Version; //!< object revision identification + uint32_t m_CounterSize; //!< number of objects in m_Counter + + volatile uint64_t m_Counter[ePerfCountEnumSize]; + + static const char * m_PerfCounterNames[]; + static int m_PerfSharedId; + static volatile uint64_t m_BogusCounter; //!< for out of range GetPtr calls + +public: + // only called for local object, not for shared memory + PerformanceCounters(); + + //!< does executable's idea of version match shared object? + bool VersionTest() + {return(ePerfCountEnumSize<=m_CounterSize && ePerfVersion==m_Version);}; + + //!< mostly for perf_count_test.cc + void SetVersion(uint32_t Version, uint32_t CounterSize) + {m_Version=Version; m_CounterSize=CounterSize;}; + + static PerformanceCounters * Init(bool IsReadOnly); + static int Close(PerformanceCounters * Counts); + + uint64_t Inc(unsigned Index); + uint64_t Dec(unsigned Index); + + // add value to the counter + uint64_t Add(unsigned Index, uint64_t Amount); + + // return value of a counter + uint64_t Value(unsigned Index) const; + + // set a value + void Set(unsigned Index, uint64_t); + + volatile const uint64_t * GetPtr(unsigned Index) const; + + static const char * GetNamePtr(unsigned Index); + + int LookupCounter(const char * Name); + + void Dump(); + +}; // struct PerformanceCounters + +extern PerformanceCounters * gPerfCounters; + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_ diff --git a/src/third_party/wiredtiger/api/leveldb/config.hin b/src/third_party/wiredtiger/api/leveldb/config.hin new file mode 100644 index 00000000000..131b68969d3 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/config.hin @@ -0,0 +1,22 @@ +/* api/leveldb/config.hin. Generated by autoheader, then hand-edited. */ + +/* Build the LevelDB API with Basho LevelDB support. */ +#undef HAVE_BASHOLEVELDB + +/* Snappy support automatically loaded. */ +#undef HAVE_BUILTIN_EXTENSION_SNAPPY + +/* Zlib support automatically loaded. */ +#undef HAVE_BUILTIN_EXTENSION_ZLIB + +/* Define to 1 for diagnostic tests. */ +#undef HAVE_DIAGNOSTIC + +/* Build the LevelDB API with HyperLevelDB support. */ +#undef HAVE_HYPERLEVELDB + +/* Define to 1 if you have the `snappy' library (-lsnappy). */ +#undef HAVE_LIBSNAPPY + +/* Build the LevelDB API with RocksDB support. */ +#undef HAVE_ROCKSDB diff --git a/src/third_party/wiredtiger/api/leveldb/dummy.cc b/src/third_party/wiredtiger/api/leveldb/dummy.cc new file mode 100644 index 00000000000..d56f03b544b --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/dummy.cc @@ -0,0 +1,28 @@ +/*- + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* Nothing to see, just keep build tools happy. */ diff --git a/src/third_party/wiredtiger/api/leveldb/hyper_wt.cc b/src/third_party/wiredtiger/api/leveldb/hyper_wt.cc new file mode 100644 index 00000000000..95c82289e18 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/hyper_wt.cc @@ -0,0 +1,415 @@ +/*- + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "leveldb_wt.h" +#include <errno.h> +#include <sstream> +#include <sys/param.h> +#include <sys/stat.h> + +using leveldb::ReplayIterator; +using leveldb::Status; + +// Fill in missing methods from the interface +ReplayIterator::ReplayIterator() {} +ReplayIterator::~ReplayIterator() {} + +class ReplayIteratorImpl : public ReplayIterator { + public: + ReplayIteratorImpl(OperationContext *context) : context_(context), cursor_(NULL) { + WT_SESSION *session = context_->GetSession(); + int ret = session->open_cursor( + session, "log:", NULL, NULL, &cursor_); + status_ = WiredTigerErrorToStatus(ret); + valid_ = false; + // Position on first record. valid_ will be set appropriately. + Next(); + } + + ReplayIteratorImpl(OperationContext *context, const std::string& timestamp) : + context_(context), cursor_(NULL) { + + WT_SESSION *session = context_->GetSession(); + int ret = session->open_cursor( + session, "log:", NULL, NULL, &cursor_); + status_ = WiredTigerErrorToStatus(ret); + valid_ = false; + // Position on requested record. valid_ will be set appropriately. + SeekTo(timestamp); + } + + // An iterator is either positioned at a deleted key, present key/value pair, + // or not valid. This method returns true iff the iterator is valid. + virtual bool Valid(); + + // Moves to the next entry in the source. After this call, Valid() is + // true iff the iterator was not positioned at the last entry in the source. + // REQUIRES: Valid() + virtual void Next(); + + // Position at the first key in the source that at or past target for this + // pass. Note that this is unlike the Seek call, as the ReplayIterator is + // unsorted. + // The iterator is Valid() after this call iff the source contains + // an entry that comes at or past target. + // Per Robert at Hyperdex, the SkipTo functions are hacky optimizations + // for LevelDB and its key layout. It is okay for them to be no-ops. + virtual void SkipTo(const Slice& target) { } + virtual void SkipToLast() { } + virtual void SeekTo(const std::string& timestamp); + virtual void SeekToLast(); + + // Return true if the current entry points to a key-value pair. If this + // returns false, it means the current entry is a deleted entry. + virtual bool HasValue() { + assert(Valid()); + if (optype == WT_LOGOP_ROW_PUT || + optype == WT_LOGOP_COL_PUT) + return true; + else + return false; + } + + int Compare(ReplayIteratorImpl* other) { + int cmp; + assert(Valid()); + // assert(other->Valid()); + int ret = cursor_->compare(cursor_, other->cursor_, &cmp); + status_ = WiredTigerErrorToStatus(ret); + return (cmp); + } + + // Return the key for the current entry. The underlying storage for + // the returned slice is valid only until the next modification of + // the iterator. + // REQUIRES: Valid() + virtual Slice key() const { return Slice((const char *)key_.data, key_.size); } + + // Return the value for the current entry. The underlying storage for + // the returned slice is valid only until the next modification of + // the iterator. + // REQUIRES: !AtEnd() && !AtStart() + virtual Slice value() const { return Slice((const char *)value_.data, value_.size); } + + // If an error has occurred, return it. Else return an ok status. + virtual Status status() const { return status_; } + + // must be released by giving it back to the DB + virtual ~ReplayIteratorImpl() { + int ret = Close(); + assert(ret == 0); + } + + std::string GetTimestamp() { + char lsn[256]; + assert(Valid()); + snprintf(lsn, sizeof(lsn), WT_TIMESTAMP_FORMAT, + lsn_.file, lsn_.offset); + return (std::string(lsn)); + } + + int Close() { + int ret = 0; + if (cursor_ != NULL) + ret = cursor_->close(cursor_); + status_ = WiredTigerErrorToStatus(ret); + valid_ = false; + cursor_ = NULL; + return (ret); + } + + private: + void SeekTo(WT_LSN *lsn); + // No copying allowed + ReplayIteratorImpl(const ReplayIterator&) { } + void operator=(const ReplayIterator&) { } + OperationContext *context_; + Status status_; + WT_CURSOR *cursor_; + WT_ITEM key_, value_; + WT_LSN lsn_; + bool valid_; + uint64_t txnid; + uint32_t fileid, opcount, optype, rectype; +}; + +bool +ReplayIteratorImpl::Valid() { + // If we're invalid and at the end, try again. + if (valid_ == false && cursor_ != NULL && status_.IsNotFound()) + Next(); + return valid_; +} + +void +ReplayIteratorImpl::Next() { + int ret = 0; + + if (cursor_ != NULL) { + while ((ret = cursor_->next(cursor_)) == 0) { + ret = cursor_->get_key(cursor_, + &lsn_.file, &lsn_.offset, &opcount); + if (ret != 0) + break; + ret = cursor_->get_value(cursor_, + &txnid, &rectype, &optype, &fileid, &key_, &value_); + if (ret != 0) + break; + // Next() is only interested in modification operations. + // Continue for any other type of record. + if (WT_VALID_OPERATION(fileid, optype)) { + valid_ = true; + break; + } + } + status_ = WiredTigerErrorToStatus(ret); + if (ret != 0) { + valid_ = false; + if (ret != WT_NOTFOUND) + ret = Close(); + else + ret = 0; + assert(ret == 0); + } + } +} + +void +ReplayIteratorImpl::SeekToLast() { + int ret = 0; + WT_LSN last_lsn; + + last_lsn.file = 0; + if (cursor_ != NULL) { + // Walk the log to the end, then set the cursor on the + // last valid LSN we saw. + while ((ret = cursor_->next(cursor_)) == 0) { + ret = cursor_->get_key(cursor_, + &lsn_.file, &lsn_.offset, &opcount); + if (ret != 0) + break; + ret = cursor_->get_value(cursor_, + &txnid, &rectype, &optype, &fileid, &key_, &value_); + if (ret != 0) + break; + // We're only interested in modification operations. + // Continue for any other type of record. + if (WT_VALID_OPERATION(fileid, optype)) { + valid_ = true; + last_lsn = lsn_; + } + } + // We reached the end of log + if (ret != WT_NOTFOUND || last_lsn.file == 0) { + valid_ = false; + ret = Close(); + assert(ret == 0); + } else + SeekTo(&last_lsn); + } +} + +void +ReplayIteratorImpl::SeekTo(const std::string& timestamp) { + WT_LSN target_lsn; + int ret = 0; + + if (timestamp == "all") { + if (cursor_ != NULL) { + ret = cursor_->reset(cursor_); + status_ = WiredTigerErrorToStatus(ret); + if (ret != 0) + return; + Next(); + return; + } + } + if (timestamp == "now") { + SeekToLast(); + return; + } + sscanf(timestamp.c_str(), WT_TIMESTAMP_FORMAT, + &target_lsn.file, &target_lsn.offset); + SeekTo(&target_lsn); +} + +// Set the cursor on the first modification record at or after the +// given LSN. +void +ReplayIteratorImpl::SeekTo(WT_LSN *target_lsn) { + int ret = 0; + + valid_ = false; + if (cursor_ != NULL) { + cursor_->set_key(cursor_, + target_lsn->file, target_lsn->offset, 0, 0); + ret = cursor_->search(cursor_); + status_ = WiredTigerErrorToStatus(ret); + if (ret != 0) + return; + // If we were successful, set up the info. + ret = cursor_->get_key(cursor_, + &lsn_.file, &lsn_.offset, &opcount); + status_ = WiredTigerErrorToStatus(ret); + if (ret != 0) + return; + ret = cursor_->get_value(cursor_, + &txnid, &rectype, &optype, &fileid, &key_, &value_); + status_ = WiredTigerErrorToStatus(ret); + if (ret != 0) + return; + valid_ = true; + // We're only interested in modification operations. + // Continue for any other type of record. + if (!WT_VALID_OPERATION(fileid, optype)) + Next(); + } +} + +// Create a live backup of a live LevelDB instance. +// The backup is stored in a directory named "backup-<name>" under the top +// level of the open LevelDB database. The implementation is permitted, and +// even encouraged, to improve the performance of this call through +// hard-links. +Status +DbImpl::LiveBackup(const Slice& name) +{ + OperationContext *context = GetContext(); + WT_SESSION *session = context->GetSession(); + WT_CURSOR *cursor; + int ret = session->open_cursor( + session, "backup:", NULL, NULL, &cursor); + int t_ret; + const char *filename; + const char *home = conn_->get_home(conn_); + char backup[MAXPATHLEN], buf[MAXPATHLEN * 2]; + + // If we couldn't open the backup cursor, we're done. + if (ret != 0) + return (WiredTigerErrorToStatus(ret)); + + // Remove any old directory and create the backup directory. + // WT single-threads hot backups. If we get here we already have + // the backup cursor open and we do not have to worry about other + // threads trying to remove and recreate the same directory out + // from under us. + snprintf(buf, sizeof(buf), "rm -rf %s/backup-%s", home, + (char *)name.data()); + if ((ret = system(buf)) != 0) + return WiredTigerErrorToStatus(ret); + snprintf(backup, sizeof(backup), "%s/backup-%s", home, + (char *)name.data()); + if ((ret = mkdir(backup, 0777)) != 0) + return WiredTigerErrorToStatus(ret); + // Copy all files returned by backup cursor. + while ((ret = cursor->next(cursor)) == 0 && + (ret = cursor->get_key(cursor, &filename)) == 0) { + snprintf(buf, sizeof(buf), "cp %s/%s %s/%s", + home, filename, backup, filename); + if ((ret = system(buf)) != 0) + break; + } + if (ret == WT_NOTFOUND) + ret = 0; + if ((t_ret = cursor->close(cursor)) != 0 && ret == 0) + ret = t_ret; + + return (WiredTigerErrorToStatus(ret)); +} + +// Return an opaque timestamp that identifies the current point in time of the +// database. This timestamp may be subsequently presented to the +// NewReplayIterator method to create a ReplayIterator. +void +DbImpl::GetReplayTimestamp(std::string* timestamp) +{ + OperationContext *context = GetContext(); + ReplayIteratorImpl *iter = new ReplayIteratorImpl(context); + + iter->SeekToLast(); + *timestamp = iter->GetTimestamp(); + ReleaseReplayIterator(iter); +} + +// Set the lower bound for manual garbage collection. This method only takes +// effect when Options.manual_garbage_collection is true. +void +DbImpl::AllowGarbageCollectBeforeTimestamp(const std::string& timestamp) +{ +} + +// Validate the timestamp +bool +DbImpl::ValidateTimestamp(const std::string& timestamp) +{ + bool valid; + OperationContext *context = GetContext(); + ReplayIteratorImpl *iter = new ReplayIteratorImpl(context); + + // The SeekTo function will handle "all" or "now". + iter->SeekTo(timestamp); + valid = iter->Valid(); + ReleaseReplayIterator(iter); + return valid; +} + +// Compare two timestamps and return -1, 0, 1 for lt, eq, gt +int +DbImpl::CompareTimestamps(const std::string& lhs, const std::string& rhs) +{ + OperationContext *context = GetContext(); + ReplayIteratorImpl *lhiter = new ReplayIteratorImpl(context); + ReplayIteratorImpl *rhiter = new ReplayIteratorImpl(context); + int cmp = 0; + + // The SeekTo function will handle "all" or "now". + lhiter->SeekTo(lhs); + rhiter->SeekTo(rhs); + if (lhiter->Valid() && rhiter->Valid()) + cmp = lhiter->Compare(rhiter); + ReleaseReplayIterator(lhiter); + ReleaseReplayIterator(rhiter); + return cmp; +} + +// Return a ReplayIterator that returns every write operation performed after +// the timestamp. +Status +DbImpl::GetReplayIterator(const std::string& timestamp, + ReplayIterator** iter) +{ + OperationContext *context = GetContext(); + *iter = new ReplayIteratorImpl(context, timestamp); + return ((*iter)->status()); +} + +// Release a previously allocated replay iterator. +void +DbImpl::ReleaseReplayIterator(ReplayIterator* iter) +{ + delete static_cast<ReplayIteratorImpl *>(iter); +} diff --git a/src/third_party/wiredtiger/api/leveldb/hyperleveldb/AUTHORS b/src/third_party/wiredtiger/api/leveldb/hyperleveldb/AUTHORS new file mode 100644 index 00000000000..bf024aba6a8 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/hyperleveldb/AUTHORS @@ -0,0 +1,15 @@ +# Names should be added to this file like so: +# Name or Organization <email address> + +Google Inc. + +# Initial version authors: +Jeffrey Dean <jeff@google.com> +Sanjay Ghemawat <sanjay@google.com> + +# Partial list of contributors: +Kevin Regan <kevin.d.regan@gmail.com> +Johan Bilien <jobi@litl.com> + +# HyperLevelDB authors: +Robert Escriva <robert@hyperdex.org> diff --git a/src/third_party/wiredtiger/api/leveldb/hyperleveldb/LICENSE b/src/third_party/wiredtiger/api/leveldb/hyperleveldb/LICENSE new file mode 100644 index 00000000000..262b0af095d --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/hyperleveldb/LICENSE @@ -0,0 +1,28 @@ +Copyright (c) 2011 The LevelDB Authors. All rights reserved. +Copyright (c) 2013-2014 The HyperLevelDB Authors. All rights reserved. (HyperLevelDB changes) + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/src/third_party/wiredtiger/api/leveldb/hyperleveldb/replay_iterator.h b/src/third_party/wiredtiger/api/leveldb/hyperleveldb/replay_iterator.h new file mode 100644 index 00000000000..397acdfd889 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/hyperleveldb/replay_iterator.h @@ -0,0 +1,67 @@ +// Copyright (c) 2013 The HyperLevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_INCLUDE_REPLAY_ITERATOR_H_ +#define STORAGE_LEVELDB_INCLUDE_REPLAY_ITERATOR_H_ + +#include "leveldb_wt_config.h" + +#include "slice.h" +#include "status.h" + +namespace leveldb { + +class ReplayIterator { + public: + ReplayIterator(); + + // An iterator is either positioned at a deleted key, present key/value pair, + // or not valid. This method returns true iff the iterator is valid. + virtual bool Valid() = 0; + + // Moves to the next entry in the source. After this call, Valid() is + // true iff the iterator was not positioned at the last entry in the source. + // REQUIRES: Valid() + virtual void Next() = 0; + + // Position at the first key in the source that at or past target for this + // pass. Note that this is unlike the Seek call, as the ReplayIterator is + // unsorted. + // The iterator is Valid() after this call iff the source contains + // an entry that comes at or past target. + virtual void SkipTo(const Slice& target) = 0; + virtual void SkipToLast() = 0; + + // Return true if the current entry points to a key-value pair. If this + // returns false, it means the current entry is a deleted entry. + virtual bool HasValue() = 0; + + // Return the key for the current entry. The underlying storage for + // the returned slice is valid only until the next modification of + // the iterator. + // REQUIRES: Valid() + virtual Slice key() const = 0; + + // Return the value for the current entry. The underlying storage for + // the returned slice is valid only until the next modification of + // the iterator. + // REQUIRES: !AtEnd() && !AtStart() + virtual Slice value() const = 0; + + // If an error has occurred, return it. Else return an ok status. + virtual Status status() const = 0; + + protected: + // must be released by giving it back to the DB + virtual ~ReplayIterator(); + + private: + // No copying allowed + ReplayIterator(const ReplayIterator&); + void operator=(const ReplayIterator&); +}; + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_INCLUDE_REPLAY_ITERATOR_H_ diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/AUTHORS b/src/third_party/wiredtiger/api/leveldb/leveldb/AUTHORS new file mode 100644 index 00000000000..27a9407e52f --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/AUTHORS @@ -0,0 +1,8 @@ +# Names should be added to this file like so: +# Name or Organization <email address> + +Google Inc. + +# Initial version authors: +Jeffrey Dean <jeff@google.com> +Sanjay Ghemawat <sanjay@google.com> diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/LICENSE b/src/third_party/wiredtiger/api/leveldb/leveldb/LICENSE new file mode 100644 index 00000000000..8e80208cd72 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2011 The LevelDB Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/db/dbformat.h b/src/third_party/wiredtiger/api/leveldb/leveldb/db/dbformat.h new file mode 100644 index 00000000000..2c8a9d5f5a7 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/db/dbformat.h @@ -0,0 +1,233 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_FORMAT_H_ +#define STORAGE_LEVELDB_DB_FORMAT_H_ + +#include <stdio.h> +#include "leveldb_wt.h" +#include "util/coding.h" +#include "util/logging.h" + +namespace leveldb { + +// Grouping of constants. We may want to make some of these +// parameters set via options. +namespace config { +static const int kNumLevels = 7; + +// Level-0 compaction is started when we hit this many files. +static const int kL0_CompactionTrigger = 4; + +// Soft limit on number of level-0 files. We slow down writes at this point. +static const int kL0_SlowdownWritesTrigger = 8; + +// Maximum number of level-0 files. We stop writes at this point. +static const int kL0_StopWritesTrigger = 12; + +// Maximum level to which a new compacted memtable is pushed if it +// does not create overlap. We try to push to level 2 to avoid the +// relatively expensive level 0=>1 compactions and to avoid some +// expensive manifest file operations. We do not push all the way to +// the largest level since that can generate a lot of wasted disk +// space if the same key space is being repeatedly overwritten. +static const int kMaxMemCompactLevel = 2; + +} // namespace config + +class InternalKey; + +// Value types encoded as the last component of internal keys. +// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk +// data structures. +enum ValueType { + kTypeDeletion = 0x0, + kTypeValue = 0x1 +#ifdef HAVE_ROCKSDB + ,kTypeMerge = 0x2, + // Following types are used only in write ahead logs. They are not used in + // memtables or sst files: + kTypeLogData = 0x3, + kTypeColumnFamilyDeletion = 0x4, + kTypeColumnFamilyValue = 0x5, + kTypeColumnFamilyMerge = 0x6, + kMaxValue = 0x7F +#endif +}; +// kValueTypeForSeek defines the ValueType that should be passed when +// constructing a ParsedInternalKey object for seeking to a particular +// sequence number (since we sort sequence numbers in decreasing order +// and the value type is embedded as the low 8 bits in the sequence +// number in internal keys, we need to use the highest-numbered +// ValueType, not the lowest). +static const ValueType kValueTypeForSeek = kTypeValue; + +typedef uint64_t SequenceNumber; + +// We leave eight bits empty at the bottom so a type and sequence# +// can be packed together into 64-bits. +static const SequenceNumber kMaxSequenceNumber = + ((0x1ull << 56) - 1); + +struct ParsedInternalKey { + Slice user_key; + SequenceNumber sequence; + ValueType type; + + ParsedInternalKey() { } // Intentionally left uninitialized (for speed) + ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t) + : user_key(u), sequence(seq), type(t) { } + std::string DebugString() const; +}; + +// Return the length of the encoding of "key". +inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) { + return key.user_key.size() + 8; +} + +// Append the serialization of "key" to *result. +extern void AppendInternalKey(std::string* result, + const ParsedInternalKey& key); + +// Attempt to parse an internal key from "internal_key". On success, +// stores the parsed data in "*result", and returns true. +// +// On error, returns false, leaves "*result" in an undefined state. +extern bool ParseInternalKey(const Slice& internal_key, + ParsedInternalKey* result); + +// Returns the user key portion of an internal key. +inline Slice ExtractUserKey(const Slice& internal_key) { + assert(internal_key.size() >= 8); + return Slice(internal_key.data(), internal_key.size() - 8); +} + +inline ValueType ExtractValueType(const Slice& internal_key) { + assert(internal_key.size() >= 8); + const size_t n = internal_key.size(); + uint64_t num = DecodeFixed64(internal_key.data() + n - 8); + unsigned char c = num & 0xff; + return static_cast<ValueType>(c); +} + +// A comparator for internal keys that uses a specified comparator for +// the user key portion and breaks ties by decreasing sequence number. +class InternalKeyComparator : public Comparator { + private: + const Comparator* user_comparator_; + public: + explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) { } + virtual const char* Name() const; + virtual int Compare(const Slice& a, const Slice& b) const; + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const; + virtual void FindShortSuccessor(std::string* key) const; + + const Comparator* user_comparator() const { return user_comparator_; } + + int Compare(const InternalKey& a, const InternalKey& b) const; +}; + +// Filter policy wrapper that converts from internal keys to user keys +class InternalFilterPolicy : public FilterPolicy { + private: + const FilterPolicy* const user_policy_; + public: + explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { } + virtual const char* Name() const; + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const; + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const; +}; + +// Modules in this directory should keep internal keys wrapped inside +// the following class instead of plain strings so that we do not +// incorrectly use string comparisons instead of an InternalKeyComparator. +class InternalKey { + private: + std::string rep_; + public: + InternalKey() { } // Leave rep_ as empty to indicate it is invalid + InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) { + AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t)); + } + + void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); } + Slice Encode() const { + assert(!rep_.empty()); + return rep_; + } + + Slice user_key() const { return ExtractUserKey(rep_); } + + void SetFrom(const ParsedInternalKey& p) { + rep_.clear(); + AppendInternalKey(&rep_, p); + } + + void Clear() { rep_.clear(); } + + std::string DebugString() const; +}; + +inline int InternalKeyComparator::Compare( + const InternalKey& a, const InternalKey& b) const { + return Compare(a.Encode(), b.Encode()); +} + +inline bool ParseInternalKey(const Slice& internal_key, + ParsedInternalKey* result) { + const size_t n = internal_key.size(); + if (n < 8) return false; + uint64_t num = DecodeFixed64(internal_key.data() + n - 8); + unsigned char c = num & 0xff; + result->sequence = num >> 8; + result->type = static_cast<ValueType>(c); + result->user_key = Slice(internal_key.data(), n - 8); + return (c <= static_cast<unsigned char>(kTypeValue)); +} + +// A helper class useful for DBImpl::Get() +class LookupKey { + public: + // Initialize *this for looking up user_key at a snapshot with + // the specified sequence number. + LookupKey(const Slice& user_key, SequenceNumber sequence); + + ~LookupKey(); + + // Return a key suitable for lookup in a MemTable. + Slice memtable_key() const { return Slice(start_, end_ - start_); } + + // Return an internal key (suitable for passing to an internal iterator) + Slice internal_key() const { return Slice(kstart_, end_ - kstart_); } + + // Return the user key + Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); } + + private: + // We construct a char array of the form: + // klength varint32 <-- start_ + // userkey char[klength] <-- kstart_ + // tag uint64 + // <-- end_ + // The array is a suitable MemTable key. + // The suffix starting with "userkey" can be used as an InternalKey. + const char* start_; + const char* kstart_; + const char* end_; + char space_[200]; // Avoid allocation for short keys + + // No copying allowed + LookupKey(const LookupKey&); + void operator=(const LookupKey&); +}; + +inline LookupKey::~LookupKey() { + if (start_ != space_) delete[] start_; +} + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_DB_FORMAT_H_ diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/db/skiplist.h b/src/third_party/wiredtiger/api/leveldb/leveldb/db/skiplist.h new file mode 100644 index 00000000000..af85be6d016 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/db/skiplist.h @@ -0,0 +1,379 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Thread safety +// ------------- +// +// Writes require external synchronization, most likely a mutex. +// Reads require a guarantee that the SkipList will not be destroyed +// while the read is in progress. Apart from that, reads progress +// without any internal locking or synchronization. +// +// Invariants: +// +// (1) Allocated nodes are never deleted until the SkipList is +// destroyed. This is trivially guaranteed by the code since we +// never delete any skip list nodes. +// +// (2) The contents of a Node except for the next/prev pointers are +// immutable after the Node has been linked into the SkipList. +// Only Insert() modifies the list, and it is careful to initialize +// a node and use release-stores to publish the nodes in one or +// more lists. +// +// ... prev vs. next pointer ordering ... + +#include <assert.h> +#include <stdlib.h> +#include "port/port.h" +#include "util/arena.h" +#include "util/random.h" + +namespace leveldb { + +class Arena; + +template<typename Key, class Comparator> +class SkipList { + private: + struct Node; + + public: + // Create a new SkipList object that will use "cmp" for comparing keys, + // and will allocate memory using "*arena". Objects allocated in the arena + // must remain allocated for the lifetime of the skiplist object. + explicit SkipList(Comparator cmp, Arena* arena); + + // Insert key into the list. + // REQUIRES: nothing that compares equal to key is currently in the list. + void Insert(const Key& key); + + // Returns true iff an entry that compares equal to key is in the list. + bool Contains(const Key& key) const; + + // Iteration over the contents of a skip list + class Iterator { + public: + // Initialize an iterator over the specified list. + // The returned iterator is not valid. + explicit Iterator(const SkipList* list); + + // Returns true iff the iterator is positioned at a valid node. + bool Valid() const; + + // Returns the key at the current position. + // REQUIRES: Valid() + const Key& key() const; + + // Advances to the next position. + // REQUIRES: Valid() + void Next(); + + // Advances to the previous position. + // REQUIRES: Valid() + void Prev(); + + // Advance to the first entry with a key >= target + void Seek(const Key& target); + + // Position at the first entry in list. + // Final state of iterator is Valid() iff list is not empty. + void SeekToFirst(); + + // Position at the last entry in list. + // Final state of iterator is Valid() iff list is not empty. + void SeekToLast(); + + private: + const SkipList* list_; + Node* node_; + // Intentionally copyable + }; + + private: + enum { kMaxHeight = 12 }; + + // Immutable after construction + Comparator const compare_; + Arena* const arena_; // Arena used for allocations of nodes + + Node* const head_; + + // Modified only by Insert(). Read racily by readers, but stale + // values are ok. + port::AtomicPointer max_height_; // Height of the entire list + + inline int GetMaxHeight() const { + return static_cast<int>( + reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load())); + } + + // Read/written only by Insert(). + Random rnd_; + + Node* NewNode(const Key& key, int height); + int RandomHeight(); + bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); } + + // Return true if key is greater than the data stored in "n" + bool KeyIsAfterNode(const Key& key, Node* n) const; + + // Return the earliest node that comes at or after key. + // Return NULL if there is no such node. + // + // If prev is non-NULL, fills prev[level] with pointer to previous + // node at "level" for every level in [0..max_height_-1]. + Node* FindGreaterOrEqual(const Key& key, Node** prev) const; + + // Return the latest node with a key < key. + // Return head_ if there is no such node. + Node* FindLessThan(const Key& key) const; + + // Return the last node in the list. + // Return head_ if list is empty. + Node* FindLast() const; + + // No copying allowed + SkipList(const SkipList&); + void operator=(const SkipList&); +}; + +// Implementation details follow +template<typename Key, class Comparator> +struct SkipList<Key,Comparator>::Node { + explicit Node(const Key& k) : key(k) { } + + Key const key; + + // Accessors/mutators for links. Wrapped in methods so we can + // add the appropriate barriers as necessary. + Node* Next(int n) { + assert(n >= 0); + // Use an 'acquire load' so that we observe a fully initialized + // version of the returned Node. + return reinterpret_cast<Node*>(next_[n].Acquire_Load()); + } + void SetNext(int n, Node* x) { + assert(n >= 0); + // Use a 'release store' so that anybody who reads through this + // pointer observes a fully initialized version of the inserted node. + next_[n].Release_Store(x); + } + + // No-barrier variants that can be safely used in a few locations. + Node* NoBarrier_Next(int n) { + assert(n >= 0); + return reinterpret_cast<Node*>(next_[n].NoBarrier_Load()); + } + void NoBarrier_SetNext(int n, Node* x) { + assert(n >= 0); + next_[n].NoBarrier_Store(x); + } + + private: + // Array of length equal to the node height. next_[0] is lowest level link. + port::AtomicPointer next_[1]; +}; + +template<typename Key, class Comparator> +typename SkipList<Key,Comparator>::Node* +SkipList<Key,Comparator>::NewNode(const Key& key, int height) { + char* mem = arena_->AllocateAligned( + sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1)); + return new (mem) Node(key); +} + +template<typename Key, class Comparator> +inline SkipList<Key,Comparator>::Iterator::Iterator(const SkipList* list) { + list_ = list; + node_ = NULL; +} + +template<typename Key, class Comparator> +inline bool SkipList<Key,Comparator>::Iterator::Valid() const { + return node_ != NULL; +} + +template<typename Key, class Comparator> +inline const Key& SkipList<Key,Comparator>::Iterator::key() const { + assert(Valid()); + return node_->key; +} + +template<typename Key, class Comparator> +inline void SkipList<Key,Comparator>::Iterator::Next() { + assert(Valid()); + node_ = node_->Next(0); +} + +template<typename Key, class Comparator> +inline void SkipList<Key,Comparator>::Iterator::Prev() { + // Instead of using explicit "prev" links, we just search for the + // last node that falls before key. + assert(Valid()); + node_ = list_->FindLessThan(node_->key); + if (node_ == list_->head_) { + node_ = NULL; + } +} + +template<typename Key, class Comparator> +inline void SkipList<Key,Comparator>::Iterator::Seek(const Key& target) { + node_ = list_->FindGreaterOrEqual(target, NULL); +} + +template<typename Key, class Comparator> +inline void SkipList<Key,Comparator>::Iterator::SeekToFirst() { + node_ = list_->head_->Next(0); +} + +template<typename Key, class Comparator> +inline void SkipList<Key,Comparator>::Iterator::SeekToLast() { + node_ = list_->FindLast(); + if (node_ == list_->head_) { + node_ = NULL; + } +} + +template<typename Key, class Comparator> +int SkipList<Key,Comparator>::RandomHeight() { + // Increase height with probability 1 in kBranching + static const unsigned int kBranching = 4; + int height = 1; + while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) { + height++; + } + assert(height > 0); + assert(height <= kMaxHeight); + return height; +} + +template<typename Key, class Comparator> +bool SkipList<Key,Comparator>::KeyIsAfterNode(const Key& key, Node* n) const { + // NULL n is considered infinite + return (n != NULL) && (compare_(n->key, key) < 0); +} + +template<typename Key, class Comparator> +typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindGreaterOrEqual(const Key& key, Node** prev) + const { + Node* x = head_; + int level = GetMaxHeight() - 1; + while (true) { + Node* next = x->Next(level); + if (KeyIsAfterNode(key, next)) { + // Keep searching in this list + x = next; + } else { + if (prev != NULL) prev[level] = x; + if (level == 0) { + return next; + } else { + // Switch to next list + level--; + } + } + } +} + +template<typename Key, class Comparator> +typename SkipList<Key,Comparator>::Node* +SkipList<Key,Comparator>::FindLessThan(const Key& key) const { + Node* x = head_; + int level = GetMaxHeight() - 1; + while (true) { + assert(x == head_ || compare_(x->key, key) < 0); + Node* next = x->Next(level); + if (next == NULL || compare_(next->key, key) >= 0) { + if (level == 0) { + return x; + } else { + // Switch to next list + level--; + } + } else { + x = next; + } + } +} + +template<typename Key, class Comparator> +typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindLast() + const { + Node* x = head_; + int level = GetMaxHeight() - 1; + while (true) { + Node* next = x->Next(level); + if (next == NULL) { + if (level == 0) { + return x; + } else { + // Switch to next list + level--; + } + } else { + x = next; + } + } +} + +template<typename Key, class Comparator> +SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena) + : compare_(cmp), + arena_(arena), + head_(NewNode(0 /* any key will do */, kMaxHeight)), + max_height_(reinterpret_cast<void*>(1)), + rnd_(0xdeadbeef) { + for (int i = 0; i < kMaxHeight; i++) { + head_->SetNext(i, NULL); + } +} + +template<typename Key, class Comparator> +void SkipList<Key,Comparator>::Insert(const Key& key) { + // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual() + // here since Insert() is externally synchronized. + Node* prev[kMaxHeight]; + Node* x = FindGreaterOrEqual(key, prev); + + // Our data structure does not allow duplicate insertion + assert(x == NULL || !Equal(key, x->key)); + + int height = RandomHeight(); + if (height > GetMaxHeight()) { + for (int i = GetMaxHeight(); i < height; i++) { + prev[i] = head_; + } + //fprintf(stderr, "Change height from %d to %d\n", max_height_, height); + + // It is ok to mutate max_height_ without any synchronization + // with concurrent readers. A concurrent reader that observes + // the new value of max_height_ will see either the old value of + // new level pointers from head_ (NULL), or a new value set in + // the loop below. In the former case the reader will + // immediately drop to the next level since NULL sorts after all + // keys. In the latter case the reader will use the new node. + max_height_.NoBarrier_Store(reinterpret_cast<void*>(height)); + } + + x = NewNode(key, height); + for (int i = 0; i < height; i++) { + // NoBarrier_SetNext() suffices since we will add a barrier when + // we publish a pointer to "x" in prev[i]. + x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i)); + prev[i]->SetNext(i, x); + } +} + +template<typename Key, class Comparator> +bool SkipList<Key,Comparator>::Contains(const Key& key) const { + Node* x = FindGreaterOrEqual(key, NULL); + if (x != NULL && Equal(key, x->key)) { + return true; + } else { + return false; + } +} + +} // namespace leveldb diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/db/write_batch.cc b/src/third_party/wiredtiger/api/leveldb/leveldb/db/write_batch.cc new file mode 100644 index 00000000000..0a11cb10f33 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/db/write_batch.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// WriteBatch::rep_ := +// sequence: fixed64 +// count: fixed32 +// data: record[count] +// record := +// kTypeValue varstring varstring | +// kTypeDeletion varstring +// varstring := +// len: varint32 +// data: uint8[len] + +#include "leveldb_wt.h" + +#include "db/write_batch_internal.h" + +namespace leveldb { + +// WriteBatch header has an 8-byte sequence number followed by a 4-byte count. +static const size_t kHeader = 12; + +WriteBatch::WriteBatch() { + Clear(); +} + +WriteBatch::~WriteBatch() { } + +WriteBatch::Handler::~Handler() { } + +void WriteBatch::Clear() { + rep_.clear(); + rep_.resize(kHeader); +} + +Status WriteBatch::Iterate(Handler* handler) const { + Slice input(rep_); + if (input.size() < kHeader) { + return Status::Corruption("malformed WriteBatch (too small)"); + } + + input.remove_prefix(kHeader); + Slice key, value; + int found = 0; + while (!input.empty()) { + found++; + char tag = input[0]; + input.remove_prefix(1); + switch (tag) { + case kTypeValue: + if (GetLengthPrefixedSlice(&input, &key) && + GetLengthPrefixedSlice(&input, &value)) { + handler->Put(key, value); + } else { + return Status::Corruption("bad WriteBatch Put"); + } + break; + case kTypeDeletion: + if (GetLengthPrefixedSlice(&input, &key)) { + handler->Delete(key); + } else { + return Status::Corruption("bad WriteBatch Delete"); + } + break; + default: + return Status::Corruption("unknown WriteBatch tag"); + } + } + if (found != WriteBatchInternal::Count(this)) { + return Status::Corruption("WriteBatch has wrong count"); + } else { + return Status::OK(); + } +} + +int WriteBatchInternal::Count(const WriteBatch* b) { + return DecodeFixed32(b->rep_.data() + 8); +} + +void WriteBatchInternal::SetCount(WriteBatch* b, int n) { + EncodeFixed32(&b->rep_[8], n); +} + +void WriteBatch::Put(const Slice& key, const Slice& value) { + WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); + rep_.push_back(static_cast<char>(kTypeValue)); + PutLengthPrefixedSlice(&rep_, key); + PutLengthPrefixedSlice(&rep_, value); +} + +void WriteBatch::Delete(const Slice& key) { + WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); + rep_.push_back(static_cast<char>(kTypeDeletion)); + PutLengthPrefixedSlice(&rep_, key); +} + +void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { + assert(contents.size() >= kHeader); + b->rep_.assign(contents.data(), contents.size()); +} + +void WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src) { + SetCount(dst, Count(dst) + Count(src)); + assert(src->rep_.size() >= kHeader); + dst->rep_.append(src->rep_.data() + kHeader, src->rep_.size() - kHeader); +} + +} // namespace leveldb diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/db/write_batch_internal.h b/src/third_party/wiredtiger/api/leveldb/leveldb/db/write_batch_internal.h new file mode 100644 index 00000000000..c8421cce124 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/db/write_batch_internal.h @@ -0,0 +1,53 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ +#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ + +#include "leveldb_wt.h" +#include "db/dbformat.h" + +namespace leveldb { + +// WriteBatchInternal provides static methods for manipulating a +// WriteBatch that we don't want in the public WriteBatch interface. +class WriteBatchInternal { + public: +#ifdef HAVE_ROCKSDB + // WriteBatch methods with column_family_id instead of ColumnFamilyHandle* + static void Put(WriteBatch* batch, uint32_t column_family_id, + const Slice& key, const Slice& value); + + static void Put(WriteBatch* batch, uint32_t column_family_id, + const SliceParts& key, const SliceParts& value); + + static void Delete(WriteBatch* batch, uint32_t column_family_id, + const Slice& key); + + static void Merge(WriteBatch* batch, uint32_t column_family_id, + const Slice& key, const Slice& value); +#endif + // Return the number of entries in the batch. + static int Count(const WriteBatch* batch); + + // Set the count for the number of entries in the batch. + static void SetCount(WriteBatch* batch, int n); + + static Slice Contents(const WriteBatch* batch) { + return Slice(batch->rep_); + } + + static size_t ByteSize(const WriteBatch* batch) { + return batch->rep_.size(); + } + + static void SetContents(WriteBatch* batch, const Slice& contents); + + static void Append(WriteBatch* dst, const WriteBatch* src); +}; + +} // namespace leveldb + + +#endif // STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/cache.h b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/cache.h new file mode 100644 index 00000000000..94be8e919a8 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/cache.h @@ -0,0 +1,110 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Cache is an interface that maps keys to values. It has internal +// synchronization and may be safely accessed concurrently from +// multiple threads. It may automatically evict entries to make room +// for new entries. Values have a specified charge against the cache +// capacity. For example, a cache where the values are variable +// length strings, may use the length of the string as the charge for +// the string. +// +// A builtin cache implementation with a least-recently-used eviction +// policy is provided. Clients may use their own implementations if +// they want something more sophisticated (like scan-resistance, a +// custom eviction policy, variable cache sizing, etc.) + +#ifndef STORAGE_LEVELDB_INCLUDE_CACHE_H_ +#define STORAGE_LEVELDB_INCLUDE_CACHE_H_ + +#include "leveldb_wt_config.h" +#if defined(HAVE_ROCKSDB) && !defined(leveldb) +#define leveldb rocksdb +#endif + +#include <memory> +#include <stdint.h> +#include "slice.h" + +namespace leveldb { + +class Cache; + +// Create a new cache with a fixed size capacity. This implementation +// of Cache uses a least-recently-used eviction policy. +extern Cache* NewLRUCache(size_t capacity); +#ifdef HAVE_ROCKSDB +extern Cache* NewLRUCache(size_t capacity, int numSharedBits); +extern Cache* NewLRUCache(size_t capacity, int numSharedBits, + int removeScanCountLimit); +#endif + +class Cache { + public: + Cache() { } + + // Destroys all existing entries by calling the "deleter" + // function that was passed to the constructor. + virtual ~Cache(); + + // Opaque handle to an entry stored in the cache. + struct Handle { }; + + // Insert a mapping from key->value into the cache and assign it + // the specified charge against the total cache capacity. + // + // Returns a handle that corresponds to the mapping. The caller + // must call this->Release(handle) when the returned mapping is no + // longer needed. + // + // When the inserted entry is no longer needed, the key and + // value will be passed to "deleter". + virtual Handle* Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value)) = 0; + + // If the cache has no mapping for "key", returns NULL. + // + // Else return a handle that corresponds to the mapping. The caller + // must call this->Release(handle) when the returned mapping is no + // longer needed. + virtual Handle* Lookup(const Slice& key) = 0; + + // Release a mapping returned by a previous Lookup(). + // REQUIRES: handle must not have been released yet. + // REQUIRES: handle must have been returned by a method on *this. + virtual void Release(Handle* handle) = 0; + + // Return the value encapsulated in a handle returned by a + // successful Lookup(). + // REQUIRES: handle must not have been released yet. + // REQUIRES: handle must have been returned by a method on *this. + virtual void* Value(Handle* handle) = 0; + + // If the cache contains entry for key, erase it. Note that the + // underlying entry will be kept around until all existing handles + // to it have been released. + virtual void Erase(const Slice& key) = 0; + + // Return a new numeric id. May be used by multiple clients who are + // sharing the same cache to partition the key space. Typically the + // client will allocate a new id at startup and prepend the id to + // its cache keys. + virtual uint64_t NewId() = 0; + + private: + void LRU_Remove(Handle* e); + void LRU_Append(Handle* e); + void Unref(Handle* e); + + struct Rep; + Rep* rep_; + + // No copying allowed + Cache(const Cache&); + void operator=(const Cache&); +}; + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_UTIL_CACHE_H_ diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/comparator.h b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/comparator.h new file mode 100644 index 00000000000..78d83a4d08e --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/comparator.h @@ -0,0 +1,74 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ +#define STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ + +#include "leveldb_wt_config.h" +#if defined(HAVE_ROCKSDB) && !defined(leveldb) +#define leveldb rocksdb +#endif + +#include <stdint.h> +#include <string> + +namespace leveldb { + +class Slice; + +// A Comparator object provides a total order across slices that are +// used as keys in an sstable or a database. A Comparator implementation +// must be thread-safe since leveldb may invoke its methods concurrently +// from multiple threads. +class Comparator { + public: + virtual ~Comparator(); + + // Three-way comparison. Returns value: + // < 0 iff "a" < "b", + // == 0 iff "a" == "b", + // > 0 iff "a" > "b" + virtual int Compare(const Slice& a, const Slice& b) const = 0; + + // The name of the comparator. Used to check for comparator + // mismatches (i.e., a DB created with one comparator is + // accessed using a different comparator. + // + // The client of this package should switch to a new name whenever + // the comparator implementation changes in a way that will cause + // the relative ordering of any two keys to change. + // + // Names starting with "leveldb." are reserved and should not be used + // by any clients of this package. + virtual const char* Name() const = 0; + + // Advanced functions: these are used to reduce the space requirements + // for internal data structures like index blocks. + + // If *start < limit, changes *start to a short string in [start,limit). + // Simple comparator implementations may return with *start unchanged, + // i.e., an implementation of this method that does nothing is correct. + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const = 0; + + // Changes *key to a short string >= *key. + // Simple comparator implementations may return with *key unchanged, + // i.e., an implementation of this method that does nothing is correct. + virtual void FindShortSuccessor(std::string* key) const = 0; + +#ifdef HAVE_HYPERLEVELDB + // If unsure, return 0; + virtual uint64_t KeyNum(const Slice& key) const; +#endif +}; + +// Return a builtin comparator that uses lexicographic byte-wise +// ordering. The result remains the property of this module and +// must not be deleted. +extern const Comparator* BytewiseComparator(); + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/db.h b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/db.h new file mode 100644 index 00000000000..df8fcbbe9f8 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/db.h @@ -0,0 +1,350 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_INCLUDE_DB_H_ +#define STORAGE_LEVELDB_INCLUDE_DB_H_ + +#include "leveldb_wt_config.h" +#if defined(HAVE_ROCKSDB) && !defined(leveldb) +#define leveldb rocksdb +#endif + +#include <memory> +#include <stdint.h> +#include <stdio.h> +#include <vector> +#include "iterator.h" +#include "options.h" +#include "write_batch.h" +#ifdef HAVE_HYPERLEVELDB +#include "replay_iterator.h" +#endif + +namespace leveldb { + +// Update Makefile if you change these +static const int kMajorVersion = 1; +static const int kMinorVersion = 17; + +struct ReadOptions; +struct WriteOptions; +class WriteBatch; + +#ifdef HAVE_ROCKSDB +struct FlushOptions; +class ColumnFamilyHandle { + public: + virtual ~ColumnFamilyHandle() {} +}; +extern const std::string kDefaultColumnFamilyName; + +struct ColumnFamilyDescriptor { + std::string name; + ColumnFamilyOptions options; + ColumnFamilyDescriptor() + : name(kDefaultColumnFamilyName), options(ColumnFamilyOptions()) {} + ColumnFamilyDescriptor(const std::string& _name, + const ColumnFamilyOptions& _options) + : name(_name), options(_options) {} +}; +#endif + +// Abstract handle to particular state of a DB. +// A Snapshot is an immutable object and can therefore be safely +// accessed from multiple threads without any external synchronization. +class Snapshot { + protected: + virtual ~Snapshot(); +}; + +// A range of keys +struct Range { + Slice start; // Included in the range + Slice limit; // Not included in the range + + Range() { } + Range(const Slice& s, const Slice& l) : start(s), limit(l) { } +}; + +#if HAVE_BASHOLEVELDB +// Abstract holder for a DB value. +// This allows callers to manage their own value buffers and have +// DB values copied directly into those buffers. +class Value { + public: + virtual Value& assign(const char* data, size_t size) = 0; + + protected: + virtual ~Value(); +}; +#endif + +// A DB is a persistent ordered map from keys to values. +// A DB is safe for concurrent access from multiple threads without +// any external synchronization. +class DB { + public: + // Open the database with the specified "name". + // Stores a pointer to a heap-allocated database in *dbptr and returns + // OK on success. + // Stores NULL in *dbptr and returns a non-OK status on error. + // Caller should delete *dbptr when it is no longer needed. + static Status Open(const Options& options, + const std::string& name, + DB** dbptr); + +#ifdef HAVE_ROCKSDB + // Open DB with column families. + // db_options specify database specific options + // column_families is the vector of all column families in the databse, + // containing column family name and options. You need to open ALL column + // families in the database. To get the list of column families, you can use + // ListColumnFamilies(). Also, you can open only a subset of column families + // for read-only access. + // The default column family name is 'default' and it's stored + // in rocksdb::kDefaultColumnFamilyName. + // If everything is OK, handles will on return be the same size + // as column_families --- handles[i] will be a handle that you + // will use to operate on column family column_family[i] + static Status Open(const Options& db_options, const std::string& name, + const std::vector<ColumnFamilyDescriptor>& column_families, + std::vector<ColumnFamilyHandle*>* handles, DB** dbptr); + + // ListColumnFamilies will open the DB specified by argument name + // and return the list of all column families in that DB + // through column_families argument. The ordering of + // column families in column_families is unspecified. + static Status ListColumnFamilies(const Options& db_options, + const std::string& name, + std::vector<std::string>* column_families); + + // Create a column_family and return the handle of column family + // through the argument handle. + virtual Status CreateColumnFamily(const Options& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle) = 0; + + // Drop a column family specified by column_family handle. This call + // only records a drop record in the manifest and prevents the column + // family from flushing and compacting. + virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) = 0; + + // Set the database entry for "key" to "value". + // Returns OK on success, and a non-OK status on error. + // Note: consider setting options.sync = true. + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) = 0; + + // Remove the database entry (if any) for "key". Returns OK on + // success, and a non-OK status on error. It is not an error if "key" + // did not exist in the database. + // Note: consider setting options.sync = true. + virtual Status Delete(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key) = 0; + + // Merge the database entry for "key" with "value". Returns OK on success, + // and a non-OK status on error. The semantics of this operation is + // determined by the user provided merge_operator when opening DB. + // Note: consider setting options.sync = true. + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) = 0; + + // May return some other Status on an error. + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) = 0; + + // If keys[i] does not exist in the database, then the i'th returned + // status will be one for which Status::IsNotFound() is true, and + // (*values)[i] will be set to some arbitrary value (often ""). Otherwise, + // the i'th returned status will have Status::ok() true, and (*values)[i] + // will store the value associated with keys[i]. + // + // (*values) will always be resized to be the same size as (keys). + // Similarly, the number of returned statuses will be the number of keys. + // Note: keys will not be "de-duplicated". Duplicate keys will return + // duplicate values in order. + virtual std::vector<Status> MultiGet( + const ReadOptions& options, + const std::vector<ColumnFamilyHandle*>& column_family, + const std::vector<Slice>& keys, std::vector<std::string>* values) = 0; + + // If the key definitely does not exist in the database, then this method + // returns false, else true. If the caller wants to obtain value when the key + // is found in memory, a bool for 'value_found' must be passed. 'value_found' + // will be true on return if value has been set properly. + // This check is potentially lighter-weight than invoking DB::Get(). One way + // to make this lighter weight is to avoid doing any IOs. + // Default implementation here returns true and sets 'value_found' to false + virtual bool KeyMayExist(const ReadOptions&, + ColumnFamilyHandle*, const Slice&, + std::string*, bool* value_found = NULL) { + if (value_found != NULL) { + *value_found = false; + } + return true; + } + + virtual Iterator* NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) = 0; + + virtual bool GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value) = 0; + + // Flush all mem-table data. + virtual Status Flush(const FlushOptions& options, + ColumnFamilyHandle* column_family) = 0; +#endif + + DB() { } + virtual ~DB(); + + // Set the database entry for "key" to "value". Returns OK on success, + // and a non-OK status on error. + // Note: consider setting options.sync = true. + virtual Status Put(const WriteOptions& options, + const Slice& key, + const Slice& value) = 0; + + // Remove the database entry (if any) for "key". Returns OK on + // success, and a non-OK status on error. It is not an error if "key" + // did not exist in the database. + // Note: consider setting options.sync = true. + virtual Status Delete(const WriteOptions& options, const Slice& key) = 0; + + // Apply the specified updates to the database. + // Returns OK on success, non-OK on failure. + // Note: consider setting options.sync = true. + virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0; + + // If the database contains an entry for "key" store the + // corresponding value in *value and return OK. + // + // If there is no entry for "key" leave *value unchanged and return + // a status for which Status::IsNotFound() returns true. + // + // May return some other Status on an error. + virtual Status Get(const ReadOptions& options, + const Slice& key, std::string* value) = 0; +#if HAVE_BASHOLEVELDB + virtual Status Get(const ReadOptions& options, + const Slice& key, Value* value) = 0; +#endif + + // Return a heap-allocated iterator over the contents of the database. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + // + // Caller should delete the iterator when it is no longer needed. + // The returned iterator should be deleted before this db is deleted. + virtual Iterator* NewIterator(const ReadOptions& options) = 0; + + // Return a handle to the current DB state. Iterators created with + // this handle will all observe a stable snapshot of the current DB + // state. The caller must call ReleaseSnapshot(result) when the + // snapshot is no longer needed. + virtual const Snapshot* GetSnapshot() = 0; + + // Release a previously acquired snapshot. The caller must not + // use "snapshot" after this call. + virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0; + + // DB implementations can export properties about their state + // via this method. If "property" is a valid property understood by this + // DB implementation, fills "*value" with its current value and returns + // true. Otherwise returns false. + // + // + // Valid property names include: + // + // "leveldb.num-files-at-level<N>" - return the number of files at level <N>, + // where <N> is an ASCII representation of a level number (e.g. "0"). + // "leveldb.stats" - returns a multi-line string that describes statistics + // about the internal operation of the DB. + // "leveldb.sstables" - returns a multi-line string that describes all + // of the sstables that make up the db contents. + virtual bool GetProperty(const Slice& property, std::string* value) = 0; + + // For each i in [0,n-1], store in "sizes[i]", the approximate + // file system space used by keys in "[range[i].start .. range[i].limit)". + // + // Note that the returned sizes measure file system space usage, so + // if the user data compresses by a factor of ten, the returned + // sizes will be one-tenth the size of the corresponding user data size. + // + // The results may not include the sizes of recently written data. + virtual void GetApproximateSizes(const Range* range, int n, + uint64_t* sizes) = 0; + + // Compact the underlying storage for the key range [*begin,*end]. + // In particular, deleted and overwritten versions are discarded, + // and the data is rearranged to reduce the cost of operations + // needed to access the data. This operation should typically only + // be invoked by users who understand the underlying implementation. + // + // begin==NULL is treated as a key before all keys in the database. + // end==NULL is treated as a key after all keys in the database. + // Therefore the following call will compact the entire database: + // db->CompactRange(NULL, NULL); + virtual void CompactRange(const Slice* begin, const Slice* end) = 0; + + // Suspends the background compaction thread. This methods + // returns once suspended. + virtual void SuspendCompactions() = 0; + // Resumes a suspended background compation thread. + virtual void ResumeCompactions() = 0; + +#ifdef HAVE_HYPERLEVELDB + // Create a live backup of a live LevelDB instance. + // The backup is stored in a directory named "backup-<name>" under the top + // level of the open LevelDB database. The implementation is permitted, and + // even encouraged, to improve the performance of this call through + // hard-links. + virtual Status LiveBackup(const Slice& name) = 0; + + // Return an opaque timestamp that identifies the current point in time of the + // database. This timestamp may be subsequently presented to the + // NewReplayIterator method to create a ReplayIterator. + virtual void GetReplayTimestamp(std::string* timestamp) = 0; + + // Set the lower bound for manual garbage collection. This method only takes + // effect when Options.manual_garbage_collection is true. + virtual void AllowGarbageCollectBeforeTimestamp(const std::string& timestamp) = 0; + + // Validate the timestamp + virtual bool ValidateTimestamp(const std::string& timestamp) = 0; + + // Compare two timestamps and return -1, 0, 1 for lt, eq, gt + virtual int CompareTimestamps(const std::string& lhs, const std::string& rhs) = 0; + + // Return a ReplayIterator that returns every write operation performed after + // the timestamp. + virtual Status GetReplayIterator(const std::string& timestamp, + ReplayIterator** iter) = 0; + + // Release a previously allocated replay iterator. + virtual void ReleaseReplayIterator(ReplayIterator* iter) = 0; +#endif + private: + // No copying allowed + DB(const DB&); + void operator=(const DB&); +}; + +// Destroy the contents of the specified database. +// Be very careful using this method. +Status DestroyDB(const std::string& name, const Options& options); + +// If a DB cannot be opened, you may attempt to call this method to +// resurrect as much of the contents of the database as possible. +// Some data may be lost, so be careful when calling this function +// on a database that contains important information. +Status RepairDB(const std::string& dbname, const Options& options); + +}; // namespace leveldb + +#endif // STORAGE_LEVELDB_INCLUDE_DB_H_ diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/env.h b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/env.h new file mode 100644 index 00000000000..4ad67d36fea --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/env.h @@ -0,0 +1,349 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// An Env is an interface used by the leveldb implementation to access +// operating system functionality like the filesystem etc. Callers +// may wish to provide a custom Env object when opening a database to +// get fine gain control; e.g., to rate limit file system operations. +// +// All Env implementations are safe for concurrent access from +// multiple threads without any external synchronization. + +#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_ +#define STORAGE_LEVELDB_INCLUDE_ENV_H_ + +#include "leveldb_wt_config.h" +#if defined(HAVE_ROCKSDB) && !defined(leveldb) +#define leveldb rocksdb +#endif + +#include <string> +#include <vector> +#include <stdarg.h> +#include <stdint.h> +#if HAVE_BASHOLEVELDB +#include "perf_count.h" +#endif +#include "status.h" + +namespace leveldb { + +class FileLock; +class Logger; +class RandomAccessFile; +class SequentialFile; +class Slice; +class WritableFile; + +class Env { + public: + Env() { } + virtual ~Env(); + + // Return a default environment suitable for the current operating + // system. Sophisticated users may wish to provide their own Env + // implementation instead of relying on this default environment. + // + // The result of Default() belongs to leveldb and must never be deleted. + static Env* Default(); + + // Create a brand new sequentially-readable file with the specified name. + // On success, stores a pointer to the new file in *result and returns OK. + // On failure stores NULL in *result and returns non-OK. If the file does + // not exist, returns a non-OK status. + // + // The returned file will only be accessed by one thread at a time. + virtual Status NewSequentialFile(const std::string& fname, + SequentialFile** result) = 0; + + // Create a brand new random access read-only file with the + // specified name. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores NULL in *result and + // returns non-OK. If the file does not exist, returns a non-OK + // status. + // + // The returned file may be concurrently accessed by multiple threads. + virtual Status NewRandomAccessFile(const std::string& fname, + RandomAccessFile** result) = 0; + + // Create an object that writes to a new file with the specified + // name. Deletes any existing file with the same name and creates a + // new file. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores NULL in *result and + // returns non-OK. + // + // The returned file will only be accessed by one thread at a time. + virtual Status NewWritableFile(const std::string& fname, + WritableFile** result) = 0; + + // Returns true iff the named file exists. + virtual bool FileExists(const std::string& fname) = 0; + + // Store in *result the names of the children of the specified directory. + // The names are relative to "dir". + // Original contents of *results are dropped. + virtual Status GetChildren(const std::string& dir, + std::vector<std::string>* result) = 0; + + // Delete the named file. + virtual Status DeleteFile(const std::string& fname) = 0; + + // Create the specified directory. + virtual Status CreateDir(const std::string& dirname) = 0; + + // Delete the specified directory. + virtual Status DeleteDir(const std::string& dirname) = 0; + + // Store the size of fname in *file_size. + virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0; + + // Rename file src to target. + virtual Status RenameFile(const std::string& src, + const std::string& target) = 0; + + // Lock the specified file. Used to prevent concurrent access to + // the same db by multiple processes. On failure, stores NULL in + // *lock and returns non-OK. + // + // On success, stores a pointer to the object that represents the + // acquired lock in *lock and returns OK. The caller should call + // UnlockFile(*lock) to release the lock. If the process exits, + // the lock will be automatically released. + // + // If somebody else already holds the lock, finishes immediately + // with a failure. I.e., this call does not wait for existing locks + // to go away. + // + // May create the named file if it does not already exist. + virtual Status LockFile(const std::string& fname, FileLock** lock) = 0; + + // Release the lock acquired by a previous successful call to LockFile. + // REQUIRES: lock was returned by a successful LockFile() call + // REQUIRES: lock has not already been unlocked. + virtual Status UnlockFile(FileLock* lock) = 0; + + // Arrange to run "(*function)(arg)" once in a background thread. + // + // "function" may run in an unspecified thread. Multiple functions + // added to the same Env may run concurrently in different threads. + // I.e., the caller may not assume that background work items are + // serialized. + virtual void Schedule( + void (*function)(void* arg), + void* arg) = 0; + + // Start a new thread, invoking "function(arg)" within the new thread. + // When "function(arg)" returns, the thread will be destroyed. + virtual void StartThread(void (*function)(void* arg), void* arg) = 0; + + // *path is set to a temporary directory that can be used for testing. It may + // or many not have just been created. The directory may or may not differ + // between runs of the same process, but subsequent calls will return the + // same directory. + virtual Status GetTestDirectory(std::string* path) = 0; + + // Create and return a log file for storing informational messages. + virtual Status NewLogger(const std::string& fname, Logger** result) = 0; + + // Returns the number of micro-seconds since some fixed point in time. Only + // useful for computing deltas of time. + virtual uint64_t NowMicros() = 0; + + // Sleep/delay the thread for the perscribed number of micro-seconds. + virtual void SleepForMicroseconds(int micros) = 0; + +#if HAVE_BASHOLEVELDB + // Riak specific: Where supported, give count of background jobs pending. + virtual int GetBackgroundBacklog() const {return(0);}; + + // Riak specific: Get object that is tracking various software counters + virtual PerformanceCounters * GetPerformanceCounters() {return(gPerfCounters);} +#endif + + private: + // No copying allowed + Env(const Env&); + void operator=(const Env&); +}; + +// A file abstraction for reading sequentially through a file +class SequentialFile { + public: + SequentialFile() { } + virtual ~SequentialFile(); + + // Read up to "n" bytes from the file. "scratch[0..n-1]" may be + // written by this routine. Sets "*result" to the data that was + // read (including if fewer than "n" bytes were successfully read). + // May set "*result" to point at data in "scratch[0..n-1]", so + // "scratch[0..n-1]" must be live when "*result" is used. + // If an error was encountered, returns a non-OK status. + // + // REQUIRES: External synchronization + virtual Status Read(size_t n, Slice* result, char* scratch) = 0; + + // Skip "n" bytes from the file. This is guaranteed to be no + // slower that reading the same data, but may be faster. + // + // If end of file is reached, skipping will stop at the end of the + // file, and Skip will return OK. + // + // REQUIRES: External synchronization + virtual Status Skip(uint64_t n) = 0; + + private: + // No copying allowed + SequentialFile(const SequentialFile&); + void operator=(const SequentialFile&); +}; + +// A file abstraction for randomly reading the contents of a file. +class RandomAccessFile { + public: + RandomAccessFile() { } + virtual ~RandomAccessFile(); + + // Read up to "n" bytes from the file starting at "offset". + // "scratch[0..n-1]" may be written by this routine. Sets "*result" + // to the data that was read (including if fewer than "n" bytes were + // successfully read). May set "*result" to point at data in + // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when + // "*result" is used. If an error was encountered, returns a non-OK + // status. + // + // Safe for concurrent use by multiple threads. + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const = 0; + + private: + // No copying allowed + RandomAccessFile(const RandomAccessFile&); + void operator=(const RandomAccessFile&); +}; + +// A file abstraction for sequential writing. The implementation +// must provide buffering since callers may append small fragments +// at a time to the file. +class WritableFile { + public: + WritableFile() { } + virtual ~WritableFile(); + + virtual Status Append(const Slice& data) = 0; + virtual Status Close() = 0; + virtual Status Flush() = 0; + virtual Status Sync() = 0; + + private: + // No copying allowed + WritableFile(const WritableFile&); + void operator=(const WritableFile&); +}; + +// An interface for writing log messages. +class Logger { + public: + Logger() { } + virtual ~Logger(); + + // Write an entry to the log file with the specified format. + virtual void Logv(const char* format, va_list ap) = 0; + + private: + // No copying allowed + Logger(const Logger&); + void operator=(const Logger&); +}; + + +// Identifies a locked file. +class FileLock { + public: + FileLock() { } + virtual ~FileLock(); + private: + // No copying allowed + FileLock(const FileLock&); + void operator=(const FileLock&); +}; + +// Log the specified data to *info_log if info_log is non-NULL. +extern void Log(Logger* info_log, const char* format, ...) +# if defined(__GNUC__) || defined(__clang__) + __attribute__((__format__ (__printf__, 2, 3))) +# endif + ; + +// A utility routine: write "data" to the named file. +extern Status WriteStringToFile(Env* env, const Slice& data, + const std::string& fname); + +// A utility routine: read contents of named file into *data +extern Status ReadFileToString(Env* env, const std::string& fname, + std::string* data); + +// An implementation of Env that forwards all calls to another Env. +// May be useful to clients who wish to override just part of the +// functionality of another Env. +class EnvWrapper : public Env { + public: + // Initialize an EnvWrapper that delegates all calls to *t + explicit EnvWrapper(Env* t) : target_(t) { } + virtual ~EnvWrapper(); + + // Return the target to which this Env forwards all calls + Env* target() const { return target_; } + + // The following text is boilerplate that forwards all methods to target() + Status NewSequentialFile(const std::string& f, SequentialFile** r) { + return target_->NewSequentialFile(f, r); + } + Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) { + return target_->NewRandomAccessFile(f, r); + } + Status NewWritableFile(const std::string& f, WritableFile** r) { + return target_->NewWritableFile(f, r); + } + bool FileExists(const std::string& f) { return target_->FileExists(f); } + Status GetChildren(const std::string& dir, std::vector<std::string>* r) { + return target_->GetChildren(dir, r); + } + Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); } + Status CreateDir(const std::string& d) { return target_->CreateDir(d); } + Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); } + Status GetFileSize(const std::string& f, uint64_t* s) { + return target_->GetFileSize(f, s); + } + Status RenameFile(const std::string& s, const std::string& t) { + return target_->RenameFile(s, t); + } + Status LockFile(const std::string& f, FileLock** l) { + return target_->LockFile(f, l); + } + Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); } + void Schedule(void (*f)(void*), void* a) { + return target_->Schedule(f, a); + } + void StartThread(void (*f)(void*), void* a) { + return target_->StartThread(f, a); + } + virtual Status GetTestDirectory(std::string* path) { + return target_->GetTestDirectory(path); + } + virtual Status NewLogger(const std::string& fname, Logger** result) { + return target_->NewLogger(fname, result); + } + uint64_t NowMicros() { + return target_->NowMicros(); + } + void SleepForMicroseconds(int micros) { + target_->SleepForMicroseconds(micros); + } + private: + Env* target_; +}; + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_INCLUDE_ENV_H_ diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/filter_policy.h b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/filter_policy.h new file mode 100644 index 00000000000..e434ef4b241 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/filter_policy.h @@ -0,0 +1,78 @@ +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A database can be configured with a custom FilterPolicy object. +// This object is responsible for creating a small filter from a set +// of keys. These filters are stored in leveldb and are consulted +// automatically by leveldb to decide whether or not to read some +// information from disk. In many cases, a filter can cut down the +// number of disk seeks form a handful to a single disk seek per +// DB::Get() call. +// +// Most people will want to use the builtin bloom filter support (see +// NewBloomFilterPolicy() below). + +#ifndef STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_ +#define STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_ + +#include "leveldb_wt_config.h" +#if defined(HAVE_ROCKSDB) && !defined(leveldb) +#define leveldb rocksdb +#endif + +#include <string> + +namespace leveldb { + +class Slice; + +class FilterPolicy { + public: + virtual ~FilterPolicy(); + + // Return the name of this policy. Note that if the filter encoding + // changes in an incompatible way, the name returned by this method + // must be changed. Otherwise, old incompatible filters may be + // passed to methods of this type. + virtual const char* Name() const = 0; + + // keys[0,n-1] contains a list of keys (potentially with duplicates) + // that are ordered according to the user supplied comparator. + // Append a filter that summarizes keys[0,n-1] to *dst. + // + // Warning: do not change the initial contents of *dst. Instead, + // append the newly constructed filter to *dst. + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) + const = 0; + + // "filter" contains the data appended by a preceding call to + // CreateFilter() on this class. This method must return true if + // the key was in the list of keys passed to CreateFilter(). + // This method may return true or false if the key was not on the + // list, but it should aim to return false with a high probability. + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0; +}; + +// Return a new filter policy that uses a bloom filter with approximately +// the specified number of bits per key. A good value for bits_per_key +// is 10, which yields a filter with ~ 1% false positive rate. +// +// Callers must delete the result after any database that is using the +// result has been closed. +// +// Note: if you are using a custom comparator that ignores some parts +// of the keys being compared, you must not use NewBloomFilterPolicy() +// and must provide your own FilterPolicy that also ignores the +// corresponding parts of the keys. For example, if the comparator +// ignores trailing spaces, it would be incorrect to use a +// FilterPolicy (like NewBloomFilterPolicy) that does not ignore +// trailing spaces in keys. +extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key); +#if HAVE_BASHOLEVELDB +extern const FilterPolicy* NewBloomFilterPolicy2(int bits_per_key); +#endif + +} + +#endif // STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_ diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/iterator.h b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/iterator.h new file mode 100644 index 00000000000..2d97d180b17 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/iterator.h @@ -0,0 +1,105 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// An iterator yields a sequence of key/value pairs from a source. +// The following class defines the interface. Multiple implementations +// are provided by this library. In particular, iterators are provided +// to access the contents of a Table or a DB. +// +// Multiple threads can invoke const methods on an Iterator without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Iterator must use +// external synchronization. + +#ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ +#define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ + +#include "leveldb_wt_config.h" +#if defined(HAVE_ROCKSDB) && !defined(leveldb) +#define leveldb rocksdb +#endif + +#include "slice.h" +#include "status.h" + +namespace leveldb { + +class Iterator { + public: + Iterator(); + virtual ~Iterator(); + + // An iterator is either positioned at a key/value pair, or + // not valid. This method returns true iff the iterator is valid. + virtual bool Valid() const = 0; + + // Position at the first key in the source. The iterator is Valid() + // after this call iff the source is not empty. + virtual void SeekToFirst() = 0; + + // Position at the last key in the source. The iterator is + // Valid() after this call iff the source is not empty. + virtual void SeekToLast() = 0; + + // Position at the first key in the source that at or past target + // The iterator is Valid() after this call iff the source contains + // an entry that comes at or past target. + virtual void Seek(const Slice& target) = 0; + + // Moves to the next entry in the source. After this call, Valid() is + // true iff the iterator was not positioned at the last entry in the source. + // REQUIRES: Valid() + virtual void Next() = 0; + + // Moves to the previous entry in the source. After this call, Valid() is + // true iff the iterator was not positioned at the first entry in source. + // REQUIRES: Valid() + virtual void Prev() = 0; + + // Return the key for the current entry. The underlying storage for + // the returned slice is valid only until the next modification of + // the iterator. + // REQUIRES: Valid() + virtual Slice key() const = 0; + + // Return the value for the current entry. The underlying storage for + // the returned slice is valid only until the next modification of + // the iterator. + // REQUIRES: !AtEnd() && !AtStart() + virtual Slice value() const = 0; + + // If an error has occurred, return it. Else return an ok status. + virtual Status status() const = 0; + + // Clients are allowed to register function/arg1/arg2 triples that + // will be invoked when this iterator is destroyed. + // + // Note that unlike all of the preceding methods, this method is + // not abstract and therefore clients should not override it. + typedef void (*CleanupFunction)(void* arg1, void* arg2); + void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2); + + private: + struct Cleanup { + CleanupFunction function; + void* arg1; + void* arg2; + Cleanup* next; + }; + Cleanup cleanup_; + + // No copying allowed + Iterator(const Iterator&); + void operator=(const Iterator&); +}; + +// Return an empty iterator (yields nothing). +extern Iterator* NewEmptyIterator(); + +// Return an empty iterator with the specified status. +extern Iterator* NewErrorIterator(const Status& status); + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/options.h b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/options.h new file mode 100644 index 00000000000..9dcf73fc2a0 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/options.h @@ -0,0 +1,258 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ +#define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ + +#include "leveldb_wt_config.h" +#if defined(HAVE_ROCKSDB) && !defined(leveldb) +#define leveldb rocksdb +#endif + +#include <memory> +#include <stddef.h> + +namespace leveldb { + +class Cache; +class Comparator; +class Env; +class FilterPolicy; +class Logger; +class Snapshot; + +// DB contents are stored in a set of blocks, each of which holds a +// sequence of key,value pairs. Each block may be compressed before +// being stored in a file. The following enum describes which +// compression method (if any) is used to compress a block. +enum CompressionType { + // NOTE: do not change the values of existing entries, as these are + // part of the persistent format on disk. + kNoCompression = 0x0, + kSnappyCompression = 0x1 +#ifdef HAVE_ROCKSDB + , kZlibCompression = 0x2 +#endif +}; + +// Options to control the behavior of a database (passed to DB::Open) +struct Options { + // ------------------- + // Parameters that affect behavior + + // Comparator used to define the order of keys in the table. + // Default: a comparator that uses lexicographic byte-wise ordering + // + // REQUIRES: The client must ensure that the comparator supplied + // here has the same name and orders keys *exactly* the same as the + // comparator provided to previous open calls on the same DB. + const Comparator* comparator; + + // If true, the database will be created if it is missing. + // Default: false + bool create_if_missing; + + // If true, an error is raised if the database already exists. + // Default: false + bool error_if_exists; + + // If true, the implementation will do aggressive checking of the + // data it is processing and will stop early if it detects any + // errors. This may have unforeseen ramifications: for example, a + // corruption of one DB entry may cause a large number of entries to + // become unreadable or for the entire DB to become unopenable. + // Default: false + bool paranoid_checks; + +#ifdef HAVE_ROCKSDB + // By default, RocksDB uses only one background thread for flush and + // compaction. Calling this function will set it up such that total of + // `total_threads` is used. Good value for `total_threads` is the number of + // cores. You almost definitely want to call this function if your system is + // bottlenecked by RocksDB. + Options* IncreaseParallelism(int = 16) { return this; } + Options* OptimizeLevelStyleCompaction() { return this; } +#endif + +#if HAVE_BASHOLEVELDB + // Riak specific: this variable replaces paranoid_checks at one + // one place in the code. This variable alone controls whether or not + // compaction read operations check CRC values. Riak needs + // the compaction CRC check, but not other paranoid_checks ... so + // this independent control. + // Default: true + bool verify_compactions; +#endif + + // Use the specified object to interact with the environment, + // e.g. to read/write files, schedule background work, etc. + // Default: Env::Default() + Env* env; + + // Any internal progress/error information generated by the db will + // be written to info_log if it is non-NULL, or to a file stored + // in the same directory as the DB contents if info_log is NULL. + // Default: NULL + Logger* info_log; + + // ------------------- + // Parameters that affect performance + + // Amount of data to build up in memory (backed by an unsorted log + // on disk) before converting to a sorted on-disk file. + // + // Larger values increase performance, especially during bulk loads. + // Up to two write buffers may be held in memory at the same time, + // so you may wish to adjust this parameter to control memory usage. + // Also, a larger write buffer will result in a longer recovery time + // the next time the database is opened. + // + // Default: 4MB + size_t write_buffer_size; + + // Number of open files that can be used by the DB. You may need to + // increase this if your database has a large working set (budget + // one open file per 2MB of working set). + // + // Default: 1000 + int max_open_files; + + // Control over blocks (user data is stored in a set of blocks, and + // a block is the unit of reading from disk). + + // If non-NULL, use the specified cache for blocks. + // If NULL, leveldb will automatically create and use an 8MB internal cache. + // Default: NULL + Cache* block_cache; + + // Approximate size of user data packed per block. Note that the + // block size specified here corresponds to uncompressed data. The + // actual size of the unit read from disk may be smaller if + // compression is enabled. This parameter can be changed dynamically. + // + // Default: 4K + size_t block_size; + + // Number of keys between restart points for delta encoding of keys. + // This parameter can be changed dynamically. Most clients should + // leave this parameter alone. + // + // Default: 16 + int block_restart_interval; + + // Compress blocks using the specified compression algorithm. This + // parameter can be changed dynamically. + // + // Default: kSnappyCompression, which gives lightweight but fast + // compression. + // + // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz: + // ~200-500MB/s compression + // ~400-800MB/s decompression + // Note that these speeds are significantly faster than most + // persistent storage speeds, and therefore it is typically never + // worth switching to kNoCompression. Even if the input data is + // incompressible, the kSnappyCompression implementation will + // efficiently detect that and will switch to uncompressed mode. + CompressionType compression; + + // If non-NULL, use the specified filter policy to reduce disk reads. + // Many applications will benefit from passing the result of + // NewBloomFilterPolicy() here. + // + // Default: NULL + const FilterPolicy* filter_policy; + +#ifdef HAVE_HYPERLEVELDB + // Is the database used with the Replay mechanism? If yes, the lower bound on + // values to compact is (somewhat) left up to the application; if no, then + // LevelDB functions as usual, and uses snapshots to determine the lower + // bound. HyperLevelDB will always maintain the integrity of snapshots, so + // the application merely has the option to hold data as if it's holding a + // snapshot. This just prevents compaction from grabbing data before the app + // can get a snapshot. + // + // Default: false/no. + bool manual_garbage_collection; +#endif + + // Create an Options object with default values for all fields. + Options(); +}; + +#ifdef HAVE_ROCKSDB +struct ColumnFamilyOptions : public Options { + ColumnFamilyOptions() : Options() {} +}; + +struct DBOptions : public Options { + DBOptions() : Options() {} +}; +#endif + +// Options that control read operations +struct ReadOptions { + // If true, all data read from underlying storage will be + // verified against corresponding checksums. + // Default: false + bool verify_checksums; + + // Should the data read for this iteration be cached in memory? + // Callers may wish to set this field to false for bulk scans. + // Default: true + bool fill_cache; + + // If "snapshot" is non-NULL, read as of the supplied snapshot + // (which must belong to the DB that is being read and which must + // not have been released). If "snapshot" is NULL, use an impliicit + // snapshot of the state at the beginning of this read operation. + // Default: NULL + const Snapshot* snapshot; + + ReadOptions() + : verify_checksums(false), + fill_cache(true), + snapshot(NULL) { + } +}; + +// Options that control write operations +struct WriteOptions { + // If true, the write will be flushed from the operating system + // buffer cache (by calling WritableFile::Sync()) before the write + // is considered complete. If this flag is true, writes will be + // slower. + // + // If this flag is false, and the machine crashes, some recent + // writes may be lost. Note that if it is just the process that + // crashes (i.e., the machine does not reboot), no writes will be + // lost even if sync==false. + // + // In other words, a DB write with sync==false has similar + // crash semantics as the "write()" system call. A DB write + // with sync==true has similar crash semantics to a "write()" + // system call followed by "fsync()". + // + // Default: false + bool sync; + + WriteOptions() + : sync(false) { + } +}; + +#ifdef HAVE_ROCKSDB +// Options that control flush operations +struct FlushOptions { + // If true, the flush will wait until the flush is done. + // Default: true + bool wait; + + FlushOptions() : wait(true) {} +}; +#endif + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/slice.h b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/slice.h new file mode 100644 index 00000000000..1eb66dd825f --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/slice.h @@ -0,0 +1,127 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Slice is a simple structure containing a pointer into some external +// storage and a size. The user of a Slice must ensure that the slice +// is not used after the corresponding external storage has been +// deallocated. +// +// Multiple threads can invoke const methods on a Slice without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Slice must use +// external synchronization. + +#ifndef STORAGE_LEVELDB_INCLUDE_SLICE_H_ +#define STORAGE_LEVELDB_INCLUDE_SLICE_H_ + +#include "leveldb_wt_config.h" +#if defined(HAVE_ROCKSDB) && !defined(leveldb) +#define leveldb rocksdb +#endif + +#include <assert.h> +#include <stddef.h> +#include <string.h> +#include <string> + +namespace leveldb { + +class Slice { + public: + // Create an empty slice. + Slice() : data_(""), size_(0) { } + + // Create a slice that refers to d[0,n-1]. + Slice(const char* d, size_t n) : data_(d), size_(n) { } + + // Create a slice that refers to the contents of "s" + Slice(const std::string& s) : data_(s.data()), size_(s.size()) { } + + // Create a slice that refers to s[0,strlen(s)-1] + Slice(const char* s) : data_(s), size_(strlen(s)) { } + + // Return a pointer to the beginning of the referenced data + const char* data() const { return data_; } + + // Return the length (in bytes) of the referenced data + size_t size() const { return size_; } + + // Return true iff the length of the referenced data is zero + bool empty() const { return size_ == 0; } + + // Return the ith byte in the referenced data. + // REQUIRES: n < size() + char operator[](size_t n) const { + assert(n < size()); + return data_[n]; + } + + // Change this slice to refer to an empty array + void clear() { data_ = ""; size_ = 0; } + + // Drop the first "n" bytes from this slice. + void remove_prefix(size_t n) { + assert(n <= size()); + data_ += n; + size_ -= n; + } + + // Return a string that contains the copy of the referenced data. + std::string ToString() const { return std::string(data_, size_); } + + // Three-way comparison. Returns value: + // < 0 iff "*this" < "b", + // == 0 iff "*this" == "b", + // > 0 iff "*this" > "b" + int compare(const Slice& b) const; + + // Return true iff "x" is a prefix of "*this" + bool starts_with(const Slice& x) const { + return ((size_ >= x.size_) && + (memcmp(data_, x.data_, x.size_) == 0)); + } + +// The LevelDB JNI layer peeks in here +// private: + const char* data_; + size_t size_; + + // Intentionally copyable +}; + +#ifdef HAVE_ROCKSDB +// A set of Slices that are virtually concatenated together. 'parts' points +// to an array of Slices. The number of elements in the array is 'num_parts'. +struct SliceParts { + SliceParts(const Slice* _parts, int _num_parts) : + parts(_parts), num_parts(_num_parts) { } + + const Slice* parts; + int num_parts; +}; +#endif + +inline bool operator==(const Slice& x, const Slice& y) { + return ((x.size() == y.size()) && + (memcmp(x.data(), y.data(), x.size()) == 0)); +} + +inline bool operator!=(const Slice& x, const Slice& y) { + return !(x == y); +} + +inline int Slice::compare(const Slice& b) const { + const size_t min_len = (size_ < b.size_) ? size_ : b.size_; + int r = memcmp(data_, b.data_, min_len); + if (r == 0) { + if (size_ < b.size_) r = -1; + else if (size_ > b.size_) r = +1; + } + return r; +} + +} // namespace leveldb + + +#endif // STORAGE_LEVELDB_INCLUDE_SLICE_H_ diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/status.h b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/status.h new file mode 100644 index 00000000000..3c21f64462b --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/status.h @@ -0,0 +1,111 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Status encapsulates the result of an operation. It may indicate success, +// or it may indicate an error with an associated error message. +// +// Multiple threads can invoke const methods on a Status without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Status must use +// external synchronization. + +#ifndef STORAGE_LEVELDB_INCLUDE_STATUS_H_ +#define STORAGE_LEVELDB_INCLUDE_STATUS_H_ + +#include "leveldb_wt_config.h" +#if defined(HAVE_ROCKSDB) && !defined(leveldb) +#define leveldb rocksdb +#endif + +#include <string> +#include "slice.h" + +namespace leveldb { + +class Status { + public: + // Create a success status. + Status() : state_(NULL) { } + ~Status() { delete[] state_; } + + // Copy the specified status. + Status(const Status& s); + void operator=(const Status& s); + + // Return a success status. + static Status OK() { return Status(); } + + // Return error status of an appropriate type. + static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kNotFound, msg, msg2); + } + static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kCorruption, msg, msg2); + } + static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kNotSupported, msg, msg2); + } + static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kInvalidArgument, msg, msg2); + } + static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIOError, msg, msg2); + } + + // Returns true iff the status indicates success. + bool ok() const { return (state_ == NULL); } + + // Returns true iff the status indicates a NotFound error. + bool IsNotFound() const { return code() == kNotFound; } + + // Returns true iff the status indicates a Corruption error. + bool IsCorruption() const { return code() == kCorruption; } + + // Returns true iff the status indicates an IOError. + bool IsIOError() const { return code() == kIOError; } + + // Return a string representation of this status suitable for printing. + // Returns the string "OK" for success. + std::string ToString() const; + + private: + // OK status has a NULL state_. Otherwise, state_ is a new[] array + // of the following form: + // state_[0..3] == length of message + // state_[4] == code + // state_[5..] == message + const char* state_; + + enum Code { + kOk = 0, + kNotFound = 1, + kCorruption = 2, + kNotSupported = 3, + kInvalidArgument = 4, + kIOError = 5 + }; + + Code code() const { + return (state_ == NULL) ? kOk : static_cast<Code>(state_[4]); + } + + Status(Code code, const Slice& msg, const Slice& msg2); + static const char* CopyState(const char* s); +}; + +inline Status::Status(const Status& s) { + state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_); +} +inline void Status::operator=(const Status& s) { + // The following condition catches both aliasing (when this == &s), + // and the common case where both s and *this are ok. + if (state_ != s.state_) { + delete[] state_; + state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_); + } +} + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_INCLUDE_STATUS_H_ diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/write_batch.h b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/write_batch.h new file mode 100644 index 00000000000..293b41ad818 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/write_batch.h @@ -0,0 +1,142 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// WriteBatch holds a collection of updates to apply atomically to a DB. +// +// The updates are applied in the order in which they are added +// to the WriteBatch. For example, the value of "key" will be "v3" +// after the following batch is written: +// +// batch.Put("key", "v1"); +// batch.Delete("key"); +// batch.Put("key", "v2"); +// batch.Put("key", "v3"); +// +// Multiple threads can invoke const methods on a WriteBatch without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same WriteBatch must use +// external synchronization. + +#ifndef STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ +#define STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ + +#include "leveldb_wt_config.h" +#if defined(HAVE_ROCKSDB) && !defined(leveldb) +#define leveldb rocksdb +#endif + +#include <string> +#include "status.h" + +namespace leveldb { + +class Slice; +#if HAVE_ROCKSDB +class ColumnFamilyHandle; +struct SliceParts; +#endif + +class WriteBatch { + public: +#ifdef HAVE_ROCKSDB + explicit WriteBatch(size_t reserved_bytes = 0); +#else + WriteBatch(); +#endif + ~WriteBatch(); + + // Store the mapping "key->value" in the database. + void Put(const Slice& key, const Slice& value); + + // If the database contains a mapping for "key", erase it. Else do nothing. + void Delete(const Slice& key); + + // Clear all updates buffered in this batch. + void Clear(); + +#ifdef HAVE_ROCKSDB + void Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value); + + // Variant of Put() that gathers output like writev(2). The key and value + // that will be written to the database are concatentations of arrays of + // slices. + void Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value); + + void Delete(ColumnFamilyHandle* column_family, const Slice& key); +#endif + + // Support for iterating over the contents of a batch. + class Handler { + public: + virtual ~Handler(); +#ifdef HAVE_ROCKSDB + // default implementation will just call Put without column family for + // backwards compatibility. If the column family is not default, + // the function is noop + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + if (column_family_id == 0) { + // Put() historically doesn't return status. We didn't want to be + // backwards incompatible so we didn't change the return status + // (this is a public API). We do an ordinary get and return Status::OK() + Put(key, value); + return Status::OK(); + } + return Status::InvalidArgument( + "non-default column family and PutCF not implemented"); + } + // Merge and LogData are not pure virtual. Otherwise, we would break + // existing clients of Handler on a source code level. The default + // implementation of Merge simply throws a runtime exception. + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + if (column_family_id == 0) { + Merge(key, value); + return Status::OK(); + } + return Status::InvalidArgument( + "non-default column family and MergeCF not implemented"); + } + virtual void Merge(const Slice& key, const Slice& value); + // The default implementation of LogData does nothing. + virtual void LogData(const Slice& blob); + virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) { + if (column_family_id == 0) { + Delete(key); + return Status::OK(); + } + return Status::InvalidArgument( + "non-default column family and DeleteCF not implemented"); + } + // Continue is called by WriteBatch::Iterate. If it returns false, + // iteration is halted. Otherwise, it continues iterating. The default + // implementation always returns true. + virtual bool Continue(); +#endif + virtual void Put(const Slice& key, const Slice& value) = 0; + virtual void Delete(const Slice& key) = 0; + }; + Status Iterate(Handler* handler) const; + +#ifdef HAVE_ROCKSDB + // Retrieve data size of the batch. + size_t GetDataSize() const { return rep_.size(); } + + // Returns the number of updates in the batch + int Count() const; +#endif + + private: + friend class WriteBatchInternal; + + std::string rep_; // See comment in write_batch.cc for the format of rep_ + + // Intentionally copyable +}; + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/port/port.h b/src/third_party/wiredtiger/api/leveldb/leveldb/port/port.h new file mode 100644 index 00000000000..1f83635a82c --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/port/port.h @@ -0,0 +1,38 @@ +/*- + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _PORT_H_ +#define _PORT_H_ 1 +/* Stub portability header for imported LevelDB code. */ + +#include "wiredtiger.h" + +namespace port { + const int kLittleEndian = 1; +} + +#endif diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/arena.h b/src/third_party/wiredtiger/api/leveldb/leveldb/util/arena.h new file mode 100644 index 00000000000..8f7dde226c4 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/arena.h @@ -0,0 +1,68 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_UTIL_ARENA_H_ +#define STORAGE_LEVELDB_UTIL_ARENA_H_ + +#include <cstddef> +#include <vector> +#include <assert.h> +#include <stdint.h> + +namespace leveldb { + +class Arena { + public: + Arena(); + ~Arena(); + + // Return a pointer to a newly allocated memory block of "bytes" bytes. + char* Allocate(size_t bytes); + + // Allocate memory with the normal alignment guarantees provided by malloc + char* AllocateAligned(size_t bytes); + + // Returns an estimate of the total memory usage of data allocated + // by the arena (including space allocated but not yet used for user + // allocations). + size_t MemoryUsage() const { + return blocks_memory_ + blocks_.capacity() * sizeof(char*); + } + + private: + char* AllocateFallback(size_t bytes); + char* AllocateNewBlock(size_t block_bytes); + + // Allocation state + char* alloc_ptr_; + size_t alloc_bytes_remaining_; + + // Array of new[] allocated memory blocks + std::vector<char*> blocks_; + + // Bytes of memory in blocks allocated so far + size_t blocks_memory_; + + // No copying allowed + Arena(const Arena&); + void operator=(const Arena&); +}; + +inline char* Arena::Allocate(size_t bytes) { + // The semantics of what to return are a bit messy if we allow + // 0-byte allocations, so we disallow them here (we don't need + // them for our internal use). + assert(bytes > 0); + if (bytes <= alloc_bytes_remaining_) { + char* result = alloc_ptr_; + alloc_ptr_ += bytes; + alloc_bytes_remaining_ -= bytes; + return result; + } + return AllocateFallback(bytes); +} + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_UTIL_ARENA_H_ diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/coding.cc b/src/third_party/wiredtiger/api/leveldb/leveldb/util/coding.cc new file mode 100644 index 00000000000..ad1f457a16a --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/coding.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/coding.h" + +namespace leveldb { + +char* EncodeVarint32(char* dst, uint32_t v) { + // Operate on characters as unsigneds + unsigned char* ptr = reinterpret_cast<unsigned char*>(dst); + static const int B = 128; + if (v < (1<<7)) { + *(ptr++) = v; + } else if (v < (1<<14)) { + *(ptr++) = v | B; + *(ptr++) = v>>7; + } else if (v < (1<<21)) { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = v>>14; + } else if (v < (1<<28)) { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = (v>>14) | B; + *(ptr++) = v>>21; + } else { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = (v>>14) | B; + *(ptr++) = (v>>21) | B; + *(ptr++) = v>>28; + } + return reinterpret_cast<char*>(ptr); +} + +const char* GetVarint32PtrFallback(const char* p, + const char* limit, + uint32_t* value) { + uint32_t result = 0; + for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) { + uint32_t byte = *(reinterpret_cast<const unsigned char*>(p)); + p++; + if (byte & 128) { + // More bytes are present + result |= ((byte & 127) << shift); + } else { + result |= (byte << shift); + *value = result; + return reinterpret_cast<const char*>(p); + } + } + return NULL; +} + +const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) { + uint64_t result = 0; + for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) { + uint64_t byte = *(reinterpret_cast<const unsigned char*>(p)); + p++; + if (byte & 128) { + // More bytes are present + result |= ((byte & 127) << shift); + } else { + result |= (byte << shift); + *value = result; + return reinterpret_cast<const char*>(p); + } + } + return NULL; +} + +#ifdef HAVE_ROCKSDB +void BitStreamPutInt(char* dst, size_t dstlen, size_t offset, + uint32_t bits, uint64_t value) { + assert((offset + bits + 7)/8 <= dstlen); + assert(bits <= 64); + + unsigned char* ptr = reinterpret_cast<unsigned char*>(dst); + + size_t byteOffset = offset / 8; + size_t bitOffset = offset % 8; + + // This prevents unused variable warnings when compiling. +#ifndef NDEBUG + // Store truncated value. + uint64_t origValue = (bits < 64)?(value & (((uint64_t)1 << bits) - 1)):value; + uint32_t origBits = bits; +#endif + + while (bits > 0) { + size_t bitsToGet = std::min<size_t>(bits, 8 - bitOffset); + unsigned char mask = ((1 << bitsToGet) - 1); + + ptr[byteOffset] = (ptr[byteOffset] & ~(mask << bitOffset)) + + ((value & mask) << bitOffset); + + value >>= bitsToGet; + byteOffset += 1; + bitOffset = 0; + bits -= bitsToGet; + } + + assert(origValue == BitStreamGetInt(dst, dstlen, offset, origBits)); +} + +uint64_t BitStreamGetInt(const char* src, size_t srclen, size_t offset, + uint32_t bits) { + assert((offset + bits + 7)/8 <= srclen); + assert(bits <= 64); + + const unsigned char* ptr = reinterpret_cast<const unsigned char*>(src); + + uint64_t result = 0; + + size_t byteOffset = offset / 8; + size_t bitOffset = offset % 8; + size_t shift = 0; + + while (bits > 0) { + size_t bitsToGet = std::min<size_t>(bits, 8 - bitOffset); + unsigned char mask = ((1 << bitsToGet) - 1); + + result += (uint64_t)((ptr[byteOffset] >> bitOffset) & mask) << shift; + + shift += bitsToGet; + byteOffset += 1; + bitOffset = 0; + bits -= bitsToGet; + } + + return result; + } + +void BitStreamPutInt(std::string* dst, size_t offset, uint32_t bits, + uint64_t value) { + assert((offset + bits + 7)/8 <= dst->size()); + + const size_t kTmpBufLen = sizeof(value) + 1; + char tmpBuf[kTmpBufLen]; + + // Number of bytes of tmpBuf being used + const size_t kUsedBytes = (offset%8 + bits)/8; + + // Copy relevant parts of dst to tmpBuf + for (size_t idx = 0; idx <= kUsedBytes; ++idx) { + tmpBuf[idx] = (*dst)[offset/8 + idx]; + } + + BitStreamPutInt(tmpBuf, kTmpBufLen, offset%8, bits, value); + + // Copy tmpBuf back to dst + for (size_t idx = 0; idx <= kUsedBytes; ++idx) { + (*dst)[offset/8 + idx] = tmpBuf[idx]; + + // Do the check here too as we are working with a buffer. + assert(((bits < 64)?(value & (((uint64_t)1 << bits) - 1)):value) == + BitStreamGetInt(dst, offset, bits)); + } +} +#endif + +} // namespace leveldb diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/coding.h b/src/third_party/wiredtiger/api/leveldb/leveldb/util/coding.h new file mode 100644 index 00000000000..ed56ef4ea2d --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/coding.h @@ -0,0 +1,311 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Endian-neutral encoding: +// * Fixed-length numbers are encoded with least-significant byte first +// * In addition we support variable length "varint" encoding +// * Strings are encoded prefixed by their length in varint format + +#ifndef STORAGE_LEVELDB_UTIL_CODING_H_ +#define STORAGE_LEVELDB_UTIL_CODING_H_ + +#include <algorithm> +#include <stdint.h> +#include <string.h> +#include <string> +#include "leveldb_wt.h" +#include "port/port.h" + +namespace leveldb { + +// The maximum length of a varint in bytes for 32 and 64 bits respectively. +const unsigned int kMaxVarint32Length = 5; +const unsigned int kMaxVarint64Length = 10; + +// Standard Put... routines append to a string +extern void PutFixed32(std::string* dst, uint32_t value); +extern void PutFixed64(std::string* dst, uint64_t value); +extern void PutVarint32(std::string* dst, uint32_t value); +extern void PutVarint64(std::string* dst, uint64_t value); +extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value); + +// Standard Get... routines parse a value from the beginning of a Slice +// and advance the slice past the parsed value. +extern bool GetVarint32(Slice* input, uint32_t* value); +extern bool GetVarint64(Slice* input, uint64_t* value); +extern bool GetLengthPrefixedSlice(Slice* input, Slice* result); + +#ifdef HAVE_ROCKSDB +extern void PutLengthPrefixedSliceParts(std::string* dst, + const SliceParts& slice_parts); +extern bool GetFixed64(Slice* input, uint64_t* value); +// This function assumes data is well-formed. +extern Slice GetLengthPrefixedSlice(const char* data); + +extern Slice GetSliceUntil(Slice* slice, char delimiter); +#endif + +// Pointer-based variants of GetVarint... These either store a value +// in *v and return a pointer just past the parsed value, or return +// NULL on error. These routines only look at bytes in the range +// [p..limit-1] +extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v); +extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v); + +// Returns the length of the varint32 or varint64 encoding of "v" +extern int VarintLength(uint64_t v); + +// Lower-level versions of Put... that write directly into a character buffer +// REQUIRES: dst has enough space for the value being written +extern void EncodeFixed32(char* dst, uint32_t value); +extern void EncodeFixed64(char* dst, uint64_t value); + +// Lower-level versions of Put... that write directly into a character buffer +// and return a pointer just past the last byte written. +// REQUIRES: dst has enough space for the value being written +extern char* EncodeVarint32(char* dst, uint32_t value); +extern char* EncodeVarint64(char* dst, uint64_t value); + +// Lower-level versions of Get... that read directly from a character buffer +// without any bounds checking. + +inline uint32_t DecodeFixed32(const char* ptr) { + if (port::kLittleEndian) { + // Load the raw bytes + uint32_t result; + memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + return result; + } else { + return ((static_cast<uint32_t>(static_cast<unsigned char>(ptr[0]))) + | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[1])) << 8) + | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[2])) << 16) + | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[3])) << 24)); + } +} + +inline uint64_t DecodeFixed64(const char* ptr) { + if (port::kLittleEndian) { + // Load the raw bytes + uint64_t result; + memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + return result; + } else { + uint64_t lo = DecodeFixed32(ptr); + uint64_t hi = DecodeFixed32(ptr + 4); + return (hi << 32) | lo; + } +} + +// Internal routine for use by fallback path of GetVarint32Ptr +extern const char* GetVarint32PtrFallback(const char* p, + const char* limit, + uint32_t* value); +inline const char* GetVarint32Ptr(const char* p, + const char* limit, + uint32_t* value) { + if (p < limit) { + uint32_t result = *(reinterpret_cast<const unsigned char*>(p)); + if ((result & 128) == 0) { + *value = result; + return p + 1; + } + } + return GetVarint32PtrFallback(p, limit, value); +} + +// Writes an unsigned integer with bits number of bits with its least +// significant bit at offset. +// Bits are numbered from 0 to 7 in the first byte, 8 to 15 in the second and +// so on. +// value is truncated to the bits number of least significant bits. +// REQUIRES: (offset+bits+7)/8 <= dstlen +// REQUIRES: bits <= 64 +extern void BitStreamPutInt(char* dst, size_t dstlen, size_t offset, + uint32_t bits, uint64_t value); + +// Reads an unsigned integer with bits number of bits with its least +// significant bit at offset. +// Bits are numbered in the same way as ByteStreamPutInt(). +// REQUIRES: (offset+bits+7)/8 <= srclen +// REQUIRES: bits <= 64 +extern uint64_t BitStreamGetInt(const char* src, size_t srclen, size_t offset, + uint32_t bits); + +// Convenience functions +extern void BitStreamPutInt(std::string* dst, size_t offset, uint32_t bits, + uint64_t value); +extern uint64_t BitStreamGetInt(const std::string* src, size_t offset, + uint32_t bits); +extern uint64_t BitStreamGetInt(const Slice* src, size_t offset, + uint32_t bits); + +// -- Implementation of the functions declared above +inline void EncodeFixed32(char* buf, uint32_t value) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; +#endif +} + +inline void EncodeFixed64(char* buf, uint64_t value) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; + buf[4] = (value >> 32) & 0xff; + buf[5] = (value >> 40) & 0xff; + buf[6] = (value >> 48) & 0xff; + buf[7] = (value >> 56) & 0xff; +#endif +} + +inline void PutFixed32(std::string* dst, uint32_t value) { + char buf[sizeof(value)]; + EncodeFixed32(buf, value); + dst->append(buf, sizeof(buf)); +} + +inline void PutFixed64(std::string* dst, uint64_t value) { + char buf[sizeof(value)]; + EncodeFixed64(buf, value); + dst->append(buf, sizeof(buf)); +} + +inline void PutVarint32(std::string* dst, uint32_t v) { + char buf[5]; + char* ptr = EncodeVarint32(buf, v); + dst->append(buf, ptr - buf); +} + +inline char* EncodeVarint64(char* dst, uint64_t v) { + static const unsigned int B = 128; + unsigned char* ptr = reinterpret_cast<unsigned char*>(dst); + while (v >= B) { + *(ptr++) = (v & (B - 1)) | B; + v >>= 7; + } + *(ptr++) = static_cast<unsigned char>(v); + return reinterpret_cast<char*>(ptr); +} + +inline void PutVarint64(std::string* dst, uint64_t v) { + char buf[10]; + char* ptr = EncodeVarint64(buf, v); + dst->append(buf, ptr - buf); +} + +inline void PutLengthPrefixedSlice(std::string* dst, const Slice& value) { + PutVarint32(dst, value.size()); + dst->append(value.data(), value.size()); +} + +#ifdef HAVE_ROCKSDB +inline void PutLengthPrefixedSliceParts(std::string* dst, + const SliceParts& slice_parts) { + uint32_t total_bytes = 0; + for (int i = 0; i < slice_parts.num_parts; ++i) { + total_bytes += slice_parts.parts[i].size(); + } + PutVarint32(dst, total_bytes); + for (int i = 0; i < slice_parts.num_parts; ++i) { + dst->append(slice_parts.parts[i].data(), slice_parts.parts[i].size()); + } +} +#endif + +inline int VarintLength(uint64_t v) { + int len = 1; + while (v >= 128) { + v >>= 7; + len++; + } + return len; +} + +#ifdef HAVE_ROCKSDB +inline bool GetFixed64(Slice* input, uint64_t* value) { + if (input->size() < sizeof(uint64_t)) { + return false; + } + *value = DecodeFixed64(input->data()); + input->remove_prefix(sizeof(uint64_t)); + return true; +} +#endif + +inline bool GetVarint32(Slice* input, uint32_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint32Ptr(p, limit, value); + if (q == NULL) { + return false; + } else { + *input = Slice(q, limit - q); + return true; + } +} + +inline bool GetVarint64(Slice* input, uint64_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint64Ptr(p, limit, value); + if (q == NULL) { + return false; + } else { + *input = Slice(q, limit - q); + return true; + } +} + +inline bool GetLengthPrefixedSlice(Slice* input, Slice* result) { + uint32_t len = 0; + if (GetVarint32(input, &len) && input->size() >= len) { + *result = Slice(input->data(), len); + input->remove_prefix(len); + return true; + } else { + return false; + } +} + +#ifdef HAVE_ROCKSDB +inline Slice GetLengthPrefixedSlice(const char* data) { + uint32_t len = 0; + // +5: we assume "data" is not corrupted + const char *p = GetVarint32Ptr(data, data + 5 /* limit */, &len); + return Slice(p, len); +} + +inline Slice GetSliceUntil(Slice* slice, char delimiter) { + uint32_t len = 0; + for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) { + // nothing + } + + Slice ret(slice->data(), len); + slice->remove_prefix(len + ((len < slice->size()) ? 1 : 0)); + return ret; +} +#endif + +inline uint64_t BitStreamGetInt(const std::string* src, size_t offset, + uint32_t bits) { + return BitStreamGetInt(src->data(), src->size(), offset, bits); +} + +inline uint64_t BitStreamGetInt(const Slice* src, size_t offset, + uint32_t bits) { + return BitStreamGetInt(src->data(), src->size(), offset, bits); +} + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_UTIL_CODING_H_ diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/comparator.cc b/src/third_party/wiredtiger/api/leveldb/leveldb/util/comparator.cc new file mode 100644 index 00000000000..57c89628af9 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/comparator.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include <algorithm> +#include <stdint.h> +#include "leveldb_wt.h" +#include "port/port.h" +#include "util/logging.h" + +namespace leveldb { + +Comparator::~Comparator() { } + +#ifdef HAVE_HYPERLEVELDB +uint64_t Comparator::KeyNum(const Slice& key) const { + return 0; +} +#endif + +namespace { +class BytewiseComparatorImpl : public Comparator { + public: + BytewiseComparatorImpl() { } + + virtual const char* Name() const { + return "leveldb.BytewiseComparator"; + } + + virtual int Compare(const Slice& a, const Slice& b) const { + return a.compare(b); + } + + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const { + // Find length of common prefix + size_t min_length = std::min(start->size(), limit.size()); + size_t diff_index = 0; + while ((diff_index < min_length) && + ((*start)[diff_index] == limit[diff_index])) { + diff_index++; + } + + if (diff_index >= min_length) { + // Do not shorten if one string is a prefix of the other + } else { + uint8_t diff_byte = static_cast<uint8_t>((*start)[diff_index]); + if (diff_byte < static_cast<uint8_t>(0xff) && + diff_byte + 1 < static_cast<uint8_t>(limit[diff_index])) { + (*start)[diff_index]++; + start->resize(diff_index + 1); + assert(Compare(*start, limit) < 0); + } + } + } + + virtual void FindShortSuccessor(std::string* key) const { + // Find first character that can be incremented + size_t n = key->size(); + for (size_t i = 0; i < n; i++) { + const uint8_t byte = (*key)[i]; + if (byte != static_cast<uint8_t>(0xff)) { + (*key)[i] = byte + 1; + key->resize(i+1); + return; + } + } + // *key is a run of 0xffs. Leave it alone. + } +}; +} // namespace + +static const Comparator* bytewise = new BytewiseComparatorImpl; + +const Comparator* BytewiseComparator() { + return bytewise; +} + +} // namespace leveldb diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/env.cc b/src/third_party/wiredtiger/api/leveldb/leveldb/util/env.cc new file mode 100644 index 00000000000..00a04f0dc3e --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/env.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb_wt.h" + +namespace leveldb { + +Env::~Env() { +} + +SequentialFile::~SequentialFile() { +} + +RandomAccessFile::~RandomAccessFile() { +} + +WritableFile::~WritableFile() { +} + +Logger::~Logger() { +} + +FileLock::~FileLock() { +} + +void Log(Logger* info_log, const char* format, ...) { + if (info_log != NULL) { + va_list ap; + va_start(ap, format); + info_log->Logv(format, ap); + va_end(ap); + } +} + +static Status DoWriteStringToFile(Env* env, const Slice& data, + const std::string& fname, + bool should_sync) { + WritableFile* file; + Status s = env->NewWritableFile(fname, &file); + if (!s.ok()) { + return s; + } + s = file->Append(data); + if (s.ok() && should_sync) { + s = file->Sync(); + } + if (s.ok()) { + s = file->Close(); + } + delete file; // Will auto-close if we did not close above + if (!s.ok()) { + env->DeleteFile(fname); + } + return s; +} + +Status WriteStringToFile(Env* env, const Slice& data, + const std::string& fname) { + return DoWriteStringToFile(env, data, fname, false); +} + +Status WriteStringToFileSync(Env* env, const Slice& data, + const std::string& fname) { + return DoWriteStringToFile(env, data, fname, true); +} + +Status ReadFileToString(Env* env, const std::string& fname, std::string* data) { + data->clear(); + SequentialFile* file; + Status s = env->NewSequentialFile(fname, &file); + if (!s.ok()) { + return s; + } + static const int kBufferSize = 8192; + char* space = new char[kBufferSize]; + while (true) { + Slice fragment; + s = file->Read(kBufferSize, &fragment, space); + if (!s.ok()) { + break; + } + data->append(fragment.data(), fragment.size()); + if (fragment.empty()) { + break; + } + } + delete[] space; + delete file; + return s; +} + +EnvWrapper::~EnvWrapper() { +} + +} // namespace leveldb diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/env_posix.cc b/src/third_party/wiredtiger/api/leveldb/leveldb/util/env_posix.cc new file mode 100644 index 00000000000..084ae160807 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/env_posix.cc @@ -0,0 +1,625 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include <deque> +#include <dirent.h> +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/types.h> +#include <time.h> +#include <unistd.h> +#if defined(LEVELDB_PLATFORM_ANDROID) +#include <sys/stat.h> +#endif +#include "leveldb_wt.h" +#include "port/port.h" +#include "util/logging.h" +#include "util/posix_logger.h" + +namespace leveldb { + +namespace { + +static Status IOError(const std::string& context, int err_number) { + return Status::IOError(context, strerror(err_number)); +} + +class PosixSequentialFile: public SequentialFile { + private: + std::string filename_; + FILE* file_; + + public: + PosixSequentialFile(const std::string& fname, FILE* f) + : filename_(fname), file_(f) { } + virtual ~PosixSequentialFile() { fclose(file_); } + + virtual Status Read(size_t n, Slice* result, char* scratch) { + Status s; +#ifdef HAVE_FREAD_UNLOCKED + size_t r = fread_unlocked(scratch, 1, n, file_); +#else + size_t r = fread(scratch, 1, n, file_); +#endif + *result = Slice(scratch, r); + if (r < n) { + if (feof(file_)) { + // We leave status as ok if we hit the end of the file + } else { + // A partial read with an error: return a non-ok status + s = IOError(filename_, errno); + } + } + return s; + } + + virtual Status Skip(uint64_t n) { + if (fseek(file_, n, SEEK_CUR)) { + return IOError(filename_, errno); + } + return Status::OK(); + } +}; + +// pread() based random-access +class PosixRandomAccessFile: public RandomAccessFile { + private: + std::string filename_; + int fd_; + + public: + PosixRandomAccessFile(const std::string& fname, int fd) + : filename_(fname), fd_(fd) { } + virtual ~PosixRandomAccessFile() { close(fd_); } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + ssize_t r = pread(fd_, scratch, n, static_cast<off_t>(offset)); + *result = Slice(scratch, (r < 0) ? 0 : r); + if (r < 0) { + // An error: return a non-ok status + s = IOError(filename_, errno); + } + return s; + } +}; + +// mmap() based random-access +class PosixMmapReadableFile: public RandomAccessFile { + private: + std::string filename_; + void* mmapped_region_; + size_t length_; + + public: + // base[0,length-1] contains the mmapped contents of the file. + PosixMmapReadableFile(const std::string& fname, void* base, size_t length) + : filename_(fname), mmapped_region_(base), length_(length) { } + virtual ~PosixMmapReadableFile() { munmap(mmapped_region_, length_); } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + if (offset + n > length_) { + *result = Slice(); + s = IOError(filename_, EINVAL); + } else { + *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n); + } + return s; + } +}; + +// We preallocate up to an extra megabyte and use memcpy to append new +// data to the file. This is safe since we either properly close the +// file before reading from it, or for log files, the reading code +// knows enough to skip zero suffixes. +class PosixMmapFile : public WritableFile { + private: + std::string filename_; + int fd_; + size_t page_size_; + size_t map_size_; // How much extra memory to map at a time + char* base_; // The mapped region + char* limit_; // Limit of the mapped region + char* dst_; // Where to write next (in range [base_,limit_]) + char* last_sync_; // Where have we synced up to + uint64_t file_offset_; // Offset of base_ in file + + // Have we done an munmap of unsynced data? + bool pending_sync_; + + // Roundup x to a multiple of y + static size_t Roundup(size_t x, size_t y) { + return ((x + y - 1) / y) * y; + } + + size_t TruncateToPageBoundary(size_t s) { + s -= (s & (page_size_ - 1)); + assert((s % page_size_) == 0); + return s; + } + + bool UnmapCurrentRegion() { + bool result = true; + if (base_ != NULL) { + if (last_sync_ < limit_) { + // Defer syncing this data until next Sync() call, if any + pending_sync_ = true; + } + if (munmap(base_, limit_ - base_) != 0) { + result = false; + } + file_offset_ += limit_ - base_; + base_ = NULL; + limit_ = NULL; + last_sync_ = NULL; + dst_ = NULL; + + // Increase the amount we map the next time, but capped at 1MB + if (map_size_ < (1<<20)) { + map_size_ *= 2; + } + } + return result; + } + + bool MapNewRegion() { + assert(base_ == NULL); + if (ftruncate(fd_, file_offset_ + map_size_) < 0) { + return false; + } + void* ptr = mmap(NULL, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, + fd_, file_offset_); + if (ptr == MAP_FAILED) { + return false; + } + base_ = reinterpret_cast<char*>(ptr); + limit_ = base_ + map_size_; + dst_ = base_; + last_sync_ = base_; + return true; + } + + public: + PosixMmapFile(const std::string& fname, int fd, size_t page_size) + : filename_(fname), + fd_(fd), + page_size_(page_size), + map_size_(Roundup(65536, page_size)), + base_(NULL), + limit_(NULL), + dst_(NULL), + last_sync_(NULL), + file_offset_(0), + pending_sync_(false) { + assert((page_size & (page_size - 1)) == 0); + } + + + ~PosixMmapFile() { + if (fd_ >= 0) { + PosixMmapFile::Close(); + } + } + + virtual Status Append(const Slice& data) { + const char* src = data.data(); + size_t left = data.size(); + while (left > 0) { + assert(base_ <= dst_); + assert(dst_ <= limit_); + size_t avail = limit_ - dst_; + if (avail == 0) { + if (!UnmapCurrentRegion() || + !MapNewRegion()) { + return IOError(filename_, errno); + } + } + + size_t n = (left <= avail) ? left : avail; + memcpy(dst_, src, n); + dst_ += n; + src += n; + left -= n; + } + return Status::OK(); + } + + virtual Status Close() { + Status s; + size_t unused = limit_ - dst_; + if (!UnmapCurrentRegion()) { + s = IOError(filename_, errno); + } else if (unused > 0) { + // Trim the extra space at the end of the file + if (ftruncate(fd_, file_offset_ - unused) < 0) { + s = IOError(filename_, errno); + } + } + + if (close(fd_) < 0) { + if (s.ok()) { + s = IOError(filename_, errno); + } + } + + fd_ = -1; + base_ = NULL; + limit_ = NULL; + return s; + } + + virtual Status Flush() { + return Status::OK(); + } + + virtual Status Sync() { + Status s; + + if (pending_sync_) { + // Some unmapped data was not synced + pending_sync_ = false; +#ifdef HAVE_FDATASYNC + if (fdatasync(fd_) < 0) { +#else + if (fsync(fd_) < 0) { +#endif + s = IOError(filename_, errno); + } + } + + if (dst_ > last_sync_) { + // Find the beginnings of the pages that contain the first and last + // bytes to be synced. + size_t p1 = TruncateToPageBoundary(last_sync_ - base_); + size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); + last_sync_ = dst_; + if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { + s = IOError(filename_, errno); + } + } + + return s; + } + +#ifdef HAVE_HYPERLEVELDB + virtual Status WriteAt(uint64_t offset, const Slice& data) { return Status::NotSupported("sorry!"); } +#endif +}; + +static int LockOrUnlock(int fd, bool lock) { + errno = 0; + struct flock f; + memset(&f, 0, sizeof(f)); + f.l_type = (lock ? F_WRLCK : F_UNLCK); + f.l_whence = SEEK_SET; + f.l_start = 0; + f.l_len = 0; // Lock/unlock entire file + return fcntl(fd, F_SETLK, &f); +} + +class PosixFileLock : public FileLock { + public: + int fd_; +}; + +class PosixEnv : public Env { + public: + PosixEnv(); + virtual ~PosixEnv() { + fprintf(stderr, "Destroying Env::Default()\n"); + exit(1); + } + + virtual Status NewSequentialFile(const std::string& fname, + SequentialFile** result) { + FILE* f = fopen(fname.c_str(), "r"); + if (f == NULL) { + *result = NULL; + return IOError(fname, errno); + } else { + *result = new PosixSequentialFile(fname, f); + return Status::OK(); + } + } + + virtual Status NewRandomAccessFile(const std::string& fname, + RandomAccessFile** result) { + *result = NULL; + Status s; + int fd = open(fname.c_str(), O_RDONLY); + if (fd < 0) { + s = IOError(fname, errno); + } else if (sizeof(void*) >= 8) { + // Use mmap when virtual address-space is plentiful. + uint64_t size; + s = GetFileSize(fname, &size); + if (s.ok()) { + void* base = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); + if (base != MAP_FAILED) { + *result = new PosixMmapReadableFile(fname, base, size); + } else { + s = IOError(fname, errno); + } + } + close(fd); + } else { + *result = new PosixRandomAccessFile(fname, fd); + } + return s; + } + + virtual Status NewWritableFile(const std::string& fname, + WritableFile** result) { + Status s; + const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); + if (fd < 0) { + *result = NULL; + s = IOError(fname, errno); + } else { + *result = new PosixMmapFile(fname, fd, page_size_); + } + return s; + } + + virtual bool FileExists(const std::string& fname) { + return access(fname.c_str(), F_OK) == 0; + } + + virtual Status GetChildren(const std::string& dir, + std::vector<std::string>* result) { + result->clear(); + DIR* d = opendir(dir.c_str()); + if (d == NULL) { + return IOError(dir, errno); + } + struct dirent* entry; + while ((entry = readdir(d)) != NULL) { + result->push_back(entry->d_name); + } + closedir(d); + return Status::OK(); + } + + virtual Status DeleteFile(const std::string& fname) { + Status result; + if (unlink(fname.c_str()) != 0) { + result = IOError(fname, errno); + } + return result; + }; + + virtual Status CreateDir(const std::string& name) { + Status result; + if (mkdir(name.c_str(), 0755) != 0) { + result = IOError(name, errno); + } + return result; + }; + + virtual Status DeleteDir(const std::string& name) { + Status result; + if (rmdir(name.c_str()) != 0) { + result = IOError(name, errno); + } + return result; + }; + + virtual Status GetFileSize(const std::string& fname, uint64_t* size) { + Status s; + struct stat sbuf; + if (stat(fname.c_str(), &sbuf) != 0) { + *size = 0; + s = IOError(fname, errno); + } else { + *size = sbuf.st_size; + } + return s; + } + + virtual Status RenameFile(const std::string& src, const std::string& target) { + Status result; + if (rename(src.c_str(), target.c_str()) != 0) { + result = IOError(src, errno); + } + return result; + } + + virtual Status LockFile(const std::string& fname, FileLock** lock) { + *lock = NULL; + Status result; + int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644); + if (fd < 0) { + result = IOError(fname, errno); + } else if (LockOrUnlock(fd, true) == -1) { + result = IOError("lock " + fname, errno); + close(fd); + } else { + PosixFileLock* my_lock = new PosixFileLock; + my_lock->fd_ = fd; + *lock = my_lock; + } + return result; + } + + virtual Status UnlockFile(FileLock* lock) { + PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock); + Status result; + if (LockOrUnlock(my_lock->fd_, false) == -1) { + result = IOError("unlock", errno); + } + close(my_lock->fd_); + delete my_lock; + return result; + } + + virtual void Schedule(void (*function)(void*), void* arg); + + virtual void StartThread(void (*function)(void* arg), void* arg); + + virtual Status GetTestDirectory(std::string* result) { + const char* env = getenv("TEST_TMPDIR"); + if (env && env[0] != '\0') { + *result = env; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "/tmp/leveldbtest-%d", int(geteuid())); + *result = buf; + } + // Directory may already exist + CreateDir(*result); + return Status::OK(); + } + + static uint64_t gettid() { + pthread_t tid = pthread_self(); + uint64_t thread_id = 0; + memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); + return thread_id; + } + + virtual Status NewLogger(const std::string& fname, Logger** result) { + FILE* f = fopen(fname.c_str(), "w"); + if (f == NULL) { + *result = NULL; + return IOError(fname, errno); + } else { + *result = new PosixLogger(f, &PosixEnv::gettid); + return Status::OK(); + } + } + + virtual uint64_t NowMicros() { + struct timeval tv; + gettimeofday(&tv, NULL); + return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec; + } + + virtual void SleepForMicroseconds(int micros) { + usleep(micros); + } + +#ifdef HAVE_HYPERLEVELDB + virtual Status CopyFile(const std::string&, const std::string&) { return Status::NotSupported("sorry!"); } + virtual Status LinkFile(const std::string&, const std::string&) { return Status::NotSupported("sorry!"); } +#endif + + private: + void PthreadCall(const char* label, int result) { + if (result != 0) { + fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + exit(1); + } + } + + // BGThread() is the body of the background thread + void BGThread(); + static void* BGThreadWrapper(void* arg) { + reinterpret_cast<PosixEnv*>(arg)->BGThread(); + return NULL; + } + + size_t page_size_; + pthread_mutex_t mu_; + pthread_cond_t bgsignal_; + pthread_t bgthread_; + bool started_bgthread_; + + // Entry per Schedule() call + struct BGItem { void* arg; void (*function)(void*); }; + typedef std::deque<BGItem> BGQueue; + BGQueue queue_; +}; + +PosixEnv::PosixEnv() : page_size_(getpagesize()), + started_bgthread_(false) { + PthreadCall("mutex_init", pthread_mutex_init(&mu_, NULL)); + PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, NULL)); +} + +void PosixEnv::Schedule(void (*function)(void*), void* arg) { + PthreadCall("lock", pthread_mutex_lock(&mu_)); + + // Start background thread if necessary + if (!started_bgthread_) { + started_bgthread_ = true; + PthreadCall( + "create thread", + pthread_create(&bgthread_, NULL, &PosixEnv::BGThreadWrapper, this)); + } + + // If the queue is currently empty, the background thread may currently be + // waiting. + if (queue_.empty()) { + PthreadCall("signal", pthread_cond_signal(&bgsignal_)); + } + + // Add to priority queue + queue_.push_back(BGItem()); + queue_.back().function = function; + queue_.back().arg = arg; + + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); +} + +void PosixEnv::BGThread() { + while (true) { + // Wait until there is an item that is ready to run + PthreadCall("lock", pthread_mutex_lock(&mu_)); + while (queue_.empty()) { + PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_)); + } + + void (*function)(void*) = queue_.front().function; + void* arg = queue_.front().arg; + queue_.pop_front(); + + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); + (*function)(arg); + } +} + +namespace { +struct StartThreadState { + void (*user_function)(void*); + void* arg; +}; +} +static void* StartThreadWrapper(void* arg) { + StartThreadState* state = reinterpret_cast<StartThreadState*>(arg); + state->user_function(state->arg); + delete state; + return NULL; +} + +void PosixEnv::StartThread(void (*function)(void* arg), void* arg) { + pthread_t t; + StartThreadState* state = new StartThreadState; + state->user_function = function; + state->arg = arg; + PthreadCall("start thread", + pthread_create(&t, NULL, &StartThreadWrapper, state)); +} + +} // namespace + +static pthread_once_t once = PTHREAD_ONCE_INIT; +static Env* default_env; +static void InitDefaultEnv() { default_env = new PosixEnv; } + +Env* Env::Default() { + pthread_once(&once, InitDefaultEnv); + return default_env; +} + +} // namespace leveldb diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/logging.cc b/src/third_party/wiredtiger/api/leveldb/leveldb/util/logging.cc new file mode 100644 index 00000000000..96526e76123 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/logging.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/logging.h" + +#include <errno.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include "leveldb_wt.h" + +namespace leveldb { + +void AppendNumberTo(std::string* str, uint64_t num) { + char buf[30]; + snprintf(buf, sizeof(buf), "%llu", (unsigned long long) num); + str->append(buf); +} + +void AppendEscapedStringTo(std::string* str, const Slice& value) { + for (size_t i = 0; i < value.size(); i++) { + char c = value[i]; + if (c >= ' ' && c <= '~') { + str->push_back(c); + } else { + char buf[10]; + snprintf(buf, sizeof(buf), "\\x%02x", + static_cast<unsigned int>(c) & 0xff); + str->append(buf); + } + } +} + +std::string NumberToString(uint64_t num) { + std::string r; + AppendNumberTo(&r, num); + return r; +} + +std::string EscapeString(const Slice& value) { + std::string r; + AppendEscapedStringTo(&r, value); + return r; +} + +bool ConsumeChar(Slice* in, char c) { + if (!in->empty() && (*in)[0] == c) { + in->remove_prefix(1); + return true; + } else { + return false; + } +} + +bool ConsumeDecimalNumber(Slice* in, uint64_t* val) { + uint64_t v = 0; + int digits = 0; + while (!in->empty()) { + char c = (*in)[0]; + if (c >= '0' && c <= '9') { + ++digits; + const int delta = (c - '0'); + static const uint64_t kMaxUint64 = ~static_cast<uint64_t>(0); + if (v > kMaxUint64/10 || + (v == kMaxUint64/10 && (uint64_t)delta > kMaxUint64%10)) { + // Overflow + return false; + } + v = (v * 10) + delta; + in->remove_prefix(1); + } else { + break; + } + } + *val = v; + return (digits > 0); +} + +} // namespace leveldb diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/logging.h b/src/third_party/wiredtiger/api/leveldb/leveldb/util/logging.h new file mode 100644 index 00000000000..b0c5da813e8 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/logging.h @@ -0,0 +1,47 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Must not be included from any .h files to avoid polluting the namespace +// with macros. + +#ifndef STORAGE_LEVELDB_UTIL_LOGGING_H_ +#define STORAGE_LEVELDB_UTIL_LOGGING_H_ + +#include <stdio.h> +#include <stdint.h> +#include <string> +#include "port/port.h" + +namespace leveldb { + +class Slice; +class WritableFile; + +// Append a human-readable printout of "num" to *str +extern void AppendNumberTo(std::string* str, uint64_t num); + +// Append a human-readable printout of "value" to *str. +// Escapes any non-printable characters found in "value". +extern void AppendEscapedStringTo(std::string* str, const Slice& value); + +// Return a human-readable printout of "num" +extern std::string NumberToString(uint64_t num); + +// Return a human-readable version of "value". +// Escapes any non-printable characters found in "value". +extern std::string EscapeString(const Slice& value); + +// If *in starts with "c", advances *in past the first character and +// returns true. Otherwise, returns false. +extern bool ConsumeChar(Slice* in, char c); + +// Parse a human-readable number from "*in" into *value. On success, +// advances "*in" past the consumed number and sets "*val" to the +// numeric value. Otherwise, returns false and leaves *in in an +// unspecified state. +extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val); + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_UTIL_LOGGING_H_ diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/options.cc b/src/third_party/wiredtiger/api/leveldb/leveldb/util/options.cc new file mode 100644 index 00000000000..a8c79233bb5 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/options.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb_wt.h" + +namespace leveldb { + +Options::Options() + : comparator(BytewiseComparator()), + create_if_missing(false), + error_if_exists(false), + paranoid_checks(false), + env(Env::Default()), + info_log(NULL), + write_buffer_size(4<<20), + max_open_files(1000), + block_cache(NULL), + block_size(4096), + block_restart_interval(16), + compression(kSnappyCompression), + filter_policy(NULL) { +} + + +} // namespace leveldb diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/posix_logger.h b/src/third_party/wiredtiger/api/leveldb/leveldb/util/posix_logger.h new file mode 100644 index 00000000000..f15de45e05e --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/posix_logger.h @@ -0,0 +1,98 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Logger implementation that can be shared by all environments +// where enough posix functionality is available. + +#ifndef STORAGE_LEVELDB_UTIL_POSIX_LOGGER_H_ +#define STORAGE_LEVELDB_UTIL_POSIX_LOGGER_H_ + +#include <algorithm> +#include <stdio.h> +#include <sys/time.h> +#include <time.h> +#include "leveldb_wt.h" + +namespace leveldb { + +class PosixLogger : public Logger { + private: + FILE* file_; + uint64_t (*gettid_)(); // Return the thread id for the current thread + public: + PosixLogger(FILE* f, uint64_t (*gettid)()) : file_(f), gettid_(gettid) { } + virtual ~PosixLogger() { + fclose(file_); + } + virtual void Logv(const char* format, va_list ap) { + const uint64_t thread_id = (*gettid_)(); + + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. + char buffer[500]; + for (int iter = 0; iter < 2; iter++) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 30000; + base = new char[bufsize]; + } + char* p = base; + char* limit = base + bufsize; + + struct timeval now_tv; + gettimeofday(&now_tv, NULL); + const time_t seconds = now_tv.tv_sec; + struct tm t; + localtime_r(&seconds, &t); + p += snprintf(p, limit - p, + "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", + t.tm_year + 1900, + t.tm_mon + 1, + t.tm_mday, + t.tm_hour, + t.tm_min, + t.tm_sec, + static_cast<int>(now_tv.tv_usec), + static_cast<long long unsigned int>(thread_id)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + p += vsnprintf(p, limit - p, format, backup_ap); + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + fwrite(base, 1, p - base, file_); + fflush(file_); + if (base != buffer) { + delete[] base; + } + break; + } + } +}; + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_UTIL_POSIX_LOGGER_H_ diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/random.h b/src/third_party/wiredtiger/api/leveldb/leveldb/util/random.h new file mode 100644 index 00000000000..66e0c94e7cb --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/random.h @@ -0,0 +1,72 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_UTIL_RANDOM_H_ +#define STORAGE_LEVELDB_UTIL_RANDOM_H_ + +#include <stdint.h> + +namespace leveldb { + +// A very simple random number generator. Not especially good at +// generating truly random bits, but good enough for our needs in this +// package. +class Random { + private: + uint32_t seed_; + public: + explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { } + uint32_t Next() { + static const uint32_t M = 2147483647L; // 2^31-1 + static const uint64_t A = 16807; // bits 14, 8, 7, 5, 2, 1, 0 + // We are computing + // seed_ = (seed_ * A) % M, where M = 2^31-1 + // + // seed_ must not be zero or M, or else all subsequent computed values + // will be zero or M respectively. For all other values, seed_ will end + // up cycling through every number in [1,M-1] + uint64_t product = seed_ * A; + + // Compute (product % M) using the fact that ((x << 31) % M) == x. + seed_ = static_cast<uint32_t>((product >> 31) + (product & M)); + // The first reduction may overflow by 1 bit, so we may need to + // repeat. mod == M is not possible; using > allows the faster + // sign-bit-based test. + if (seed_ > M) { + seed_ -= M; + } + return seed_; + } + // Returns a uniformly distributed value in the range [0..n-1] + // REQUIRES: n > 0 + uint32_t Uniform(int n) { return Next() % n; } + + // Randomly returns true ~"1/n" of the time, and false otherwise. + // REQUIRES: n > 0 + bool OneIn(int n) { return (Next() % n) == 0; } + + // Skewed: pick "base" uniformly from range [0,max_log] and then + // return "base" random bits. The effect is to pick a number in the + // range [0,2^max_log-1] with exponential bias towards smaller numbers. + uint32_t Skewed(int max_log) { + return Uniform(1 << Uniform(max_log + 1)); + } + + // Shuffle the array into random order + void Shuffle(int *array, int n) { + if (n > 1) { + int i; + for (i=0; i<n-1; i++) { + int j = i + Next() / (2147483647 / (n-i) + 1); + int t = array[j]; + array[j] = array[i]; + array[i] = t; + } + } + } +}; + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_UTIL_RANDOM_H_ diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/status.cc b/src/third_party/wiredtiger/api/leveldb/leveldb/util/status.cc new file mode 100644 index 00000000000..e8edd9dbb11 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/status.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include <stdio.h> +#include "leveldb_wt.h" + +namespace leveldb { + +const char* Status::CopyState(const char* state) { + uint32_t size; + memcpy(&size, state, sizeof(size)); + char* result = new char[size + 5]; + memcpy(result, state, size + 5); + return result; +} + +Status::Status(Code code_arg, const Slice& msg, const Slice& msg2) { + assert(code_arg != kOk); + const uint32_t len1 = msg.size(); + const uint32_t len2 = msg2.size(); + const uint32_t size = len1 + (len2 ? (2 + len2) : 0); + char* result = new char[size + 5]; + memcpy(result, &size, sizeof(size)); + result[4] = static_cast<char>(code_arg); + memcpy(result + 5, msg.data(), len1); + if (len2) { + result[5 + len1] = ':'; + result[6 + len1] = ' '; + memcpy(result + 7 + len1, msg2.data(), len2); + } + state_ = result; +} + +std::string Status::ToString() const { + if (state_ == NULL) { + return "OK"; + } else { + char tmp[30]; + const char* type; + switch (code()) { + case kOk: + type = "OK"; + break; + case kNotFound: + type = "NotFound: "; + break; + case kCorruption: + type = "Corruption: "; + break; + case kNotSupported: + type = "Not implemented: "; + break; + case kInvalidArgument: + type = "Invalid argument: "; + break; + case kIOError: + type = "IO error: "; + break; + default: + snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", + static_cast<int>(code())); + type = tmp; + break; + } + std::string result(type); + uint32_t length; + memcpy(&length, state_, sizeof(length)); + result.append(state_ + 5, length); + return result; + } +} + +} // namespace leveldb diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb_test.cc b/src/third_party/wiredtiger/api/leveldb/leveldb_test.cc new file mode 100644 index 00000000000..25cfe0c379e --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb_test.cc @@ -0,0 +1,141 @@ +/*- + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <assert.h> +#include <iostream> +#include "leveldb_wt.h" + +using namespace std; + +extern "C" int main() { + leveldb::DB* db; + leveldb::Options options; + options.create_if_missing = true; + leveldb::Status s = leveldb::DB::Open(options, "WTLDB_HOME", &db); + assert(s.ok()); + + s = db->Put(leveldb::WriteOptions(), "key", "value"); + s = db->Put(leveldb::WriteOptions(), "key2", "value2"); + s = db->Put(leveldb::WriteOptions(), "key3", "value3"); + s = db->Put(leveldb::WriteOptions(), "key4", "value4"); + assert(s.ok()); + +#ifdef HAVE_HYPERLEVELDB + leveldb::ReplayIterator* replay_start; + leveldb::ReplayIterator* replay_ts; + leveldb::ReplayIterator* replay_now; + leveldb::ReplayIterator* replay_last; + std::string timestamp; + std::string timestamp_last; + + cout << "Perform Live Backup" << endl; + s = db->LiveBackup("test"); + + // Test out a bunch of the ReplayIterator methods. + db->GetReplayTimestamp(×tamp); + cout << "timestamp 1 " << timestamp << endl << "Put key5" << endl; + s = db->Put(leveldb::WriteOptions(), "key5", "value5"); + db->GetReplayTimestamp(×tamp_last); + // Verify a bunch of timestamp comparisons + cout << "timestamp 2 " << timestamp_last << endl; + cout << "CompareTimestamps tests" << endl; + assert(db->CompareTimestamps(timestamp, timestamp_last) < 0); + assert(db->CompareTimestamps("all", timestamp_last) < 0); + assert(db->CompareTimestamps(timestamp, "now") < 0); + assert(db->CompareTimestamps("now", timestamp_last) == 0); + assert(db->CompareTimestamps(timestamp_last, "now") == 0); + assert(db->CompareTimestamps("now", timestamp) > 0); + assert(db->CompareTimestamps("now", "all") > 0); + + s = db->GetReplayIterator("all", &replay_start); + assert(replay_start->Valid()); + cout << "Replay at all(start):" << endl; + cout << replay_start->key().ToString() << ": " << replay_start->value().ToString() << endl; + s = db->GetReplayIterator(timestamp, &replay_ts); + assert(replay_ts->Valid()); + cout << "Replay at timestamp " << timestamp << ":" << endl; + cout << replay_ts->key().ToString() << ": " << replay_ts->value().ToString() << endl; + s = db->GetReplayIterator("now", &replay_now); + assert(replay_now->Valid()); + cout << "Replay at now(end):" << endl; + cout << replay_now->key().ToString() << ": " << replay_now->value().ToString() << endl; + s = db->GetReplayIterator(timestamp_last, &replay_last); + assert(replay_last->Valid()); + cout << "Replay at last timestamp " << timestamp_last << ":" << endl; + cout << replay_last->key().ToString() << ": " << replay_last->value().ToString() << endl; + assert(replay_now->key().ToString() == replay_last->key().ToString()); + cout << "Replay walk from all/start:" << endl; + while (replay_start->Valid()) { + cout << replay_start->key().ToString() << ": " << replay_start->value().ToString() << endl; + replay_start->Next(); + } + // We reached the end of log, iterator should still not be valid. + // But if we write something, the iterator should find it and become + // valid again. + assert(!replay_start->Valid()); + s = db->Put(leveldb::WriteOptions(), "key6", "value6"); + assert(replay_start->Valid()); + db->ReleaseReplayIterator(replay_start); + db->ReleaseReplayIterator(replay_ts); + db->ReleaseReplayIterator(replay_now); + db->ReleaseReplayIterator(replay_last); +#endif + + // Read through the main database + cout << "Read main database:" << endl; + leveldb::ReadOptions read_options; + read_options.snapshot = db->GetSnapshot(); + leveldb::Iterator* iter = db->NewIterator(read_options); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + cout << iter->key().ToString() << ": " << iter->value().ToString() << endl; + } + + delete iter; + db->ReleaseSnapshot(read_options.snapshot); + + delete db; + +#ifdef HAVE_HYPERLEVELDB + // Read through the backup database + leveldb::DB* db_bkup; + options.create_if_missing = false; + s = leveldb::DB::Open(options, "WTLDB_HOME/backup-test", &db_bkup); + read_options.snapshot = db_bkup->GetSnapshot(); + iter = db_bkup->NewIterator(read_options); + cout << "Read Backup database:" << endl; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + cout << iter->key().ToString() << ": " << iter->value().ToString() << endl; + } + + delete iter; + db_bkup->ReleaseSnapshot(read_options.snapshot); + + delete db_bkup; +#endif + + return (0); +} diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb_wt.cc b/src/third_party/wiredtiger/api/leveldb/leveldb_wt.cc new file mode 100644 index 00000000000..8fc7d1ca092 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb_wt.cc @@ -0,0 +1,810 @@ +/*- + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "leveldb_wt.h" +#include <errno.h> +#include <sys/stat.h> +#include <unistd.h> +#include <sstream> + +#if HAVE_BASHOLEVELDB +namespace leveldb { +Value::~Value() {} + +class StringValue : public Value { + public: + explicit StringValue(std::string& val) : value_(val) {} + ~StringValue() {} + + StringValue& assign(const char* data, size_t size) { + value_.assign(data, size); + return *this; + } + + private: + std::string& value_; +}; +} +#endif + +Status leveldb::DestroyDB(const std::string& name, const Options& options) { + WT_CONNECTION *conn; + int ret, t_ret; + /* If the database doesn't exist, there is nothing to destroy. */ + if (access((name + "/WiredTiger").c_str(), F_OK) != 0) + return Status::OK(); + if ((ret = ::wiredtiger_open(name.c_str(), NULL, NULL, &conn)) != 0) + return WiredTigerErrorToStatus(ret, NULL); + WT_SESSION *session; + if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) + goto cleanup; + if ((ret = session->drop(session, WT_URI, "force")) != 0) + goto cleanup; + +cleanup: + if ((t_ret = conn->close(conn, NULL)) != 0 && ret == 0) + ret = t_ret; + return WiredTigerErrorToStatus(ret, NULL); +} + +Status leveldb::RepairDB(const std::string& dbname, const Options& options) { + return Status::NotSupported("sorry!"); +} + +/* Destructors required for interfaces. */ +leveldb::DB::~DB() {} +Snapshot::~Snapshot() {} + +Status WiredTigerErrorToStatus(int wiredTigerError, const char *msg) { + if (wiredTigerError == 0) + return Status::OK(); + + if (msg == NULL) + msg = wiredtiger_strerror(wiredTigerError); + + if (wiredTigerError != WT_NOTFOUND) + printf("Failing status: %d -> %s\n", wiredTigerError, msg); + + if (wiredTigerError == WT_NOTFOUND) + return Status::NotFound(Slice(msg)); + else if (wiredTigerError == WT_ERROR || wiredTigerError == WT_PANIC) + return Status::Corruption(Slice(msg)); + else if (wiredTigerError == ENOTSUP) + return Status::NotSupported(Slice(msg)); + else if (wiredTigerError == EINVAL) + return Status::InvalidArgument(Slice(msg)); + else if (wiredTigerError == EPERM || wiredTigerError == ENOENT || + wiredTigerError == EIO || wiredTigerError == EBADF || + wiredTigerError == EEXIST || wiredTigerError == ENOSPC) + return Status::IOError(Slice(msg)); + else if (wiredTigerError == WT_ROLLBACK) + return Status::IOError("ROLLBACK"); // TODO: Is this the best translation? + else + return Status::Corruption(Slice(msg)); +} + +/* Iterators, from leveldb/table/iterator.cc */ +Iterator::Iterator() { + cleanup_.function = NULL; + cleanup_.next = NULL; +} + +Iterator::~Iterator() { + if (cleanup_.function != NULL) { + (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2); + for (Cleanup* c = cleanup_.next; c != NULL; ) { + (*c->function)(c->arg1, c->arg2); + Cleanup* next = c->next; + delete c; + c = next; + } + } +} + +void Iterator::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) { + assert(func != NULL); + Cleanup* c; + if (cleanup_.function == NULL) { + c = &cleanup_; + } else { + c = new Cleanup; + c->next = cleanup_.next; + cleanup_.next = c; + } + c->function = func; + c->arg1 = arg1; + c->arg2 = arg2; +} + +namespace { +class EmptyIterator : public Iterator { + public: + EmptyIterator(const Status& s) : status_(s) { } + virtual bool Valid() const { return false; } + virtual void Seek(const Slice& target) { } + virtual void SeekToFirst() { } + virtual void SeekToLast() { } + virtual void Next() { assert(false); } + virtual void Prev() { assert(false); } + Slice key() const { assert(false); return Slice(); } + Slice value() const { assert(false); return Slice(); } + virtual Status status() const { return status_; } + private: + Status status_; +}; +} // namespace + +Iterator* NewEmptyIterator() { + return new EmptyIterator(Status::OK()); +} + +Iterator* NewErrorIterator(const Status& status) { + return new EmptyIterator(status); +} + +namespace { +class FilterPolicyImpl : public FilterPolicy { +public: + FilterPolicyImpl(int bits_per_key) : bits_per_key_(bits_per_key) {} + ~FilterPolicyImpl() {} + virtual const char *Name() const { return "FilterPolicyImpl"; } + virtual void CreateFilter(const Slice *keys, int n, std::string *dst) const {} + virtual bool KeyMayMatch(const Slice &key, const Slice &filter) const { return true; } + + int bits_per_key_; +}; +}; + +namespace leveldb { +FilterPolicy::~FilterPolicy() {} + +const FilterPolicy *NewBloomFilterPolicy(int bits_per_key) { + return new FilterPolicyImpl(bits_per_key); +} +#if HAVE_BASHOLEVELDB +const FilterPolicy *NewBloomFilterPolicy2(int bits_per_key) { + return NewBloomFilterPolicy(bits_per_key); +} +#endif + +Cache::~Cache() {} +Cache *NewLRUCache(size_t capacity) { + return new CacheImpl(capacity); +} +} + +int +wtleveldb_create( + WT_CONNECTION *conn, const Options &options, std::string const &uri) +{ + int ret; + std::stringstream s_table; + s_table << WT_TABLE_CONFIG; + s_table << "internal_page_max=" << options.block_size << ","; + s_table << "leaf_page_max=" << options.block_size << ","; + s_table << "leaf_item_max=" << options.block_size / 4 << ","; + if (options.compression == leveldb::kSnappyCompression) + s_table << "block_compressor=snappy,"; +#ifdef HAVE_ROCKSDB + if (options.compression == leveldb::kZlibCompression) + s_table << "block_compressor=zlib,"; +#endif + s_table << "lsm=("; + s_table << "chunk_size=" << options.write_buffer_size << ","; + if (options.filter_policy) { + int bits = ((FilterPolicyImpl *)options.filter_policy)->bits_per_key_; + s_table << "bloom_bit_count=" << bits << ","; + // Approximate the optimal number of hashes + s_table << "bloom_hash_count=" << (int)(0.6 * bits) << ","; + } + s_table << "),"; + WT_SESSION *session; + std::string table_config = s_table.str(); + if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) + return (ret); + if ((ret = session->create(session, uri.c_str(), table_config.c_str())) != 0) + return (ret); + if ((ret = session->close(session, NULL)) != 0) + return (ret); + + return (0); +} + +Status +leveldb::DB::Open(const Options &options, const std::string &name, leveldb::DB **dbptr) +{ + // Build the wiredtiger_open config. + std::stringstream s_conn; + s_conn << WT_CONN_CONFIG; + if (options.create_if_missing) { + (void)mkdir(name.c_str(), 0777); + s_conn << "create,"; + } + if (options.error_if_exists) + s_conn << "exclusive,"; +#ifndef HAVE_BUILTIN_EXTENSION_SNAPPY + if (options.compression == kSnappyCompression) + s_conn << "extensions=[libwiredtiger_snappy.so],"; +#endif +#ifdef HAVE_ROCKSDB +#ifndef HAVE_BUILTIN_ZLIB + if (options.compression == kZlibCompression) + s_conn << "extensions=[libwiredtiger_zlib.so],"; +#endif +#endif + size_t cache_size = 2 * options.write_buffer_size; + cache_size += (size_t)options.max_open_files * (4 << 20); + if (options.block_cache) + cache_size += ((CacheImpl *)options.block_cache)->capacity_; + else + cache_size += 100 << 20; + s_conn << "cache_size=" << cache_size << ","; + std::string conn_config = s_conn.str(); + + WT_CONNECTION *conn; + printf("Open: home %s config %s\r\n",name.c_str(),conn_config.c_str()); + int ret = ::wiredtiger_open(name.c_str(), NULL, conn_config.c_str(), &conn); + if (ret == ENOENT) + return Status::NotFound(Slice("Database does not exist.")); + else if (ret == EEXIST) + return Status::NotFound(Slice("Database already exists.")); + else if (ret != 0) + return WiredTigerErrorToStatus(ret, NULL); + + if (options.create_if_missing) + ret = wtleveldb_create(conn, options, WT_URI); + + if (ret != 0) { + conn->close(conn, NULL); + return WiredTigerErrorToStatus(ret, NULL); + } + *dbptr = new DbImpl(conn); + return Status::OK(); +} + +// Set the database entry for "key" to "value". Returns OK on success, +// and a non-OK status on error. +// Note: consider setting options.sync = true. +Status +DbImpl::Put(const WriteOptions& options, + const Slice& key, const Slice& value) +{ + WT_CURSOR *cursor = GetContext()->GetCursor(); + WT_ITEM item; + + item.data = key.data(); + item.size = key.size(); + cursor->set_key(cursor, &item); + item.data = value.data(); + item.size = value.size(); + cursor->set_value(cursor, &item); + int ret = cursor->insert(cursor); + return WiredTigerErrorToStatus(ret, NULL); +} + +// Remove the database entry (if any) for "key". Returns OK on +// success, and a non-OK status on error. It is not an error if "key" +// did not exist in the database. +// Note: consider setting options.sync = true. +Status +DbImpl::Delete(const WriteOptions& options, const Slice& key) +{ + WT_CURSOR *cursor = GetContext()->GetCursor(); + WT_ITEM item; + + item.data = key.data(); + item.size = key.size(); + cursor->set_key(cursor, &item); + int ret = cursor->remove(cursor); + // Reset the WiredTiger cursor so it doesn't keep any pages pinned. Track + // failures in debug builds since we don't expect failure, but don't pass + // failures on - it's not necessary for correct operation. + if (ret == 0) { + int t_ret = cursor->reset(cursor); + assert(t_ret == 0); + } else if (ret == WT_NOTFOUND) + ret = 0; + return WiredTigerErrorToStatus(ret, NULL); +} + +void +WriteBatchHandler::Put(const Slice& key, const Slice& value) { + WT_CURSOR *cursor = context_->GetCursor(); + WT_ITEM item; + + item.data = key.data(); + item.size = key.size(); + cursor->set_key(cursor, &item); + item.data = value.data(); + item.size = value.size(); + cursor->set_value(cursor, &item); + int ret = cursor->insert(cursor); + if (ret != 0 && status_ == 0) + status_ = ret; +} + +void WriteBatchHandler::Delete(const Slice& key) { + WT_CURSOR *cursor = context_->GetCursor(); + WT_ITEM item; + + item.data = key.data(); + item.size = key.size(); + cursor->set_key(cursor, &item); + int ret = cursor->remove(cursor); + if (ret != 0 && ret != WT_NOTFOUND && status_ == 0) + status_ = ret; +} + +// Apply the specified updates to the database. +// Returns OK on success, non-OK on failure. +// Note: consider setting options.sync = true. +Status +DbImpl::Write(const WriteOptions& options, WriteBatch* updates) +{ + const char *errmsg = NULL; + Status status = Status::OK(); + OperationContext *context = GetContext(); + WT_SESSION *session = context->GetSession(); + int ret = 0, t_ret; + +#ifdef HAVE_ROCKSDB + int need_txn = (updates->Count() > 1); +#else + int need_txn = 1; +#endif + + for (;;) { + if (need_txn && (ret = session->begin_transaction(session, NULL)) != 0) { + errmsg = "Begin transaction failed in Write batch"; + goto err; + } + + WriteBatchHandler handler(this, context); +#if 0 + status = updates->Iterate(&handler); +#else + try { + status = updates->Iterate(&handler); + } catch(...) { + if (need_txn) + (void)session->rollback_transaction(session, NULL); + throw; + } +#endif + if (!status.ok() || (ret = handler.GetWiredTigerStatus()) != WT_ROLLBACK) + break; + // Roll back the transaction on deadlock so we can try again + if (need_txn && (ret = session->rollback_transaction(session, NULL)) != 0) { + errmsg = "Rollback transaction failed in Write batch"; + goto err; + } + } + + if (need_txn && status.ok() && ret == 0) { + ret = session->commit_transaction(session, NULL); + } else if (need_txn) { + t_ret = session->rollback_transaction(session, NULL); + if (ret == 0) + ret = t_ret; + } + +err: + if (status.ok() && ret != 0) + status = WiredTigerErrorToStatus(ret, errmsg); + return status; +} + +// If the database contains an entry for "key" store the +// corresponding value in *value and return OK. +// +// If there is no entry for "key" leave *value unchanged and return +// a status for which Status::IsNotFound() returns true. +// +// May return some other Status on an error. +Status +DbImpl::Get(const ReadOptions& options, + const Slice& key, std::string* value) +{ + WT_CURSOR *cursor = GetContext(options)->GetCursor(); + const char *errmsg = NULL; + + WT_ITEM item; + item.data = key.data(); + item.size = key.size(); + cursor->set_key(cursor, &item); + int ret = cursor->search(cursor); + if (ret == 0) { + ret = cursor->get_value(cursor, &item); + if (ret == 0) { + // Make a copy of the value to return, then the cursor can be reset + *value = std::string((const char *)item.data, item.size); + ret = cursor->reset(cursor); + } + } else if (ret == WT_NOTFOUND) + errmsg = "DB::Get key not found"; + return WiredTigerErrorToStatus(ret, errmsg); +} + +#if HAVE_BASHOLEVELDB +// If the database contains an entry for "key" store the +// corresponding value in *value and return OK. +// +// If there is no entry for "key" leave *value unchanged and return +// a status for which Status::IsNotFound() returns true. +// +// May return some other Status on an error. +Status +DbImpl::Get(const ReadOptions& options, + const Slice& key, Value* value) +{ + const char *errmsg = NULL; + + WT_CURSOR *cursor = GetContext(options)->GetCursor(); + WT_ITEM item; + item.data = key.data(); + item.size = key.size(); + cursor->set_key(cursor, &item); + int ret = cursor->search(cursor); + if (ret == 0) { + ret = cursor->get_value(cursor, &item); + if (ret == 0) { + // This call makes a copy, reset the cursor afterwards. + value->assign((const char *)item.data, item.size); + ret = cursor->reset(cursor); + } + } else if (ret == WT_NOTFOUND) + errmsg = "DB::Get key not found"; +err: + return WiredTigerErrorToStatus(ret, errmsg); +} +#endif + +// Return a heap-allocated iterator over the contents of the database. +// The result of NewIterator() is initially invalid (caller must +// call one of the Seek methods on the iterator before using it). +// +// Caller should delete the iterator when it is no longer needed. +// The returned iterator should be deleted before this db is deleted. +Iterator * +DbImpl::NewIterator(const ReadOptions& options) +{ + /* Iterators own the cursor until they are closed. */ + OperationContext *context = GetContext(options); + WT_CURSOR *c = context->GetCursor(); + context->SetCursor(NULL); + return new IteratorImpl(this, c); +} + +SnapshotImpl::SnapshotImpl(DbImpl *db) : + Snapshot(), db_(db), context_(db->NewContext()), status_(Status::OK()) +{ +} + +// Return a handle to the current DB state. Iterators created with +// this handle will all observe a stable snapshot of the current DB +// state. The caller must call ReleaseSnapshot(result) when the +// snapshot is no longer needed. +const Snapshot * +DbImpl::GetSnapshot() +{ + SnapshotImpl *si = new SnapshotImpl(this); + WT_SESSION *session = si->GetContext()->GetSession(); + int ret = session->begin_transaction(session, NULL); + assert(ret == 0); + return si; +} + +// Release a previously acquired snapshot. The caller must not +// use "snapshot" after this call. +void +DbImpl::ReleaseSnapshot(const Snapshot* snapshot) +{ + SnapshotImpl *si = + static_cast<SnapshotImpl *>(const_cast<Snapshot *>(snapshot)); + if (si != NULL) { + // We started a transaction: we could commit it here, but it will be rolled + // back automatically by closing the session, which we have to do anyway. + int ret = si->GetContext()->Close(); + assert(ret == 0); + delete si; + } +} + +// DB implementations can export properties about their state +// via this method. If "property" is a valid property understood by this +// DB implementation, fills "*value" with its current value and returns +// true. Otherwise returns false. +// +// +// Valid property names include: +// +// "leveldb.num-files-at-level<N>" - return the number of files at level <N>, +// where <N> is an ASCII representation of a level number (e.g. "0"). +// "leveldb.stats" - returns a multi-line string that describes statistics +// about the internal operation of the DB. +// "leveldb.sstables" - returns a multi-line string that describes all +// of the sstables that make up the db contents. +bool +DbImpl::GetProperty(const Slice& property, std::string* value) +{ + /* Not supported */ + return false; +} + +// For each i in [0,n-1], store in "sizes[i]", the approximate +// file system space used by keys in "[range[i].start .. range[i].limit)". +// +// Note that the returned sizes measure file system space usage, so +// if the user data compresses by a factor of ten, the returned +// sizes will be one-tenth the size of the corresponding user data size. +// +// The results may not include the sizes of recently written data. +void +DbImpl::GetApproximateSizes(const Range* range, int n, + uint64_t* sizes) +{ + int i; + + /* XXX Not supported */ + for (i = 0; i < n; i++) + sizes[i] = 1; +} + +// Compact the underlying storage for the key range [*begin,*end]. +// In particular, deleted and overwritten versions are discarded, +// and the data is rearranged to reduce the cost of operations +// needed to access the data. This operation should typically only +// be invoked by users who understand the underlying implementation. +// +// begin==NULL is treated as a key before all keys in the database. +// end==NULL is treated as a key after all keys in the database. +// Therefore the following call will compact the entire database: +// db->CompactRange(NULL, NULL); +void +DbImpl::CompactRange(const Slice* begin, const Slice* end) +{ + // The compact doesn't need a cursor, but the context always opens a + // cursor when opening the session - so grab that, and use the session. + WT_CURSOR *cursor = GetContext()->GetCursor(); + WT_SESSION *session = cursor->session; + int ret = session->compact(session, WT_URI, NULL); + assert(ret == 0); +} + +// Suspends the background compaction thread. This methods +// returns once suspended. +void +DbImpl::SuspendCompactions() +{ + /* Not supported */ +} + +// Resumes a suspended background compation thread. +void +DbImpl::ResumeCompactions() +{ + /* Not supported */ +} + +IteratorImpl::~IteratorImpl() +{ + if (cursor_ != NULL) { + OperationContext *context = db_->GetContext(); + /* + * If we are in the same thread where the iterator was opened, and there is + * no cursor stashed there, return it. + */ + if (cursor_->session == context->GetSession()) { +#ifdef HAVE_ROCKSDB + if (context->GetCursor(id_) == NULL) { + context->SetCursor(id_, cursor_); + cursor_ = NULL; + } +#else + if (context->GetCursor() == NULL) { + context->SetCursor(cursor_); + cursor_ = NULL; + } +#endif + } + if (cursor_ != NULL) { + int ret = cursor_->close(cursor_); + assert(ret == 0); + } + } +} + +// Position at the first key in the source. The iterator is Valid() +// after this call iff the source is not empty. +void +IteratorImpl::SeekToFirst() +{ + int ret; + WT_ITEM item; + + if (!Status().ok()) + return; + + if ((ret = cursor_->reset(cursor_)) != 0) { + SetError(ret); + return; + } + ret = cursor_->next(cursor_); + if (ret == WT_NOTFOUND) { + valid_ = false; + return; + } else if (ret != 0) { + SetError(ret); + return; + } + if ((ret = cursor_->get_key(cursor_, &item)) != 0) { + SetError(ret); + return; + } + key_ = Slice((const char *)item.data, item.size); + if ((ret = cursor_->get_value(cursor_, &item)) != 0) { + SetError(ret); + return; + } + value_ = Slice((const char *)item.data, item.size); + valid_ = true; +} + +// Position at the last key in the source. The iterator is +// Valid() after this call iff the source is not empty. +void +IteratorImpl::SeekToLast() +{ + int ret; + WT_ITEM item; + + if (!Status().ok()) + return; + + if ((ret = cursor_->reset(cursor_)) != 0) { + SetError(ret); + return; + } + ret = cursor_->prev(cursor_); + if (ret == WT_NOTFOUND) { + valid_ = false; + return; + } else if (ret != 0) { + SetError(ret); + return; + } + if ((ret = cursor_->get_key(cursor_, &item)) != 0) { + SetError(ret); + return; + } + key_ = Slice((const char *)item.data, item.size); + if ((ret = cursor_->get_value(cursor_, &item)) != 0) { + SetError(ret); + return; + } + value_ = Slice((const char *)item.data, item.size); + valid_ = true; +} + +// Position at the first key in the source that at or past target +// The iterator is Valid() after this call iff the source contains +// an entry that comes at or past target. +void +IteratorImpl::Seek(const Slice& target) +{ + WT_ITEM item; + + if (!Status().ok()) + return; + + item.data = target.data(); + item.size = target.size(); + cursor_->set_key(cursor_, &item); + int cmp, ret = cursor_->search_near(cursor_, &cmp); + if (ret == 0 && cmp < 0) + ret = cursor_->next(cursor_); + if (ret != 0) { + if (ret != WT_NOTFOUND) + SetError(ret); + valid_ = false; + return; + } + if ((ret = cursor_->get_key(cursor_, &item)) != 0) { + SetError(ret); + return; + } + key_ = Slice((const char *)item.data, item.size); + if ((ret = cursor_->get_value(cursor_, &item)) != 0) { + SetError(ret); + return; + } + value_ = Slice((const char *)item.data, item.size); + valid_ = true; +} + +// Moves to the next entry in the source. After this call, Valid() is +// true iff the iterator was not positioned at the last entry in the source. +// REQUIRES: Valid() +void +IteratorImpl::Next() +{ + int ret; + WT_ITEM item; + + if (!Status().ok() || !valid_) + return; + + ret = cursor_->next(cursor_); + if (ret != 0) { + if (ret != WT_NOTFOUND) + SetError(ret); + valid_ = false; + return; + } + if ((ret = cursor_->get_key(cursor_, &item)) != 0) { + SetError(ret); + return; + } + key_ = Slice((const char *)item.data, item.size); + if ((ret = cursor_->get_value(cursor_, &item)) != 0) { + SetError(ret); + return; + } + value_ = Slice((const char *)item.data, item.size); + valid_ = true; +} + +// Moves to the previous entry in the source. After this call, Valid() is +// true iff the iterator was not positioned at the first entry in source. +// REQUIRES: Valid() +void +IteratorImpl::Prev() +{ + WT_ITEM item; + + if (!Status().ok() || !valid_) + return; + + int ret = cursor_->prev(cursor_); + if (ret != 0) { + if (ret != WT_NOTFOUND) + SetError(ret); + valid_ = false; + return; + } + if ((ret = cursor_->get_key(cursor_, &item)) != 0) { + SetError(ret); + return; + } + key_ = Slice((const char *)item.data, item.size); + if ((ret = cursor_->get_value(cursor_, &item)) != 0) { + SetError(ret); + return; + } + value_ = Slice((const char *)item.data, item.size); + valid_ = true; +} diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb_wt.h b/src/third_party/wiredtiger/api/leveldb/leveldb_wt.h new file mode 100644 index 00000000000..dc185183034 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/leveldb_wt.h @@ -0,0 +1,461 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef _INCLUDE_LEVELDB_WT_H +#define _INCLUDE_LEVELDB_WT_H 1 + +#include "leveldb_wt_config.h" + +#include "leveldb/cache.h" +#include "leveldb/comparator.h" +#include "leveldb/db.h" +#include "leveldb/env.h" +#include "leveldb/filter_policy.h" +#include "leveldb/options.h" +#include "leveldb/slice.h" +#include "leveldb/status.h" +#include "leveldb/write_batch.h" +#if HAVE_BASHO_LEVELDB +#include "basho/perf_count.h" +#endif + +#include "wiredtiger.h" + +#define WT_URI "table:data" +#define WT_CONN_CONFIG \ + "log=(enabled),checkpoint=(wait=180),checkpoint_sync=false," \ + "session_max=8192,mmap=false," \ + "transaction_sync=(enabled=true,method=none)," +// Note: LSM doesn't split, build full pages from the start +#define WT_TABLE_CONFIG "type=lsm,split_pct=100,leaf_item_max=1KB," \ + "lsm=(chunk_size=100MB,bloom_config=(leaf_page_max=8MB))," +#define WT_TIMESTAMP_FORMAT "%d.%llu" +// We're also only interested in operations to the user file. Skip over +// any changes to the metadata. +// !!! Currently WT guarantees that the metadata file is always at +// fileid 0 and the implementation here only uses one table. This will +// breakdown if either of those assumptions changes. +#define WT_VALID_OPERATION(fileid, optype) \ + ((fileid) != 0 && \ + ((optype) == WT_LOGOP_COL_PUT || \ + (optype) == WT_LOGOP_COL_REMOVE || \ + (optype) == WT_LOGOP_ROW_PUT || \ + (optype) == WT_LOGOP_ROW_REMOVE)) + +using leveldb::Cache; +using leveldb::FilterPolicy; +using leveldb::Iterator; +using leveldb::Options; +using leveldb::ReadOptions; +using leveldb::WriteBatch; +using leveldb::WriteOptions; +using leveldb::Range; +using leveldb::Slice; +using leveldb::Snapshot; +using leveldb::Status; +#if HAVE_BASHOLEVELDB +using leveldb::Value; +#endif +#if HAVE_ROCKSDB +using leveldb::FlushOptions; +using leveldb::ColumnFamilyHandle; +#endif + +extern Status WiredTigerErrorToStatus(int wiredTigerError, const char *msg = ""); + +/* POSIX thread-local storage */ +template <class T> +class ThreadLocal { +public: + static void cleanup(void *val) { + delete (T *)val; + } + + ThreadLocal() { + int ret = pthread_key_create(&key_, cleanup); + assert(ret == 0); + } + + ~ThreadLocal() { + int ret = pthread_key_delete(key_); + assert(ret == 0); + } + + T *Get() { + return (T *)(pthread_getspecific(key_)); + } + + void Set(T *value) { + int ret = pthread_setspecific(key_, value); + assert(ret == 0); + } + +private: + pthread_key_t key_; +}; + +/* WiredTiger implementations. */ +class DbImpl; + +/* Context for operations (including snapshots, write batches, transactions) */ +class OperationContext { +public: + OperationContext(WT_CONNECTION *conn) { + int ret = conn->open_session(conn, NULL, "isolation=snapshot", &session_); + assert(ret == 0); + ret = session_->open_cursor( + session_, WT_URI, NULL, NULL, &cursor_); + assert(ret == 0); + } + + ~OperationContext() { +#ifdef WANT_SHUTDOWN_RACES + int ret = Close(); + assert(ret == 0); +#endif + } + + int Close() { + int ret = 0; + if (session_ != NULL) + ret = session_->close(session_, NULL); + session_ = NULL; + return (ret); + } + + WT_CURSOR *GetCursor() { return cursor_; } + void SetCursor(WT_CURSOR *c) { cursor_ = c; } +#ifdef HAVE_ROCKSDB + WT_CURSOR *GetCursor(u_int i) { + return (i < cursors_.size()) ? cursors_[i] : NULL; + } + void SetCursor(u_int i, WT_CURSOR *c) { + if (i >= cursors_.size()) + cursors_.resize(i + 1); + cursors_[i] = c; + } +#endif + WT_SESSION *GetSession() { return session_; } + +private: + WT_SESSION *session_; + WT_CURSOR *cursor_; +#ifdef HAVE_ROCKSDB + std::vector<WT_CURSOR *> cursors_; +#endif +}; + +class CacheImpl : public Cache { +public: + CacheImpl(size_t capacity) : Cache(), capacity_(capacity) {} + virtual ~CacheImpl() {} + + virtual Handle* Insert(const Slice&, void*, size_t, + void (*)(const Slice&, void*)) { return 0; } + virtual Handle* Lookup(const Slice&) { return 0; } + virtual void Release(Handle*) {} + virtual void* Value(Handle*) { return 0; } + virtual void Erase(const Slice&) {} + virtual uint64_t NewId() { return 0; } + + size_t capacity_; +}; + +#ifdef HAVE_ROCKSDB +// ColumnFamilyHandleImpl is the class that clients use to access different +// column families. It has non-trivial destructor, which gets called when client +// is done using the column family +class ColumnFamilyHandleImpl : public ColumnFamilyHandle { + public: + ColumnFamilyHandleImpl(DbImpl* db, std::string const &name, uint32_t id) : db_(db), id_(id), name_(name) {} + ColumnFamilyHandleImpl(const ColumnFamilyHandleImpl ©from) : db_(copyfrom.db_), id_(copyfrom.id_), name_(copyfrom.name_) {} + virtual ~ColumnFamilyHandleImpl() {} + virtual uint32_t GetID() const { return id_; } + + std::string const &GetName() const { return name_; } + std::string const GetURI() const { return "table:" + name_; } + + private: + DbImpl* db_; + uint32_t id_; + std::string const name_; +}; +#endif + +class IteratorImpl : public Iterator { +public: + IteratorImpl(DbImpl *db, WT_CURSOR *cursor, uint32_t id=0) : db_(db), cursor_(cursor), id_(id) {} + virtual ~IteratorImpl(); + + // An iterator is either positioned at a key/value pair, or + // not valid. This method returns true iff the iterator is valid. + virtual bool Valid() const { return valid_; } + + virtual void SeekToFirst(); + + virtual void SeekToLast(); + + virtual void Seek(const Slice& target); + + virtual void Next(); + + virtual void Prev(); + + virtual Slice key() const { + return key_; + } + + virtual Slice value() const { + return value_; + } + + virtual Status status() const { + return status_; + } + +private: + DbImpl *db_; + WT_CURSOR *cursor_; + Slice key_, value_; + Status status_; + bool valid_; + uint32_t id_; + + void SetError(int wiredTigerError) { + valid_ = false; + status_ = WiredTigerErrorToStatus(wiredTigerError, NULL); + } + + // No copying allowed + IteratorImpl(const IteratorImpl&); + void operator=(const IteratorImpl&); +}; + +class SnapshotImpl : public Snapshot { +friend class DbImpl; +friend class IteratorImpl; +public: + SnapshotImpl(DbImpl *db); + virtual ~SnapshotImpl() { delete context_; } +protected: + OperationContext *GetContext() const { return context_; } + Status GetStatus() const { return status_; } + Status SetupTransaction(); +private: + DbImpl *db_; + OperationContext *context_; + Status status_; +}; + +class DbImpl : public leveldb::DB { +friend class IteratorImpl; +friend class SnapshotImpl; +public: + DbImpl(WT_CONNECTION *conn) : + DB(), conn_(conn), context_(new ThreadLocal<OperationContext>) {} + virtual ~DbImpl() { + delete context_; + int ret = conn_->close(conn_, NULL); + assert(ret == 0); + } + + virtual Status Put(const WriteOptions& options, + const Slice& key, + const Slice& value); + + virtual Status Delete(const WriteOptions& options, const Slice& key); + + virtual Status Write(const WriteOptions& options, WriteBatch* updates); + + virtual Status Get(const ReadOptions& options, + const Slice& key, std::string* value); + +#if HAVE_BASHOLEVELDB + virtual Status Get(const ReadOptions& options, + const Slice& key, Value* value); +#endif + +#ifdef HAVE_HYPERLEVELDB + virtual Status LiveBackup(const Slice& name); + virtual void GetReplayTimestamp(std::string* timestamp); + virtual void AllowGarbageCollectBeforeTimestamp(const std::string& timestamp); + virtual bool ValidateTimestamp(const std::string& timestamp); + virtual int CompareTimestamps(const std::string& lhs, const std::string& rhs); + virtual Status GetReplayIterator(const std::string& timestamp, + leveldb::ReplayIterator** iter); + virtual void ReleaseReplayIterator(leveldb::ReplayIterator* iter); +#endif + +#ifdef HAVE_ROCKSDB + virtual Status CreateColumnFamily(const Options& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle); + + // Drop a column family specified by column_family handle. This call + // only records a drop record in the manifest and prevents the column + // family from flushing and compacting. + virtual Status DropColumnFamily(ColumnFamilyHandle* column_family); + + // Set the database entry for "key" to "value". + // Returns OK on success, and a non-OK status on error. + // Note: consider setting options.sync = true. + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value); + + // Remove the database entry (if any) for "key". Returns OK on + // success, and a non-OK status on error. It is not an error if "key" + // did not exist in the database. + // Note: consider setting options.sync = true. + virtual Status Delete(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key); + + // Merge the database entry for "key" with "value". Returns OK on success, + // and a non-OK status on error. The semantics of this operation is + // determined by the user provided merge_operator when opening DB. + // Note: consider setting options.sync = true. + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value); + + // May return some other Status on an error. + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value); + + // If keys[i] does not exist in the database, then the i'th returned + // status will be one for which Status::IsNotFound() is true, and + // (*values)[i] will be set to some arbitrary value (often ""). Otherwise, + // the i'th returned status will have Status::ok() true, and (*values)[i] + // will store the value associated with keys[i]. + // + // (*values) will always be resized to be the same size as (keys). + // Similarly, the number of returned statuses will be the number of keys. + // Note: keys will not be "de-duplicated". Duplicate keys will return + // duplicate values in order. + virtual std::vector<Status> MultiGet( + const ReadOptions& options, + const std::vector<ColumnFamilyHandle*>& column_family, + const std::vector<Slice>& keys, std::vector<std::string>* values); + + virtual Iterator* NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family); + + virtual bool GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value); + + // Flush all mem-table data. + virtual Status Flush(const FlushOptions& options, + ColumnFamilyHandle* column_family); + + ColumnFamilyHandleImpl *GetCF(uint32_t id) { + return (id < columns_.size()) ? static_cast<ColumnFamilyHandleImpl *>(columns_[id]) : NULL; + } + void SetColumns(std::vector<ColumnFamilyHandle *> &cols) { + columns_ = cols; + } +#endif + + virtual Iterator* NewIterator(const ReadOptions& options); + + virtual const Snapshot* GetSnapshot(); + + virtual void ReleaseSnapshot(const Snapshot* snapshot); + + virtual bool GetProperty(const Slice& property, std::string* value); + + virtual void GetApproximateSizes(const Range* range, int n, + uint64_t* sizes); + + virtual void CompactRange(const Slice* begin, const Slice* end); + + virtual void SuspendCompactions(); + + virtual void ResumeCompactions(); + + OperationContext *GetContext() { + OperationContext *ctx = context_->Get(); + if (ctx == NULL) { + ctx = NewContext(); + context_->Set(ctx); + } + return (ctx); + } + +private: + WT_CONNECTION *conn_; + ThreadLocal<OperationContext> *context_; +#ifdef HAVE_ROCKSDB + std::vector<ColumnFamilyHandle*> columns_; +#endif + + OperationContext *NewContext() { + return new OperationContext(conn_); + } + + OperationContext *GetContext(const ReadOptions &options) { + if (options.snapshot == NULL) + return GetContext(); + else { + const SnapshotImpl *si = + static_cast<const SnapshotImpl *>(options.snapshot); + assert(si->GetStatus().ok()); + return si->GetContext(); + } + } + + // No copying allowed + DbImpl(const DbImpl&); + void operator=(const DbImpl&); +}; + +// Implemention of WriteBatch::Handler +class WriteBatchHandler : public WriteBatch::Handler { +public: + WriteBatchHandler(DbImpl *db, OperationContext *context) : db_(db), context_(context), status_(0) {} + virtual ~WriteBatchHandler() {} + int GetWiredTigerStatus() { return status_; } + + virtual void Put(const Slice& key, const Slice& value); + + virtual void Delete(const Slice& key); + +#ifdef HAVE_ROCKSDB + // Implementations are in rocksdb_wt.cc + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value); + virtual Status DeleteCF(uint32_t column_family_id, const Slice& key); +#endif + +private: + DbImpl *db_; + OperationContext *context_; + int status_; +}; + +#endif diff --git a/src/third_party/wiredtiger/api/leveldb/rocks_wt.cc b/src/third_party/wiredtiger/api/leveldb/rocks_wt.cc new file mode 100644 index 00000000000..6ccab2c1e78 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/rocks_wt.cc @@ -0,0 +1,315 @@ +/*- + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "leveldb_wt.h" +#include <errno.h> +#include <sys/stat.h> +#include <unistd.h> +#include <sstream> + +using leveldb::Cache; +using leveldb::DB; +using leveldb::FlushOptions; +using leveldb::FilterPolicy; +using leveldb::Iterator; +using leveldb::Options; +using leveldb::ReadOptions; +using leveldb::WriteBatch; +using leveldb::WriteOptions; +using leveldb::Range; +using leveldb::Slice; +using leveldb::Snapshot; +using leveldb::Status; + +static int +wtrocks_get_cursor(OperationContext *context, ColumnFamilyHandle *cfhp, WT_CURSOR **cursorp, int acquire=0) +{ + ColumnFamilyHandleImpl *cf = + static_cast<ColumnFamilyHandleImpl *>(cfhp); + if (cf == NULL) { + fprintf(stderr, "Missing column!\n"); + assert(0); + } + WT_CURSOR *c = context->GetCursor(cf->GetID()); + if (c == NULL) { + WT_SESSION *session = context->GetSession(); + int ret; + if ((ret = session->open_cursor( + session, cf->GetURI().c_str(), NULL, NULL, &c)) != 0) { + fprintf(stderr, "Failed to open cursor on %s: %s\n", cf->GetURI().c_str(), wiredtiger_strerror(ret)); + return (ret); + } + if (!acquire) + context->SetCursor(cf->GetID(), c); + } else if (acquire) + context->SetCursor(cf->GetID(), NULL); + *cursorp = c; + return (0); +} + +Status +DB::ListColumnFamilies( + Options const &options, std::string const &name, + std::vector<std::string> *column_families) +{ + std::vector<std::string> cf; + DB *dbptr; + Status status = DB::Open(options, name, &dbptr); + if (!status.ok()) + return status; + DbImpl *db = static_cast<DbImpl *>(dbptr); + OperationContext *context = db->GetContext(); + WT_SESSION *session = context->GetSession(); + WT_CURSOR *c; + int ret = session->open_cursor(session, "metadata:", NULL, NULL, &c); + if (ret != 0) + goto err; + c->set_key(c, "table:"); + /* Position on the first table entry */ + int cmp; + ret = c->search_near(c, &cmp); + if (ret != 0 || (cmp < 0 && (ret = c->next(c)) != 0)) + goto err; + /* Add entries while we are getting "table" URIs. */ + for (; ret == 0; ret = c->next(c)) { + const char *key; + if ((ret = c->get_key(c, &key)) != 0) + goto err; + if (strncmp(key, "table:", strlen("table:")) != 0) + break; + printf("List column families: [%d] = %s\n", (int)cf.size(), key); + cf.push_back(std::string(key + strlen("table:"))); + } + +err: delete db; + /* + * WT_NOTFOUND is not an error: it just means we got to the end of the + * list of tables. + */ + if (ret == 0 || ret == WT_NOTFOUND) { + *column_families = cf; + ret = 0; + } + return WiredTigerErrorToStatus(ret); +} + +Status +DB::Open(Options const &options, std::string const &name, const std::vector<ColumnFamilyDescriptor> &column_families, std::vector<ColumnFamilyHandle*> *handles, DB**dbptr) +{ + Status status = Open(options, name, dbptr); + if (!status.ok()) + return status; + DbImpl *db = static_cast<DbImpl *>(*dbptr); + std::vector<ColumnFamilyHandle*> cfhandles( + column_families.size()); + for (size_t i = 0; i < column_families.size(); i++) { + printf("Open column families: [%d] = %s\n", (int)i, column_families[i].name.c_str()); + cfhandles[i] = new ColumnFamilyHandleImpl( + db, column_families[i].name, (int)i); + } + db->SetColumns(*handles = cfhandles); + return Status::OK(); +} + +void +WriteBatch::Handler::Merge(const Slice& key, const Slice& value) +{ +} + +void +WriteBatch::Handler::LogData(const Slice& blob) +{ +} + +Status +WriteBatchHandler::PutCF( + uint32_t column_family_id, const Slice& key, const Slice& value) +{ + WT_CURSOR *cursor; + int ret = wtrocks_get_cursor(context_, db_->GetCF(column_family_id), &cursor); + if (ret != 0) + return WiredTigerErrorToStatus(ret); + WT_ITEM item; + item.data = key.data(); + item.size = key.size(); + cursor->set_key(cursor, &item); + item.data = value.data(); + item.size = value.size(); + cursor->set_value(cursor, &item); + ret = cursor->insert(cursor); + return WiredTigerErrorToStatus(ret); +} + +Status +WriteBatchHandler::DeleteCF(uint32_t column_family_id, const Slice& key) +{ + WT_CURSOR *cursor; + int ret = wtrocks_get_cursor(context_, db_->GetCF(column_family_id), &cursor); + if (ret != 0) + return WiredTigerErrorToStatus(ret); + WT_ITEM item; + item.data = key.data(); + item.size = key.size(); + cursor->set_key(cursor, &item); + ret = cursor->remove(cursor); + if (ret == 0) { + int t_ret = cursor->reset(cursor); + assert(t_ret == 0); + } else if (ret == WT_NOTFOUND) + ret = 0; + return WiredTigerErrorToStatus(ret); +} + +Status +DbImpl::Merge(WriteOptions const&, ColumnFamilyHandle*, Slice const&, Slice const&) +{ + return WiredTigerErrorToStatus(ENOTSUP); +} + +Status +DbImpl::CreateColumnFamily(Options const &options, std::string const &name, ColumnFamilyHandle **cfhp) +{ + extern int wtleveldb_create(WT_CONNECTION *, + const Options &, std::string const &uri); + int ret = wtleveldb_create(conn_, options, "table:" + name); + if (ret != 0) + return WiredTigerErrorToStatus(ret); + int id = (int)columns_.size(); + *cfhp = new ColumnFamilyHandleImpl(this, name, id); + printf("Create column family: [%d] = %s\n", id, name.c_str()); + columns_.push_back(*cfhp); + return Status::OK(); +} + +Status +DbImpl::DropColumnFamily(ColumnFamilyHandle *cfhp) +{ + ColumnFamilyHandleImpl *cf = + static_cast<ColumnFamilyHandleImpl *>(cfhp); + WT_SESSION *session = GetContext()->GetSession(); + int ret = session->drop(session, cf->GetURI().c_str(), NULL); + return WiredTigerErrorToStatus(ret); +} + +Status +DbImpl::Delete(WriteOptions const &write_options, ColumnFamilyHandle *cfhp, Slice const &key) +{ + WT_CURSOR *cursor; + int ret = wtrocks_get_cursor(GetContext(), cfhp, &cursor); + if (ret != 0) + return WiredTigerErrorToStatus(ret); + WT_ITEM item; + item.data = key.data(); + item.size = key.size(); + cursor->set_key(cursor, &item); + ret = cursor->remove(cursor); + // Reset the WiredTiger cursor so it doesn't keep any pages pinned. + // Track failures in debug builds since we don't expect failure, but + // don't pass failures on - it's not necessary for correct operation. + int t_ret = cursor->reset(cursor); + assert(t_ret == 0); + return WiredTigerErrorToStatus(ret); +} + +Status +DbImpl::Flush(FlushOptions const&, ColumnFamilyHandle* cfhp) +{ + ColumnFamilyHandleImpl *cf = + static_cast<ColumnFamilyHandleImpl *>(cfhp); + WT_SESSION *session = GetContext()->GetSession(); + return WiredTigerErrorToStatus(session->checkpoint(session, ("target=(\"" + cf->GetURI() + "\")").c_str())); +} + +Status +DbImpl::Get(ReadOptions const &options, ColumnFamilyHandle *cfhp, Slice const &key, std::string *value) +{ + const char *errmsg = NULL; + OperationContext *context = GetContext(options); + + WT_CURSOR *cursor; + int ret = wtrocks_get_cursor(context, cfhp, &cursor); + if (ret != 0) + return WiredTigerErrorToStatus(ret); + + WT_ITEM item; + item.data = key.data(); + item.size = key.size(); + cursor->set_key(cursor, &item); + if ((ret = cursor->search(cursor)) == 0 && + (ret = cursor->get_value(cursor, &item)) == 0) { + *value = std::string((const char *)item.data, item.size); + ret = cursor->reset(cursor); + } + if (ret == WT_NOTFOUND) + errmsg = "DB::Get key not found"; + return WiredTigerErrorToStatus(ret, errmsg); +} + +bool +DbImpl::GetProperty(ColumnFamilyHandle*, Slice const&, std::string*) +{ + return false; +} + +std::vector<Status> +DbImpl::MultiGet(ReadOptions const&, std::vector<ColumnFamilyHandle*> const&, std::vector<Slice> const&, std::vector<std::string, std::allocator<std::string> >*) +{ + std::vector<Status> ret; + ret.push_back(WiredTigerErrorToStatus(ENOTSUP)); + return ret; +} + +Iterator * +DbImpl::NewIterator(ReadOptions const &options, ColumnFamilyHandle *cfhp) +{ + OperationContext *context = GetContext(options); + + WT_CURSOR *c; + int ret = wtrocks_get_cursor(context, cfhp, &c, 1); + assert(ret == 0); + return new IteratorImpl(this, c, + static_cast<ColumnFamilyHandleImpl *>(cfhp)->GetID()); +} + +Status +DbImpl::Put(WriteOptions const &options, ColumnFamilyHandle *cfhp, Slice const &key, Slice const &value) +{ + WT_CURSOR *cursor; + int ret = wtrocks_get_cursor(GetContext(), cfhp, &cursor); + if (ret != 0) + return WiredTigerErrorToStatus(ret); + + WT_ITEM item; + item.data = key.data(); + item.size = key.size(); + cursor->set_key(cursor, &item); + item.data = value.data(); + item.size = value.size(); + cursor->set_value(cursor, &item); + ret = cursor->insert(cursor); + return WiredTigerErrorToStatus(ret, NULL); +} diff --git a/src/third_party/wiredtiger/api/leveldb/rocksdb/LICENSE b/src/third_party/wiredtiger/api/leveldb/rocksdb/LICENSE new file mode 100644 index 00000000000..b1329018690 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/rocksdb/LICENSE @@ -0,0 +1,35 @@ +BSD License + +For rocksdb software + +Copyright (c) 2014, Facebook, Inc. +All rights reserved. +--------------------------------------------------------------------- + +Copyright (c) 2011 The LevelDB Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/src/third_party/wiredtiger/api/leveldb/rocksdb/PATENTS b/src/third_party/wiredtiger/api/leveldb/rocksdb/PATENTS new file mode 100644 index 00000000000..6bafb4a342f --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/rocksdb/PATENTS @@ -0,0 +1,23 @@ +Additional Grant of Patent Rights + +"Software" means the rocksdb software distributed by Facebook, Inc. + +Facebook hereby grants you a perpetual, worldwide, royalty-free, +non-exclusive, irrevocable (subject to the termination provision below) +license under any rights in any patent claims owned by Facebook, to make, +have made, use, sell, offer to sell, import, and otherwise transfer the +Software. For avoidance of doubt, no license is granted under Facebook's +rights in any patent claims that are infringed by (i) modifications to the +Software made by you or a third party, or (ii) the Software in combination +with any software or other technology provided by you or a third party. + +The license granted hereunder will terminate, automatically and without +notice, for anyone that makes any claim (including by filing any lawsuit, +assertion or other action) alleging (a) direct, indirect, or contributory +infringement or inducement to infringe any patent: (i) by Facebook or any +of its subsidiaries or affiliates, whether or not such claim is related +to the Software, (ii) by any party if such claim arises in whole or in +part from any software, product or service of Facebook or any of its +subsidiaries or affiliates, whether or not such claim is related to the +Software, or (iii) by any party relating to the Software; or (b) that +any right in any patent claim of Facebook is invalid or unenforceable. diff --git a/src/third_party/wiredtiger/api/leveldb/rocksdb/write_batch.cc b/src/third_party/wiredtiger/api/leveldb/rocksdb/write_batch.cc new file mode 100644 index 00000000000..aa7a3d239f9 --- /dev/null +++ b/src/third_party/wiredtiger/api/leveldb/rocksdb/write_batch.cc @@ -0,0 +1,275 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// WriteBatch::rep_ := +// sequence: fixed64 +// count: fixed32 +// data: record[count] +// record := +// kTypeValue varstring varstring +// kTypeMerge varstring varstring +// kTypeDeletion varstring +// kTypeColumnFamilyValue varint32 varstring varstring +// kTypeColumnFamilyMerge varint32 varstring varstring +// kTypeColumnFamilyDeletion varint32 varstring varstring +// varstring := +// len: varint32 +// data: uint8[len] + +#include "leveldb_wt.h" + +#include "db/write_batch_internal.h" + +#include <stdexcept> + +namespace rocksdb { + +// WriteBatch header has an 8-byte sequence number followed by a 4-byte count. +static const size_t kHeader = 12; + +WriteBatch::WriteBatch(size_t reserved_bytes) { + rep_.reserve((reserved_bytes > kHeader) ? reserved_bytes : kHeader); + Clear(); +} + +WriteBatch::~WriteBatch() { } + +WriteBatch::Handler::~Handler() { } + +void WriteBatch::Handler::Put(const Slice& key, const Slice& value) { + // you need to either implement Put or PutCF + throw std::runtime_error("Handler::Put not implemented!"); +} + +#ifdef NOT_YET +void WriteBatch::Handler::Merge(const Slice& key, const Slice& value) { + throw std::runtime_error("Handler::Merge not implemented!"); +} +#endif + +void WriteBatch::Handler::Delete(const Slice& key) { + // you need to either implement Delete or DeleteCF + throw std::runtime_error("Handler::Delete not implemented!"); +} + +#ifdef NOT_YET +void WriteBatch::Handler::LogData(const Slice& blob) { + // If the user has not specified something to do with blobs, then we ignore + // them. +} +#endif + +bool WriteBatch::Handler::Continue() { + return true; +} + +void WriteBatch::Clear() { + rep_.clear(); + rep_.resize(kHeader); +} + +int WriteBatch::Count() const { + return WriteBatchInternal::Count(this); +} + +Status WriteBatch::Iterate(Handler* handler) const { + Slice input(rep_); + if (input.size() < kHeader) { + return Status::Corruption("malformed WriteBatch (too small)"); + } + + input.remove_prefix(kHeader); + Slice key, value, blob; + int found = 0; + Status s; + while (s.ok() && !input.empty() && handler->Continue()) { + char tag = input[0]; + input.remove_prefix(1); + uint32_t column_family = 0; // default + switch (tag) { + case kTypeColumnFamilyValue: + if (!GetVarint32(&input, &column_family)) { + return Status::Corruption("bad WriteBatch Put"); + } + // intentional fallthrough + case kTypeValue: + if (GetLengthPrefixedSlice(&input, &key) && + GetLengthPrefixedSlice(&input, &value)) { + s = handler->PutCF(column_family, key, value); + found++; + } else { + return Status::Corruption("bad WriteBatch Put"); + } + break; + case kTypeColumnFamilyDeletion: + if (!GetVarint32(&input, &column_family)) { + return Status::Corruption("bad WriteBatch Delete"); + } + // intentional fallthrough + case kTypeDeletion: + if (GetLengthPrefixedSlice(&input, &key)) { + s = handler->DeleteCF(column_family, key); + found++; + } else { + return Status::Corruption("bad WriteBatch Delete"); + } + break; + case kTypeColumnFamilyMerge: + if (!GetVarint32(&input, &column_family)) { + return Status::Corruption("bad WriteBatch Merge"); + } + // intentional fallthrough + case kTypeMerge: + if (GetLengthPrefixedSlice(&input, &key) && + GetLengthPrefixedSlice(&input, &value)) { + s = handler->MergeCF(column_family, key, value); + found++; + } else { + return Status::Corruption("bad WriteBatch Merge"); + } + break; + case kTypeLogData: + if (GetLengthPrefixedSlice(&input, &blob)) { + handler->LogData(blob); + } else { + return Status::Corruption("bad WriteBatch Blob"); + } + break; + default: + return Status::Corruption("unknown WriteBatch tag"); + } + } + if (!s.ok()) { + return s; + } + if (found != WriteBatchInternal::Count(this)) { + return Status::Corruption("WriteBatch has wrong count"); + } else { + return Status::OK(); + } +} + +int WriteBatchInternal::Count(const WriteBatch* b) { + return DecodeFixed32(b->rep_.data() + 8); +} + +void WriteBatchInternal::SetCount(WriteBatch* b, int n) { + EncodeFixed32(&b->rep_[8], n); +} + +#ifdef NOT_YET +SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) { + return SequenceNumber(DecodeFixed64(b->rep_.data())); +} + +void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) { + EncodeFixed64(&b->rep_[0], seq); +} +#endif + +void WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, + const Slice& key, const Slice& value) { + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast<char>(kTypeValue)); + } else { + b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSlice(&b->rep_, key); + PutLengthPrefixedSlice(&b->rep_, value); +} + +namespace { +inline uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) { + uint32_t column_family_id = 0; + if (column_family != NULL) { + ColumnFamilyHandleImpl *cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family); + column_family_id = cfh->GetID(); + } + return column_family_id; +} +} // namespace + +void WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) { + WriteBatchInternal::Put(this, GetColumnFamilyID(column_family), key, value); +} + +void WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, + const SliceParts& key, const SliceParts& value) { + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast<char>(kTypeValue)); + } else { + b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSliceParts(&b->rep_, key); + PutLengthPrefixedSliceParts(&b->rep_, value); +} + +void WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) { + WriteBatchInternal::Put(this, GetColumnFamilyID(column_family), key, value); +} + +void WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id, + const Slice& key) { + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast<char>(kTypeDeletion)); + } else { + b->rep_.push_back(static_cast<char>(kTypeColumnFamilyDeletion)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSlice(&b->rep_, key); +} + +void WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) { + WriteBatchInternal::Delete(this, GetColumnFamilyID(column_family), key); +} + +#ifdef NOT_YET +void WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id, + const Slice& key, const Slice& value) { + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast<char>(kTypeMerge)); + } else { + b->rep_.push_back(static_cast<char>(kTypeColumnFamilyMerge)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSlice(&b->rep_, key); + PutLengthPrefixedSlice(&b->rep_, value); +} + +void WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) { + WriteBatchInternal::Merge(this, GetColumnFamilyID(column_family), key, value); +} + +void WriteBatch::PutLogData(const Slice& blob) { + rep_.push_back(static_cast<char>(kTypeLogData)); + PutLengthPrefixedSlice(&rep_, blob); +} +#endif + +void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { + assert(contents.size() >= kHeader); + b->rep_.assign(contents.data(), contents.size()); +} + +void WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src) { + SetCount(dst, Count(dst) + Count(src)); + assert(src->rep_.size() >= kHeader); + dst->rep_.append(src->rep_.data() + kHeader, src->rep_.size() - kHeader); +} + +} // namespace rocksdb |