diff options
Diffstat (limited to 'src/assistant/3rdparty/clucene/src/CLucene/index/SegmentTermDocs.cpp')
-rw-r--r-- | src/assistant/3rdparty/clucene/src/CLucene/index/SegmentTermDocs.cpp | 216 |
1 files changed, 216 insertions, 0 deletions
diff --git a/src/assistant/3rdparty/clucene/src/CLucene/index/SegmentTermDocs.cpp b/src/assistant/3rdparty/clucene/src/CLucene/index/SegmentTermDocs.cpp new file mode 100644 index 000000000..50951e9ba --- /dev/null +++ b/src/assistant/3rdparty/clucene/src/CLucene/index/SegmentTermDocs.cpp @@ -0,0 +1,216 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "SegmentHeader.h" + +#include "CLucene/store/IndexInput.h" +#include "Term.h" + +CL_NS_DEF(index) + + SegmentTermDocs::SegmentTermDocs(const SegmentReader* _parent){ + //Func - Constructor + //Pre - Paren != NULL + //Post - The instance has been created + + CND_PRECONDITION(_parent != NULL,"Parent is NULL"); + + parent = _parent; + deletedDocs = parent->deletedDocs; + + _doc = 0; + _freq = 0; + count = 0; + df = 0; + + skipInterval=0; + numSkips=0; + skipCount=0; + skipStream=NULL; + skipDoc=0; + freqPointer=0; + proxPointer=0; + skipPointer=0; + haveSkipped=false; + + freqStream = parent->freqStream->clone(); + skipInterval = parent->tis->getSkipInterval(); + } + + SegmentTermDocs::~SegmentTermDocs() { + //Func - Destructor + //Pre - true + //Post - The instance has been destroyed + + close(); + } + + TermPositions* SegmentTermDocs::__asTermPositions(){ + return NULL; + } + + void SegmentTermDocs::seek(Term* term) { + TermInfo* ti = parent->tis->get(term); + seek(ti); + _CLDELETE(ti); + } + + void SegmentTermDocs::seek(TermEnum* termEnum){ + TermInfo* ti=NULL; + + // use comparison of fieldinfos to verify that termEnum belongs to the same segment as this SegmentTermDocs + if ( termEnum->getObjectName() == SegmentTermEnum::getClassName() ){ + SegmentTermEnum* te = (SegmentTermEnum*)termEnum; + te->fieldInfos = parent->fieldInfos; + ti = te->getTermInfo(); + }else{ + ti = parent->tis->get(termEnum->term(false)); + } + + seek(ti); + _CLDELETE(ti); + } + void SegmentTermDocs::seek(const TermInfo* ti) { + count = 0; + if (ti == NULL) { + df = 0; + } else { + df = ti->docFreq; + _doc = 0; + skipDoc = 0; + skipCount = 0; + numSkips = df / skipInterval; + freqPointer = ti->freqPointer; + proxPointer = ti->proxPointer; + skipPointer = freqPointer + ti->skipOffset; + freqStream->seek(freqPointer); + haveSkipped = false; + } + } + + void SegmentTermDocs::close() { + + //Check if freqStream still exists + if (freqStream != NULL){ + freqStream->close(); //todo: items like these can probably be delete, because deleting the object also closes it...do everywhere + _CLDELETE( freqStream ); + } + if (skipStream != NULL){ + skipStream->close(); + _CLDELETE( skipStream ); + } + } + + int32_t SegmentTermDocs::doc()const { + return _doc; + } + int32_t SegmentTermDocs::freq()const { + return _freq; + } + + +bool SegmentTermDocs::next() +{ + while (true) { + if (count == df) + return false; + + uint32_t docCode = freqStream->readVInt(); + _doc += docCode >> 1; //unsigned shift + if ((docCode & 1) != 0) // if low bit is set + _freq = 1; // _freq is one + else + _freq = freqStream->readVInt(); // else read _freq + count++; + + if (deletedDocs == NULL || (_doc >= 0 && !deletedDocs->get(_doc))) + break; + skippingDoc(); + } + return true; +} + + +int32_t SegmentTermDocs::read(int32_t* docs, int32_t* freqs, int32_t length) +{ + int32_t i = 0; + // TODO: one optimization would be to get the pointer buffer for ram or mmap + // dirs and iterate over them instead of using readByte() intensive functions. + while (i < length && count < df) { + uint32_t docCode = freqStream->readVInt(); + _doc += docCode >> 1; + if ((docCode & 1) != 0) // if low bit is set + _freq = 1; // _freq is one + else + _freq = freqStream->readVInt(); // else read _freq + count++; + + if (deletedDocs == NULL || (_doc >= 0 && !deletedDocs->get(_doc))) { + docs[i] = _doc; + freqs[i] = _freq; + i++; + } + } + return i; +} + + bool SegmentTermDocs::skipTo(const int32_t target){ + if (df >= skipInterval) { // optimized case + if (skipStream == NULL) + skipStream = freqStream->clone(); // lazily clone + + if (!haveSkipped) { // lazily seek skip stream + skipStream->seek(skipPointer); + haveSkipped = true; + } + + // scan skip data + int32_t lastSkipDoc = skipDoc; + int64_t lastFreqPointer = freqStream->getFilePointer(); + int64_t lastProxPointer = -1; + int32_t numSkipped = -1 - (count % skipInterval); + + while (target > skipDoc) { + lastSkipDoc = skipDoc; + lastFreqPointer = freqPointer; + lastProxPointer = proxPointer; + + if (skipDoc != 0 && skipDoc >= _doc) + numSkipped += skipInterval; + + if(skipCount >= numSkips) + break; + + skipDoc += skipStream->readVInt(); + freqPointer += skipStream->readVInt(); + proxPointer += skipStream->readVInt(); + + skipCount++; + } + + // if we found something to skip, then skip it + if (lastFreqPointer > freqStream->getFilePointer()) { + freqStream->seek(lastFreqPointer); + skipProx(lastProxPointer); + + _doc = lastSkipDoc; + count += numSkipped; + } + + } + + // done skipping, now just scan + + do { + if (!next()) + return false; + } while (target > _doc); + return true; + } + + +CL_NS_END |