// @file file_allocator.cpp
/* Copyright 2009 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see .
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects
* for all of the code used other than as permitted herein. If you modify
* file(s) with this exception, you may extend this exception to your
* version of the file(s), but you are not obligated to do so. If you do not
* wish to do so, delete this exception statement from your version. If you
* delete this exception statement from all source files in the program,
* then also delete it in the license file.
*/
#include "mongo/platform/basic.h"
#include "mongo/util/file_allocator.h"
#include
#include
#include
#include
#if defined(__freebsd__)
# include
# include
#endif
#if defined(__linux__)
# include
#endif
#if defined(_WIN32)
# include
#endif
#include "mongo/platform/posix_fadvise.h"
#include "mongo/stdx/functional.h"
#include "mongo/util/concurrency/thread_name.h"
#include "mongo/util/log.h"
#include "mongo/util/mongoutils/str.h"
#include "mongo/util/paths.h"
#include "mongo/util/processinfo.h"
#include "mongo/util/time_support.h"
#include "mongo/util/timer.h"
using namespace mongoutils;
#ifndef O_NOATIME
#define O_NOATIME (0)
#endif
namespace mongo {
MONGO_LOG_DEFAULT_COMPONENT_FILE(::mongo::logger::LogComponent::kStorage);
// unique number for temporary file names
unsigned long long FileAllocator::_uniqueNumber = 0;
static SimpleMutex _uniqueNumberMutex( "uniqueNumberMutex" );
/**
* Aliases for Win32 CRT functions
*/
#if defined(_WIN32)
static inline long lseek(int fd, long offset, int origin) { return _lseek(fd, offset, origin); }
static inline int write(int fd, const void *data, int count) { return _write(fd, data, count); }
static inline int close(int fd) { return _close(fd); }
#endif
boost::filesystem::path ensureParentDirCreated(const boost::filesystem::path& p){
const boost::filesystem::path parent = p.branch_path();
if (! boost::filesystem::exists(parent)){
ensureParentDirCreated(parent);
log() << "creating directory " << parent.string() << endl;
boost::filesystem::create_directory(parent);
flushMyDirectory(parent); // flushes grandparent to ensure parent exists after crash
}
verify(boost::filesystem::is_directory(parent));
return parent;
}
FileAllocator::FileAllocator()
: _pendingMutex("FileAllocator"), _failed() {
}
void FileAllocator::start() {
boost::thread t( stdx::bind( &FileAllocator::run , this ) );
}
void FileAllocator::requestAllocation( const string &name, long &size ) {
scoped_lock lk( _pendingMutex );
if ( _failed )
return;
long oldSize = prevSize( name );
if ( oldSize != -1 ) {
size = oldSize;
return;
}
_pending.push_back( name );
_pendingSize[ name ] = size;
_pendingUpdated.notify_all();
}
void FileAllocator::allocateAsap( const string &name, unsigned long long &size ) {
scoped_lock lk( _pendingMutex );
long oldSize = prevSize( name );
if ( oldSize != -1 ) {
size = oldSize;
if ( !inProgress( name ) )
return;
}
checkFailure();
_pendingSize[ name ] = size;
if ( _pending.size() == 0 )
_pending.push_back( name );
else if ( _pending.front() != name ) {
_pending.remove( name );
list< string >::iterator i = _pending.begin();
++i;
_pending.insert( i, name );
}
_pendingUpdated.notify_all();
while( inProgress( name ) ) {
checkFailure();
_pendingUpdated.wait( lk.boost() );
}
}
void FileAllocator::waitUntilFinished() const {
if ( _failed )
return;
scoped_lock lk( _pendingMutex );
while( _pending.size() != 0 )
_pendingUpdated.wait( lk.boost() );
}
// TODO: pull this out to per-OS files once they exist
static bool useSparseFiles(int fd) {
#if defined(__linux__) || defined(__freebsd__)
struct statfs fs_stats;
int ret = fstatfs(fd, &fs_stats);
uassert(16062, "fstatfs failed: " + errnoWithDescription(), ret == 0);
#endif
#if defined(__linux__)
// these are from but that isn't available on all systems
# define NFS_SUPER_MAGIC 0x6969
return (fs_stats.f_type == NFS_SUPER_MAGIC);
#elif defined(__freebsd__)
return (str::equals(fs_stats.f_fstypename, "zfs") ||
str::equals(fs_stats.f_fstypename, "nfs") ||
str::equals(fs_stats.f_fstypename, "oldnfs"));
#elif defined(__sunos__)
// assume using ZFS which is copy-on-write so no benefit to zero-filling
// TODO: check which fs we are using like we do elsewhere
return true;
#else
return false;
#endif
}
void FileAllocator::ensureLength(int fd , long size) {
#if !defined(_WIN32)
if (useSparseFiles(fd)) {
LOG(1) << "using ftruncate to create a sparse file" << endl;
int ret = ftruncate(fd, size);
uassert(16063, "ftruncate failed: " + errnoWithDescription(), ret == 0);
return;
}
#endif
#if defined(__linux__)
int ret = posix_fallocate(fd,0,size);
if ( ret == 0 )
return;
log() << "FileAllocator: posix_fallocate failed: " << errnoWithDescription( ret ) << " falling back" << endl;
#endif
off_t filelen = lseek( fd, 0, SEEK_END );
if ( filelen < size ) {
if (filelen != 0) {
stringstream ss;
ss << "failure creating new datafile; lseek failed for fd " << fd << " with errno: " << errnoWithDescription();
uassert( 10440 , ss.str(), filelen == 0 );
}
// Check for end of disk.
uassert( 10441 , str::stream() << "Unable to allocate new file of size " << size << ' ' << errnoWithDescription(),
size - 1 == lseek(fd, size - 1, SEEK_SET) );
uassert( 10442 , str::stream() << "Unable to allocate new file of size " << size << ' ' << errnoWithDescription(),
1 == write(fd, "", 1) );
// File expansion is completed here. Do not do the zeroing out on OS-es where there
// is no risk of triggering allocation-related bugs such as
// http://support.microsoft.com/kb/2731284.
//
if (!ProcessInfo::isDataFileZeroingNeeded()) {
return;
}
lseek(fd, 0, SEEK_SET);
const long z = 256 * 1024;
const boost::scoped_array buf_holder (new char[z]);
char* buf = buf_holder.get();
memset(buf, 0, z);
long left = size;
while ( left > 0 ) {
long towrite = left;
if ( towrite > z )
towrite = z;
int written = write( fd , buf , towrite );
uassert( 10443 , errnoWithPrefix("FileAllocator: file write failed" ), written > 0 );
left -= written;
}
}
}
bool FileAllocator::hasFailed() const {
return _failed;
}
void FileAllocator::checkFailure() {
if (_failed) {
// we want to log the problem (diskfull.js expects it) but we do not want to dump a stack tracke
msgassertedNoTrace( 12520, "new file allocation failure" );
}
}
long FileAllocator::prevSize( const string &name ) const {
if ( _pendingSize.count( name ) > 0 )
return _pendingSize[ name ];
if ( boost::filesystem::exists( name ) )
return boost::filesystem::file_size( name );
return -1;
}
// caller must hold _pendingMutex lock.
bool FileAllocator::inProgress( const string &name ) const {
for( list< string >::const_iterator i = _pending.begin(); i != _pending.end(); ++i )
if ( *i == name )
return true;
return false;
}
string FileAllocator::makeTempFileName( boost::filesystem::path root ) {
while( 1 ) {
boost::filesystem::path p = root / "_tmp";
stringstream ss;
unsigned long long thisUniqueNumber;
{
// increment temporary file name counter
// TODO: SERVER-6055 -- Unify temporary file name selection
SimpleMutex::scoped_lock lk(_uniqueNumberMutex);
thisUniqueNumber = _uniqueNumber;
++_uniqueNumber;
}
ss << thisUniqueNumber;
p /= ss.str();
string fn = p.string();
if( !boost::filesystem::exists(p) )
return fn;
}
return "";
}
void FileAllocator::run( FileAllocator * fa ) {
setThreadName( "FileAllocator" );
{
// initialize unique temporary file name counter
// TODO: SERVER-6055 -- Unify temporary file name selection
SimpleMutex::scoped_lock lk(_uniqueNumberMutex);
_uniqueNumber = curTimeMicros64();
}
while( 1 ) {
{
scoped_lock lk( fa->_pendingMutex );
if ( fa->_pending.size() == 0 )
fa->_pendingUpdated.wait( lk.boost() );
}
while( 1 ) {
string name;
long size = 0;
{
scoped_lock lk( fa->_pendingMutex );
if ( fa->_pending.size() == 0 )
break;
name = fa->_pending.front();
size = fa->_pendingSize[ name ];
}
string tmp;
long fd = 0;
try {
log() << "allocating new datafile " << name << ", filling with zeroes..." << endl;
boost::filesystem::path parent = ensureParentDirCreated(name);
tmp = fa->makeTempFileName( parent );
ensureParentDirCreated(tmp);
#if defined(_WIN32)
fd = _open( tmp.c_str(), _O_RDWR | _O_CREAT | O_NOATIME, _S_IREAD | _S_IWRITE );
#else
fd = open(tmp.c_str(), O_CREAT | O_RDWR | O_NOATIME, S_IRUSR | S_IWUSR);
#endif
if ( fd < 0 ) {
log() << "FileAllocator: couldn't create " << name << " (" << tmp << ") " << errnoWithDescription() << endl;
uasserted(10439, "");
}
#if defined(POSIX_FADV_DONTNEED)
if( posix_fadvise(fd, 0, size, POSIX_FADV_DONTNEED) ) {
log() << "warning: posix_fadvise fails " << name << " (" << tmp << ") " << errnoWithDescription() << endl;
}
#endif
Timer t;
/* make sure the file is the full desired length */
ensureLength( fd , size );
close( fd );
fd = 0;
if( rename(tmp.c_str(), name.c_str()) ) {
const string& errStr = errnoWithDescription();
const string& errMessage = str::stream()
<< "error: couldn't rename " << tmp
<< " to " << name << ' ' << errStr;
msgasserted(13653, errMessage);
}
flushMyDirectory(name);
log() << "done allocating datafile " << name << ", "
<< "size: " << size/1024/1024 << "MB, "
<< " took " << ((double)t.millis())/1000.0 << " secs"
<< endl;
// no longer in a failed state. allow new writers.
fa->_failed = false;
}
catch ( const std::exception& e ) {
log() << "error: failed to allocate new file: " << name
<< " size: " << size << ' ' << e.what()
<< ". will try again in 10 seconds" << endl;
if ( fd > 0 )
close( fd );
try {
if ( ! tmp.empty() )
boost::filesystem::remove( tmp );
boost::filesystem::remove( name );
} catch ( const std::exception& e ) {
log() << "error removing files: " << e.what() << endl;
}
scoped_lock lk( fa->_pendingMutex );
fa->_failed = true;
// not erasing from pending
fa->_pendingUpdated.notify_all();
sleepsecs(10);
continue;
}
{
scoped_lock lk( fa->_pendingMutex );
fa->_pendingSize.erase( name );
fa->_pending.pop_front();
fa->_pendingUpdated.notify_all();
}
}
}
}
FileAllocator* FileAllocator::_instance = 0;
FileAllocator* FileAllocator::get(){
if ( ! _instance )
_instance = new FileAllocator();
return _instance;
}
} // namespace mongo