diff options
Diffstat (limited to 'ext/sqlite/libsqlite/src/pager.c')
-rw-r--r-- | ext/sqlite/libsqlite/src/pager.c | 216 |
1 files changed, 172 insertions, 44 deletions
diff --git a/ext/sqlite/libsqlite/src/pager.c b/ext/sqlite/libsqlite/src/pager.c index f593862104..23d3fb8009 100644 --- a/ext/sqlite/libsqlite/src/pager.c +++ b/ext/sqlite/libsqlite/src/pager.c @@ -84,6 +84,19 @@ static Pager *mainPager = 0; ** Each in-memory image of a page begins with the following header. ** This header is only visible to this pager module. The client ** code that calls pager sees only the data that follows the header. +** +** Client code should call sqlitepager_write() on a page prior to making +** any modifications to that page. The first time sqlitepager_write() +** is called, the original page contents are written into the rollback +** journal and PgHdr.inJournal and PgHdr.needSync are set. Later, once +** the journal page has made it onto the disk surface, PgHdr.needSync +** is cleared. The modified page cannot be written back into the original +** database file until the journal pages has been synced to disk and the +** PgHdr.needSync has been cleared. +** +** The PgHdr.dirty flag is set when sqlitepager_write() is called and +** is cleared again when the page content is written back to the original +** database file. */ typedef struct PgHdr PgHdr; struct PgHdr { @@ -104,6 +117,16 @@ struct PgHdr { /* Pager.nExtra bytes of local data follow the page data */ }; + +/* +** A macro used for invoking the codec if there is one +*/ +#ifdef SQLITE_HAS_CODEC +# define CODEC(P,D,N,X) if( P->xCodec ){ P->xCodec(P->pCodecArg,D,N,X); } +#else +# define CODEC(P,D,N,X) +#endif + /* ** Convert a pointer to a PgHdr into a pointer to its data ** and back again. @@ -145,9 +168,11 @@ struct Pager { int nRef; /* Number of in-memory pages with PgHdr.nRef>0 */ int mxPage; /* Maximum number of pages to hold in cache */ int nHit, nMiss, nOvfl; /* Cache hits, missing, and LRU overflows */ + void (*xCodec)(void*,void*,Pgno,int); /* Routine for en/decoding data */ + void *pCodecArg; /* First argument to xCodec() */ u8 journalOpen; /* True if journal file descriptors is valid */ - u8 journalStarted; /* True if initial magic of journal is synced */ - u8 useJournal; /* Do not use a rollback journal on this file */ + u8 journalStarted; /* True if header of journal is synced */ + u8 useJournal; /* Use a rollback journal on this file */ u8 ckptOpen; /* True if the checkpoint journal is open */ u8 ckptInUse; /* True we are in a checkpoint */ u8 ckptAutoopen; /* Open ckpt journal when main journal is opened*/ @@ -188,8 +213,8 @@ struct Pager { */ typedef struct PageRecord PageRecord; struct PageRecord { - Pgno pgno; /* The page number */ - char aData[SQLITE_PAGE_SIZE]; /* Original data for page pgno */ + Pgno pgno; /* The page number */ + char aData[SQLITE_PAGE_SIZE]; /* Original data for page pgno */ }; /* @@ -279,7 +304,13 @@ int journal_format = 3; #endif /* -** Read a 32-bit integer from the given file descriptor +** Read a 32-bit integer from the given file descriptor. Store the integer +** that is read in *pRes. Return SQLITE_OK if everything worked, or an +** error code is something goes wrong. +** +** If the journal format is 2 or 3, read a big-endian integer. If the +** journal format is 1, read an integer in the native byte-order of the +** host machine. */ static int read32bits(int format, OsFile *fd, u32 *pRes){ u32 res; @@ -295,8 +326,13 @@ static int read32bits(int format, OsFile *fd, u32 *pRes){ } /* -** Write a 32-bit integer into the given file descriptor. Writing -** is always done using the new journal format. +** Write a 32-bit integer into the given file descriptor. Return SQLITE_OK +** on success or an error code is something goes wrong. +** +** If the journal format is 2 or 3, write the integer as 4 big-endian +** bytes. If the journal format is 1, write the integer in the native +** byte order. In normal operation, only formats 2 and 3 are used. +** Journal format 1 is only used for testing. */ static int write32bits(OsFile *fd, u32 val){ unsigned char ac[4]; @@ -313,6 +349,9 @@ static int write32bits(OsFile *fd, u32 val){ /* ** Write a 32-bit integer into a page header right before the ** page data. This will overwrite the PgHdr.pDirty pointer. +** +** The integer is big-endian for formats 2 and 3 and native byte order +** for journal format 1. */ static void store32bits(u32 val, PgHdr *p, int offset){ unsigned char *ac; @@ -469,6 +508,10 @@ static int pager_unwritelock(Pager *pPager){ /* ** Compute and return a checksum for the page of data. +** +** This is not a real checksum. It is really just the sum of the +** random initial value and the page number. We considered do a checksum +** of the database, but that was found to be too slow. */ static u32 pager_cksum(Pager *pPager, Pgno pgno, const char *aData){ u32 cksum = pPager->cksumInit + pgno; @@ -529,6 +572,7 @@ static int pager_playback_one_page(Pager *pPager, OsFile *jfd, int format){ memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra); pPg->dirty = 0; pPg->needSync = 0; + CODEC(pPager, PGHDR_TO_DATA(pPg), pPg->pgno, 3); } return rc; } @@ -537,21 +581,53 @@ static int pager_playback_one_page(Pager *pPager, OsFile *jfd, int format){ ** Playback the journal and thus restore the database file to ** the state it was in before we started making changes. ** -** The journal file format is as follows: There is an initial -** file-type string for sanity checking. Then there is a single -** Pgno number which is the number of pages in the database before -** changes were made. The database is truncated to this size. -** Next come zero or more page records where each page record -** consists of a Pgno and SQLITE_PAGE_SIZE bytes of data. See -** the PageRecord structure for details. +** The journal file format is as follows: +** +** * 8 byte prefix. One of the aJournalMagic123 vectors defined +** above. The format of the journal file is determined by which +** of the three prefix vectors is seen. +** * 4 byte big-endian integer which is the number of valid page records +** in the journal. If this value is 0xffffffff, then compute the +** number of page records from the journal size. This field appears +** in format 3 only. +** * 4 byte big-endian integer which is the initial value for the +** sanity checksum. This field appears in format 3 only. +** * 4 byte integer which is the number of pages to truncate the +** database to during a rollback. +** * Zero or more pages instances, each as follows: +** + 4 byte page number. +** + SQLITE_PAGE_SIZE bytes of data. +** + 4 byte checksum (format 3 only) +** +** When we speak of the journal header, we mean the first 4 bullets above. +** Each entry in the journal is an instance of the 5th bullet. Note that +** bullets 2 and 3 only appear in format-3 journals. +** +** Call the value from the second bullet "nRec". nRec is the number of +** valid page entries in the journal. In most cases, you can compute the +** value of nRec from the size of the journal file. But if a power +** failure occurred while the journal was being written, it could be the +** case that the size of the journal file had already been increased but +** the extra entries had not yet made it safely to disk. In such a case, +** the value of nRec computed from the file size would be too large. For +** that reason, we always use the nRec value in the header. +** +** If the nRec value is 0xffffffff it means that nRec should be computed +** from the file size. This value is used when the user selects the +** no-sync option for the journal. A power failure could lead to corruption +** in this case. But for things like temporary table (which will be +** deleted when the power is restored) we don't care. +** +** Journal formats 1 and 2 do not have an nRec value in the header so we +** have to compute nRec from the file size. This has risks (as described +** above) which is why all persistent tables have been changed to use +** format 3. ** ** If the file opened as the journal file is not a well-formed -** journal file (as determined by looking at the magic number -** at the beginning) then this routine returns SQLITE_PROTOCOL. -** If any other errors occur during playback, the database will -** likely be corrupted, so the PAGER_ERR_CORRUPT bit is set in -** pPager->errMask and SQLITE_CORRUPT is returned. If it all -** works, then this routine returns SQLITE_OK. +** journal file then the database will likely already be +** corrupted, so the PAGER_ERR_CORRUPT bit is set in pPager->errMask +** and SQLITE_CORRUPT is returned. If it all works, then this routine +** returns SQLITE_OK. */ static int pager_playback(Pager *pPager, int useJournalSize){ off_t szJ; /* Size of the journal file in bytes */ @@ -571,6 +647,13 @@ static int pager_playback(Pager *pPager, int useJournalSize){ if( rc!=SQLITE_OK ){ goto end_playback; } + + /* If the journal file is too small to contain a complete header, + ** it must mean that the process that created the journal was just + ** beginning to write the journal file when it died. In that case, + ** the database file should have still been completely unchanged. + ** Nothing needs to be rolled back. We can safely ignore this journal. + */ if( szJ < sizeof(aMagic)+sizeof(Pgno) ){ goto end_playback; } @@ -594,6 +677,15 @@ static int pager_playback(Pager *pPager, int useJournalSize){ goto end_playback; } if( format>=JOURNAL_FORMAT_3 ){ + if( szJ < sizeof(aMagic) + 3*sizeof(u32) ){ + /* Ignore the journal if it is too small to contain a complete + ** header. We already did this test once above, but at the prior + ** test, we did not know the journal format and so we had to assume + ** the smallest possible header. Now we know the header is bigger + ** than the minimum so we test again. + */ + goto end_playback; + } rc = read32bits(format, &pPager->jfd, (u32*)&nRec); if( rc ) goto end_playback; rc = read32bits(format, &pPager->jfd, &pPager->cksumInit); @@ -630,7 +722,7 @@ static int pager_playback(Pager *pPager, int useJournalSize){ /* Pages that have been written to the journal but never synced ** where not restored by the loop above. We have to restore those - ** pages by reading the back from the original database. + ** pages by reading them back from the original database. */ if( rc==SQLITE_OK ){ PgHdr *pPg; @@ -640,6 +732,8 @@ static int pager_playback(Pager *pPager, int useJournalSize){ if( (int)pPg->pgno <= pPager->origDbSize ){ sqliteOsSeek(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)(pPg->pgno-1)); rc = sqliteOsRead(&pPager->fd, zBuf, SQLITE_PAGE_SIZE); + TRACE2("REFETCH %d\n", pPg->pgno); + CODEC(pPager, zBuf, pPg->pgno, 2); if( rc ) break; }else{ memset(zBuf, 0, SQLITE_PAGE_SIZE); @@ -747,6 +841,7 @@ end_ckpt_playback: void sqlitepager_set_cachesize(Pager *pPager, int mxPage){ if( mxPage>=0 ){ pPager->noSync = pPager->tempFile; + if( pPager->noSync==0 ) pPager->needSync = 0; }else{ pPager->noSync = 1; mxPage = -mxPage; @@ -772,8 +867,9 @@ void sqlitepager_set_cachesize(Pager *pPager, int mxPage){ ** when it is rolled back. ** ** FULL The journal is synced twice before writes begin on the -** database (with some additional information being written -** in between the two syncs. If we assume that writing a +** database (with some additional information - the nRec field +** of the journal header - being written in between the two +** syncs). If we assume that writing a ** single disk sector is atomic, then this mode provides ** assurance that the journal will not be corrupted to the ** point of causing damage to the database during rollback. @@ -784,6 +880,7 @@ void sqlitepager_set_cachesize(Pager *pPager, int mxPage){ void sqlitepager_set_safety_level(Pager *pPager, int level){ pPager->noSync = level==1 || pPager->tempFile; pPager->fullSync = level==3 && !pPager->tempFile; + if( pPager->noSync==0 ) pPager->needSync = 0; } /* @@ -933,7 +1030,7 @@ int sqlitepager_pagecount(Pager *pPager){ /* ** Forward declaration */ -static int syncAllPages(Pager*); +static int syncJournal(Pager*); /* ** Truncate the file to the number of pages specified. @@ -950,7 +1047,7 @@ int sqlitepager_truncate(Pager *pPager, Pgno nPage){ if( nPage>=(unsigned)pPager->dbSize ){ return SQLITE_OK; } - syncAllPages(pPager); + syncJournal(pPager); rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)nPage); if( rc==SQLITE_OK ){ pPager->dbSize = nPage; @@ -1056,23 +1153,26 @@ int sqlitepager_ref(void *pData){ } /* -** Sync the journal and then write all free dirty pages to the database -** file. +** Sync the journal. In other words, make sure all the pages that have +** been written to the journal have actually reached the surface of the +** disk. It is not safe to modify the original database file until after +** the journal has been synced. If the original database is modified before +** the journal is synced and a power failure occurs, the unsynced journal +** data would be lost and we would be unable to completely rollback the +** database changes. Database corruption would occur. +** +** This routine also updates the nRec field in the header of the journal. +** (See comments on the pager_playback() routine for additional information.) +** If the sync mode is FULL, two syncs will occur. First the whole journal +** is synced, then the nRec field is updated, then a second sync occurs. ** -** Writing all free dirty pages to the database after the sync is a -** non-obvious optimization. fsync() is an expensive operation so we -** want to minimize the number ot times it is called. After an fsync() call, -** we are free to write dirty pages back to the database. It is best -** to go ahead and write as many dirty pages as possible to minimize -** the risk of having to do another fsync() later on. Writing dirty -** free pages in this way was observed to make database operations go -** up to 10 times faster. +** For temporary databases, we do not care if we are able to rollback +** after a power failure, so sync occurs. ** -** If we are writing to temporary database, there is no need to preserve -** the integrity of the journal file, so we can save time and skip the -** fsync(). +** This routine clears the needSync field of every page current held in +** memory. */ -static int syncAllPages(Pager *pPager){ +static int syncJournal(Pager *pPager){ PgHdr *pPg; int rc = SQLITE_OK; @@ -1082,9 +1182,13 @@ static int syncAllPages(Pager *pPager){ if( pPager->needSync ){ if( !pPager->tempFile ){ assert( pPager->journalOpen ); - assert( !pPager->noSync ); + /* assert( !pPager->noSync ); // noSync might be set if synchronous + ** was turned off after the transaction was started. Ticket #615 */ #ifndef NDEBUG { + /* Make sure the pPager->nRec counter we are keeping agrees + ** with the nRec computed from the size of the journal file. + */ off_t hdrSz, pgSz, jSz; hdrSz = JOURNAL_HDR_SZ(journal_format); pgSz = JOURNAL_PG_SZ(journal_format); @@ -1094,6 +1198,7 @@ static int syncAllPages(Pager *pPager){ } #endif if( journal_format>=3 ){ + /* Write the nRec value into the journal file header */ off_t szJ; if( pPager->fullSync ){ TRACE1("SYNC\n"); @@ -1152,7 +1257,10 @@ static int pager_write_pagelist(PgHdr *pList){ while( pList ){ assert( pList->dirty ); sqliteOsSeek(&pPager->fd, (pList->pgno-1)*(off_t)SQLITE_PAGE_SIZE); + CODEC(pPager, PGHDR_TO_DATA(pList), pList->pgno, 6); + TRACE2("STORE %d\n", pList->pgno); rc = sqliteOsWrite(&pPager->fd, PGHDR_TO_DATA(pList), SQLITE_PAGE_SIZE); + CODEC(pPager, PGHDR_TO_DATA(pList), pList->pgno, 0); if( rc ) return rc; pList->dirty = 0; pList = pList->pDirty; @@ -1304,7 +1412,7 @@ int sqlitepager_get(Pager *pPager, Pgno pgno, void **ppPage){ ** it can't be helped. */ if( pPg==0 ){ - int rc = syncAllPages(pPager); + int rc = syncJournal(pPager); if( rc!=0 ){ sqlitepager_rollback(pPager); return SQLITE_IOERR; @@ -1413,6 +1521,8 @@ int sqlitepager_get(Pager *pPager, Pgno pgno, void **ppPage){ int rc; sqliteOsSeek(&pPager->fd, (pgno-1)*(off_t)SQLITE_PAGE_SIZE); rc = sqliteOsRead(&pPager->fd, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE); + TRACE2("FETCH %d\n", pPg->pgno); + CODEC(pPager, PGHDR_TO_DATA(pPg), pPg->pgno, 3); if( rc!=SQLITE_OK ){ off_t fileSize; if( sqliteOsFileSize(&pPager->fd,&fileSize)!=SQLITE_OK @@ -1557,7 +1667,7 @@ static int pager_open_journal(Pager *pPager){ rc = write32bits(&pPager->jfd, pPager->noSync ? 0xffffffff : 0); } if( rc==SQLITE_OK ){ - pPager->cksumInit = (u32)sqliteRandomInteger(); + sqliteRandomness(sizeof(pPager->cksumInit), &pPager->cksumInit); rc = write32bits(&pPager->jfd, pPager->cksumInit); } }else if( journal_format==JOURNAL_FORMAT_2 ){ @@ -1700,7 +1810,10 @@ int sqlitepager_write(void *pData){ szPg = SQLITE_PAGE_SIZE+4; } store32bits(pPg->pgno, pPg, -4); + CODEC(pPager, pData, pPg->pgno, 7); rc = sqliteOsWrite(&pPager->jfd, &((char*)pData)[-4], szPg); + TRACE3("JOURNAL %d %d\n", pPg->pgno, pPg->needSync); + CODEC(pPager, pData, pPg->pgno, 0); if( journal_format>=JOURNAL_FORMAT_3 ){ *(u32*)PGHDR_TO_EXTRA(pPg) = saved; } @@ -1718,7 +1831,6 @@ int sqlitepager_write(void *pData){ pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7); page_add_to_ckpt_list(pPg); } - TRACE3("JOURNAL %d %d\n", pPg->pgno, pPg->needSync); }else{ pPg->needSync = !pPager->journalStarted && !pPager->noSync; TRACE3("APPEND %d %d\n", pPg->pgno, pPg->needSync); @@ -1736,7 +1848,10 @@ int sqlitepager_write(void *pData){ if( pPager->ckptInUse && !pPg->inCkpt && (int)pPg->pgno<=pPager->ckptSize ){ assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize ); store32bits(pPg->pgno, pPg, -4); + CODEC(pPager, pData, pPg->pgno, 7); rc = sqliteOsWrite(&pPager->cpfd, &((char*)pData)[-4], SQLITE_PAGE_SIZE+4); + TRACE2("CKPT-JOURNAL %d\n", pPg->pgno); + CODEC(pPager, pData, pPg->pgno, 0); if( rc!=SQLITE_OK ){ sqlitepager_rollback(pPager); pPager->errMask |= PAGER_ERR_FULL; @@ -1896,7 +2011,8 @@ int sqlitepager_commit(Pager *pPager){ return rc; } assert( pPager->journalOpen ); - if( pPager->needSync && sqliteOsSync(&pPager->jfd)!=SQLITE_OK ){ + rc = syncJournal(pPager); + if( rc!=SQLITE_OK ){ goto commit_abort; } pPg = pager_get_all_dirty_pages(pPager); @@ -2077,6 +2193,18 @@ const char *sqlitepager_filename(Pager *pPager){ return pPager->zFilename; } +/* +** Set the codec for this pager +*/ +void sqlitepager_set_codec( + Pager *pPager, + void (*xCodec)(void*,void*,Pgno,int), + void *pCodecArg +){ + pPager->xCodec = xCodec; + pPager->pCodecArg = pCodecArg; +} + #ifdef SQLITE_TEST /* ** Print a listing of all referenced pages and their ref count. |