summaryrefslogtreecommitdiff
path: root/ext/sqlite/libsqlite/src/pager.c
diff options
context:
space:
mode:
Diffstat (limited to 'ext/sqlite/libsqlite/src/pager.c')
-rw-r--r--ext/sqlite/libsqlite/src/pager.c216
1 files changed, 172 insertions, 44 deletions
diff --git a/ext/sqlite/libsqlite/src/pager.c b/ext/sqlite/libsqlite/src/pager.c
index f593862104..23d3fb8009 100644
--- a/ext/sqlite/libsqlite/src/pager.c
+++ b/ext/sqlite/libsqlite/src/pager.c
@@ -84,6 +84,19 @@ static Pager *mainPager = 0;
** Each in-memory image of a page begins with the following header.
** This header is only visible to this pager module. The client
** code that calls pager sees only the data that follows the header.
+**
+** Client code should call sqlitepager_write() on a page prior to making
+** any modifications to that page. The first time sqlitepager_write()
+** is called, the original page contents are written into the rollback
+** journal and PgHdr.inJournal and PgHdr.needSync are set. Later, once
+** the journal page has made it onto the disk surface, PgHdr.needSync
+** is cleared. The modified page cannot be written back into the original
+** database file until the journal pages have been synced to disk and the
+** PgHdr.needSync has been cleared.
+**
+** The PgHdr.dirty flag is set when sqlitepager_write() is called and
+** is cleared again when the page content is written back to the original
+** database file.
*/
typedef struct PgHdr PgHdr;
struct PgHdr {
@@ -104,6 +117,16 @@ struct PgHdr {
/* Pager.nExtra bytes of local data follow the page data */
};
+
+/*
+** A macro used for invoking the codec if there is one
+**
+** P is the Pager.  D, N and X are passed to P->xCodec() unchanged,
+** following P->pCodecArg.  When SQLITE_HAS_CODEC is not defined the
+** macro expands to nothing, so none of its arguments are evaluated.
+**
+** NOTE(review): the expansion is a bare "if(...){...}" rather than a
+** do{...}while(0) statement, and P is not parenthesized.  Use it only
+** as a stand-alone statement with a simple pointer expression for P.
+** As the unbraced body of an "if" that has an "else", it either fails
+** to compile or binds the "else" to the wrong "if".
+*/
+#ifdef SQLITE_HAS_CODEC
+# define CODEC(P,D,N,X) if( P->xCodec ){ P->xCodec(P->pCodecArg,D,N,X); }
+#else
+# define CODEC(P,D,N,X)
+#endif
+
/*
** Convert a pointer to a PgHdr into a pointer to its data
** and back again.
@@ -145,9 +168,11 @@ struct Pager {
int nRef; /* Number of in-memory pages with PgHdr.nRef>0 */
int mxPage; /* Maximum number of pages to hold in cache */
int nHit, nMiss, nOvfl; /* Cache hits, missing, and LRU overflows */
+ void (*xCodec)(void*,void*,Pgno,int); /* Routine for en/decoding data */
+ void *pCodecArg; /* First argument to xCodec() */
u8 journalOpen; /* True if journal file descriptors is valid */
- u8 journalStarted; /* True if initial magic of journal is synced */
- u8 useJournal; /* Do not use a rollback journal on this file */
+ u8 journalStarted; /* True if header of journal is synced */
+ u8 useJournal; /* Use a rollback journal on this file */
u8 ckptOpen; /* True if the checkpoint journal is open */
u8 ckptInUse; /* True we are in a checkpoint */
u8 ckptAutoopen; /* Open ckpt journal when main journal is opened*/
@@ -188,8 +213,8 @@ struct Pager {
*/
typedef struct PageRecord PageRecord;
struct PageRecord {
- Pgno pgno; /* The page number */
- char aData[SQLITE_PAGE_SIZE]; /* Original data for page pgno */
+ Pgno pgno; /* The page number */
+ char aData[SQLITE_PAGE_SIZE]; /* Original data for page pgno */
};
/*
@@ -279,7 +304,13 @@ int journal_format = 3;
#endif
/*
-** Read a 32-bit integer from the given file descriptor
+** Read a 32-bit integer from the given file descriptor. Store the integer
+** that is read in *pRes. Return SQLITE_OK if everything worked, or an
+** error code if something goes wrong.
+**
+** If the journal format is 2 or 3, read a big-endian integer. If the
+** journal format is 1, read an integer in the native byte-order of the
+** host machine.
*/
static int read32bits(int format, OsFile *fd, u32 *pRes){
u32 res;
@@ -295,8 +326,13 @@ static int read32bits(int format, OsFile *fd, u32 *pRes){
}
/*
-** Write a 32-bit integer into the given file descriptor. Writing
-** is always done using the new journal format.
+** Write a 32-bit integer into the given file descriptor. Return SQLITE_OK
+** on success or an error code if something goes wrong.
+**
+** If the journal format is 2 or 3, write the integer as 4 big-endian
+** bytes. If the journal format is 1, write the integer in the native
+** byte order. In normal operation, only formats 2 and 3 are used.
+** Journal format 1 is only used for testing.
*/
static int write32bits(OsFile *fd, u32 val){
unsigned char ac[4];
@@ -313,6 +349,9 @@ static int write32bits(OsFile *fd, u32 val){
/*
** Write a 32-bit integer into a page header right before the
** page data. This will overwrite the PgHdr.pDirty pointer.
+**
+** The integer is big-endian for formats 2 and 3 and native byte order
+** for journal format 1.
*/
static void store32bits(u32 val, PgHdr *p, int offset){
unsigned char *ac;
@@ -469,6 +508,10 @@ static int pager_unwritelock(Pager *pPager){
/*
** Compute and return a checksum for the page of data.
+**
+** This is not a real checksum. It is really just the sum of the
+** random initial value and the page number. We considered doing a checksum
+** of the database, but that was found to be too slow.
*/
static u32 pager_cksum(Pager *pPager, Pgno pgno, const char *aData){
u32 cksum = pPager->cksumInit + pgno;
@@ -529,6 +572,7 @@ static int pager_playback_one_page(Pager *pPager, OsFile *jfd, int format){
memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra);
pPg->dirty = 0;
pPg->needSync = 0;
+ CODEC(pPager, PGHDR_TO_DATA(pPg), pPg->pgno, 3);
}
return rc;
}
@@ -537,21 +581,53 @@ static int pager_playback_one_page(Pager *pPager, OsFile *jfd, int format){
** Playback the journal and thus restore the database file to
** the state it was in before we started making changes.
**
-** The journal file format is as follows: There is an initial
-** file-type string for sanity checking. Then there is a single
-** Pgno number which is the number of pages in the database before
-** changes were made. The database is truncated to this size.
-** Next come zero or more page records where each page record
-** consists of a Pgno and SQLITE_PAGE_SIZE bytes of data. See
-** the PageRecord structure for details.
+** The journal file format is as follows:
+**
+** * 8 byte prefix. One of the aJournalMagic123 vectors defined
+** above. The format of the journal file is determined by which
+** of the three prefix vectors is seen.
+** * 4 byte big-endian integer which is the number of valid page records
+** in the journal. If this value is 0xffffffff, then compute the
+** number of page records from the journal size. This field appears
+** in format 3 only.
+** * 4 byte big-endian integer which is the initial value for the
+** sanity checksum. This field appears in format 3 only.
+** * 4 byte integer which is the number of pages to truncate the
+** database to during a rollback.
+** * Zero or more page instances, each as follows:
+** + 4 byte page number.
+** + SQLITE_PAGE_SIZE bytes of data.
+** + 4 byte checksum (format 3 only)
+**
+** When we speak of the journal header, we mean the first 4 bullets above.
+** Each entry in the journal is an instance of the 5th bullet. Note that
+** bullets 2 and 3 only appear in format-3 journals.
+**
+** Call the value from the second bullet "nRec". nRec is the number of
+** valid page entries in the journal. In most cases, you can compute the
+** value of nRec from the size of the journal file. But if a power
+** failure occurred while the journal was being written, it could be the
+** case that the size of the journal file had already been increased but
+** the extra entries had not yet made it safely to disk. In such a case,
+** the value of nRec computed from the file size would be too large. For
+** that reason, we always use the nRec value in the header.
+**
+** If the nRec value is 0xffffffff it means that nRec should be computed
+** from the file size. This value is used when the user selects the
+** no-sync option for the journal. A power failure could lead to corruption
+** in this case. But for things like temporary tables (which will be
+** deleted when the power is restored) we don't care.
+**
+** Journal formats 1 and 2 do not have an nRec value in the header so we
+** have to compute nRec from the file size. This has risks (as described
+** above) which is why all persistent tables have been changed to use
+** format 3.
**
** If the file opened as the journal file is not a well-formed
-** journal file (as determined by looking at the magic number
-** at the beginning) then this routine returns SQLITE_PROTOCOL.
-** If any other errors occur during playback, the database will
-** likely be corrupted, so the PAGER_ERR_CORRUPT bit is set in
-** pPager->errMask and SQLITE_CORRUPT is returned. If it all
-** works, then this routine returns SQLITE_OK.
+** journal file then the database will likely already be
+** corrupted, so the PAGER_ERR_CORRUPT bit is set in pPager->errMask
+** and SQLITE_CORRUPT is returned. If it all works, then this routine
+** returns SQLITE_OK.
*/
static int pager_playback(Pager *pPager, int useJournalSize){
off_t szJ; /* Size of the journal file in bytes */
@@ -571,6 +647,13 @@ static int pager_playback(Pager *pPager, int useJournalSize){
if( rc!=SQLITE_OK ){
goto end_playback;
}
+
+ /* If the journal file is too small to contain a complete header,
+ ** it must mean that the process that created the journal was just
+ ** beginning to write the journal file when it died. In that case,
+ ** the database file should have still been completely unchanged.
+ ** Nothing needs to be rolled back. We can safely ignore this journal.
+ */
if( szJ < sizeof(aMagic)+sizeof(Pgno) ){
goto end_playback;
}
@@ -594,6 +677,15 @@ static int pager_playback(Pager *pPager, int useJournalSize){
goto end_playback;
}
if( format>=JOURNAL_FORMAT_3 ){
+ if( szJ < sizeof(aMagic) + 3*sizeof(u32) ){
+ /* Ignore the journal if it is too small to contain a complete
+ ** header. We already did this test once above, but at the prior
+ ** test, we did not know the journal format and so we had to assume
+ ** the smallest possible header. Now we know the header is bigger
+ ** than the minimum so we test again.
+ */
+ goto end_playback;
+ }
rc = read32bits(format, &pPager->jfd, (u32*)&nRec);
if( rc ) goto end_playback;
rc = read32bits(format, &pPager->jfd, &pPager->cksumInit);
@@ -630,7 +722,7 @@ static int pager_playback(Pager *pPager, int useJournalSize){
/* Pages that have been written to the journal but never synced
** were not restored by the loop above. We have to restore those
- ** pages by reading the back from the original database.
+ ** pages by reading them back from the original database.
*/
if( rc==SQLITE_OK ){
PgHdr *pPg;
@@ -640,6 +732,8 @@ static int pager_playback(Pager *pPager, int useJournalSize){
if( (int)pPg->pgno <= pPager->origDbSize ){
sqliteOsSeek(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)(pPg->pgno-1));
rc = sqliteOsRead(&pPager->fd, zBuf, SQLITE_PAGE_SIZE);
+ TRACE2("REFETCH %d\n", pPg->pgno);
+ CODEC(pPager, zBuf, pPg->pgno, 2);
if( rc ) break;
}else{
memset(zBuf, 0, SQLITE_PAGE_SIZE);
@@ -747,6 +841,7 @@ end_ckpt_playback:
void sqlitepager_set_cachesize(Pager *pPager, int mxPage){
if( mxPage>=0 ){
pPager->noSync = pPager->tempFile;
+ if( pPager->noSync==0 ) pPager->needSync = 0;
}else{
pPager->noSync = 1;
mxPage = -mxPage;
@@ -772,8 +867,9 @@ void sqlitepager_set_cachesize(Pager *pPager, int mxPage){
** when it is rolled back.
**
** FULL The journal is synced twice before writes begin on the
-** database (with some additional information being written
-** in between the two syncs. If we assume that writing a
+** database (with some additional information - the nRec field
+** of the journal header - being written in between the two
+** syncs). If we assume that writing a
** single disk sector is atomic, then this mode provides
** assurance that the journal will not be corrupted to the
** point of causing damage to the database during rollback.
@@ -784,6 +880,7 @@ void sqlitepager_set_cachesize(Pager *pPager, int mxPage){
void sqlitepager_set_safety_level(Pager *pPager, int level){
pPager->noSync = level==1 || pPager->tempFile;
pPager->fullSync = level==3 && !pPager->tempFile;
+ if( pPager->noSync==0 ) pPager->needSync = 0;
}
/*
@@ -933,7 +1030,7 @@ int sqlitepager_pagecount(Pager *pPager){
/*
** Forward declaration
*/
-static int syncAllPages(Pager*);
+static int syncJournal(Pager*);
/*
** Truncate the file to the number of pages specified.
@@ -950,7 +1047,7 @@ int sqlitepager_truncate(Pager *pPager, Pgno nPage){
if( nPage>=(unsigned)pPager->dbSize ){
return SQLITE_OK;
}
- syncAllPages(pPager);
+ syncJournal(pPager);
rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)nPage);
if( rc==SQLITE_OK ){
pPager->dbSize = nPage;
@@ -1056,23 +1153,26 @@ int sqlitepager_ref(void *pData){
}
/*
-** Sync the journal and then write all free dirty pages to the database
-** file.
+** Sync the journal. In other words, make sure all the pages that have
+** been written to the journal have actually reached the surface of the
+** disk. It is not safe to modify the original database file until after
+** the journal has been synced. If the original database is modified before
+** the journal is synced and a power failure occurs, the unsynced journal
+** data would be lost and we would be unable to completely rollback the
+** database changes. Database corruption would occur.
+**
+** This routine also updates the nRec field in the header of the journal.
+** (See comments on the pager_playback() routine for additional information.)
+** If the sync mode is FULL, two syncs will occur. First the whole journal
+** is synced, then the nRec field is updated, then a second sync occurs.
**
-** Writing all free dirty pages to the database after the sync is a
-** non-obvious optimization. fsync() is an expensive operation so we
-** want to minimize the number ot times it is called. After an fsync() call,
-** we are free to write dirty pages back to the database. It is best
-** to go ahead and write as many dirty pages as possible to minimize
-** the risk of having to do another fsync() later on. Writing dirty
-** free pages in this way was observed to make database operations go
-** up to 10 times faster.
+** For temporary databases, we do not care if we are able to rollback
+** after a power failure, so no sync occurs.
**
-** If we are writing to temporary database, there is no need to preserve
-** the integrity of the journal file, so we can save time and skip the
-** fsync().
+** This routine clears the needSync field of every page currently held in
+** memory.
*/
-static int syncAllPages(Pager *pPager){
+static int syncJournal(Pager *pPager){
PgHdr *pPg;
int rc = SQLITE_OK;
@@ -1082,9 +1182,13 @@ static int syncAllPages(Pager *pPager){
if( pPager->needSync ){
if( !pPager->tempFile ){
assert( pPager->journalOpen );
- assert( !pPager->noSync );
+ /* assert( !pPager->noSync ); // noSync might be set if synchronous
+ ** was turned off after the transaction was started. Ticket #615 */
#ifndef NDEBUG
{
+ /* Make sure the pPager->nRec counter we are keeping agrees
+ ** with the nRec computed from the size of the journal file.
+ */
off_t hdrSz, pgSz, jSz;
hdrSz = JOURNAL_HDR_SZ(journal_format);
pgSz = JOURNAL_PG_SZ(journal_format);
@@ -1094,6 +1198,7 @@ static int syncAllPages(Pager *pPager){
}
#endif
if( journal_format>=3 ){
+ /* Write the nRec value into the journal file header */
off_t szJ;
if( pPager->fullSync ){
TRACE1("SYNC\n");
@@ -1152,7 +1257,10 @@ static int pager_write_pagelist(PgHdr *pList){
while( pList ){
assert( pList->dirty );
sqliteOsSeek(&pPager->fd, (pList->pgno-1)*(off_t)SQLITE_PAGE_SIZE);
+ CODEC(pPager, PGHDR_TO_DATA(pList), pList->pgno, 6);
+ TRACE2("STORE %d\n", pList->pgno);
rc = sqliteOsWrite(&pPager->fd, PGHDR_TO_DATA(pList), SQLITE_PAGE_SIZE);
+ CODEC(pPager, PGHDR_TO_DATA(pList), pList->pgno, 0);
if( rc ) return rc;
pList->dirty = 0;
pList = pList->pDirty;
@@ -1304,7 +1412,7 @@ int sqlitepager_get(Pager *pPager, Pgno pgno, void **ppPage){
** it can't be helped.
*/
if( pPg==0 ){
- int rc = syncAllPages(pPager);
+ int rc = syncJournal(pPager);
if( rc!=0 ){
sqlitepager_rollback(pPager);
return SQLITE_IOERR;
@@ -1413,6 +1521,8 @@ int sqlitepager_get(Pager *pPager, Pgno pgno, void **ppPage){
int rc;
sqliteOsSeek(&pPager->fd, (pgno-1)*(off_t)SQLITE_PAGE_SIZE);
rc = sqliteOsRead(&pPager->fd, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE);
+ TRACE2("FETCH %d\n", pPg->pgno);
+ CODEC(pPager, PGHDR_TO_DATA(pPg), pPg->pgno, 3);
if( rc!=SQLITE_OK ){
off_t fileSize;
if( sqliteOsFileSize(&pPager->fd,&fileSize)!=SQLITE_OK
@@ -1557,7 +1667,7 @@ static int pager_open_journal(Pager *pPager){
rc = write32bits(&pPager->jfd, pPager->noSync ? 0xffffffff : 0);
}
if( rc==SQLITE_OK ){
- pPager->cksumInit = (u32)sqliteRandomInteger();
+ sqliteRandomness(sizeof(pPager->cksumInit), &pPager->cksumInit);
rc = write32bits(&pPager->jfd, pPager->cksumInit);
}
}else if( journal_format==JOURNAL_FORMAT_2 ){
@@ -1700,7 +1810,10 @@ int sqlitepager_write(void *pData){
szPg = SQLITE_PAGE_SIZE+4;
}
store32bits(pPg->pgno, pPg, -4);
+ CODEC(pPager, pData, pPg->pgno, 7);
rc = sqliteOsWrite(&pPager->jfd, &((char*)pData)[-4], szPg);
+ TRACE3("JOURNAL %d %d\n", pPg->pgno, pPg->needSync);
+ CODEC(pPager, pData, pPg->pgno, 0);
if( journal_format>=JOURNAL_FORMAT_3 ){
*(u32*)PGHDR_TO_EXTRA(pPg) = saved;
}
@@ -1718,7 +1831,6 @@ int sqlitepager_write(void *pData){
pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
page_add_to_ckpt_list(pPg);
}
- TRACE3("JOURNAL %d %d\n", pPg->pgno, pPg->needSync);
}else{
pPg->needSync = !pPager->journalStarted && !pPager->noSync;
TRACE3("APPEND %d %d\n", pPg->pgno, pPg->needSync);
@@ -1736,7 +1848,10 @@ int sqlitepager_write(void *pData){
if( pPager->ckptInUse && !pPg->inCkpt && (int)pPg->pgno<=pPager->ckptSize ){
assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize );
store32bits(pPg->pgno, pPg, -4);
+ CODEC(pPager, pData, pPg->pgno, 7);
rc = sqliteOsWrite(&pPager->cpfd, &((char*)pData)[-4], SQLITE_PAGE_SIZE+4);
+ TRACE2("CKPT-JOURNAL %d\n", pPg->pgno);
+ CODEC(pPager, pData, pPg->pgno, 0);
if( rc!=SQLITE_OK ){
sqlitepager_rollback(pPager);
pPager->errMask |= PAGER_ERR_FULL;
@@ -1896,7 +2011,8 @@ int sqlitepager_commit(Pager *pPager){
return rc;
}
assert( pPager->journalOpen );
- if( pPager->needSync && sqliteOsSync(&pPager->jfd)!=SQLITE_OK ){
+ rc = syncJournal(pPager);
+ if( rc!=SQLITE_OK ){
goto commit_abort;
}
pPg = pager_get_all_dirty_pages(pPager);
@@ -2077,6 +2193,18 @@ const char *sqlitepager_filename(Pager *pPager){
return pPager->zFilename;
}
+/*
+** Set the codec for this pager
+**
+** xCodec and pCodecArg are stored in the Pager, replacing whatever was
+** there before.  When the library is built with SQLITE_HAS_CODEC, the
+** CODEC() macro calls xCodec(pCodecArg, pageData, pgno, mode) around
+** page I/O.  The macro tests xCodec for null before calling it, so
+** passing xCodec==0 disables the codec.
+**
+** The mode values passed by the call sites visible in this change are:
+** 3 after a page is fetched from the database file and at the end of
+** pager_playback_one_page(), 2 when a page is re-read from the database
+** during rollback, 6 just before a page is written to the database
+** file, 7 just before a page is written to the journal or checkpoint
+** journal, and 0 immediately after each of those writes.
+** NOTE(review): what each mode means is defined by the codec
+** implementation, not by this file -- confirm against it.
+*/
+void sqlitepager_set_codec(
+ Pager *pPager,
+ void (*xCodec)(void*,void*,Pgno,int),
+ void *pCodecArg
+){
+ pPager->xCodec = xCodec;
+ pPager->pCodecArg = pCodecArg;
+}
+
#ifdef SQLITE_TEST
/*
** Print a listing of all referenced pages and their ref count.