 /* ext/lsm1/lsm_log.c - external/github.com/sqlite/sqlite.git - Git at Google */

 /*
 ** 2011-08-13
 **
 ** The author disclaims copyright to this source code.  In place of
 ** a legal notice, here is a blessing:
 **
 **    May you do good and not evil.
 **    May you find forgiveness for yourself and forgive others.
 **    May you share freely, never taking more than you give.
 **
 *************************************************************************
 **
 ** This file contains the implementation of LSM database logging. Logging
 ** has one purpose in LSM - to make transactions durable.
 **
 ** When data is written to an LSM database, it is initially stored in an
 ** in-memory tree structure. Since this structure is in volatile memory,
 ** if a power failure or application crash occurs it may be lost. To
 ** prevent loss of data in this case, each time a record is written to the
 ** in-memory tree an equivalent record is appended to the log on disk.
 ** If a power failure or application crash does occur, data can be recovered
 ** by reading the log.
 **
 ** A log file consists of the following types of records representing data
 ** written into the database:
 **
 **   LOG_WRITE:  A key-value pair written to the database.
 **   LOG_DELETE: A delete key issued to the database.
 **   LOG_COMMIT: A transaction commit.
 **
 ** And the following types of records for ancillary purposes..
 **
 **   LOG_EOF:    A record indicating the end of a log file.
 **   LOG_PAD1:   A single byte padding record.
 **   LOG_PAD2:   An N byte padding record (N>1).
 **   LOG_JUMP:   A pointer to another offset within the log file.
 **
 ** Each transaction written to the log contains one or more LOG_WRITE and/or
 ** LOG_DELETE records, followed by a LOG_COMMIT record. The LOG_COMMIT record
 ** contains an 8-byte checksum based on all previous data written to the
 ** log file.
 **
 ** LOG CHECKSUMS & RECOVERY
 **
 **   Checksums are found in two types of log records: LOG_COMMIT and
 **   LOG_CKSUM records. In order to recover content from a log, a client
 **   reads each record from the start of the log, calculating a checksum as
 **   it does. Each time a LOG_COMMIT or LOG_CKSUM is encountered, the
 **   recovery process verifies that the checksum stored in the log
 **   matches the calculated checksum. If it does not, the recovery process
 **   can stop reading the log.
 **
 **   If a recovery process reads records (other than COMMIT or CKSUM)
 **   consisting of at least LSM_CKSUM_MAXDATA bytes, then the next record in
 **   the log must be either a LOG_CKSUM or LOG_COMMIT record. If it is
 **   not, the recovery process also stops reading the log.
 **
 **   To recover the log file, it must be read twice. The first time to
 **   determine the location of the last valid commit record. And the second
 **   time to load data into the in-memory tree.
 **
 **   Todo: Surely there is a better way...
 **
 ** LOG WRAPPING
 **
 **   If the log file were never deleted or wrapped, it would be possible to
 **   read it from start to end each time recovery is required (i.e. each time
 **   the number of database clients changes from 0 to 1). Effectively reading
 **   the entire history of the database each time. This would quickly become
 **   inefficient. Additionally, since the log file would grow without bound,
 **   it wastes storage space.
 **
 **   Instead, part of each checkpoint written into the database file contains
 **   a log offset (and other information required to read the log starting at
 **   this offset) at which to begin recovery. Offset $O.
 **
 **   Once a checkpoint has been written and synced into the database file, it
 **   is guaranteed that no recovery process will need to read any data before
 **   offset $O of the log file. It is therefore safe to begin overwriting
 **   any data that occurs before offset $O.
 **
 **   This implementation separates the log into three regions mapped into
 **   the log file - regions 0, 1 and 2. During recovery, regions are read
 **   in ascending order (i.e. 0, then 1, then 2). Each region is zero or
 **   more bytes in size.
 **
 **     |---1---|..|--0--|.|--2--|....
 **
 **   New records are always appended to the end of region 2.
 **
 **   Initially (when it is empty), all three regions are zero bytes in size.
 **   Each of them are located at the beginning of the file. As records are
 **   added to the log, region 2 grows, so that the log consists of a zero
 **   byte region 1, followed by a zero byte region 0, followed by an N byte
 **   region 2. After one or more checkpoints have been written to disk,
 **   the start point of region 2 is moved to $O. For example:
 **
 **     A) ||.........|--2--|....
 **
 **   (both regions 0 and 1 are 0 bytes in size at offset 0).
 **
 **   Eventually, the log wraps around to write new records into the start.
 **   At this point, region 2 is renamed to region 0. Region 0 is renamed
 **   to region 2. After appending a few records to the new region 2, the
 **   log file looks like this:
 **
 **     B) ||--2--|...|--0--|....
 **
 **   (region 1 is still 0 bytes in size, located at offset 0).
 **
 **   Any checkpoints made at this point may reduce the size of region 0.
 **   However, if they do not, and region 2 expands so that it is about to
 **   overwrite the start of region 0, then region 2 is renamed to region 1,
 **   and a new region 2 created at the end of the file following the existing
 **   region 0.
 **
 **     C) |---1---|..|--0--|.|-2-|
 **
 **   In this state records are appended to region 2 until checkpoints have
 **   contracted regions 0 and 1 until they are both zero bytes in size. They
 **   are then shifted to the start of the log file, leaving the system in
 **   the equivalent of state A above.
 **
 **   Alternatively, state B may transition directly to state A if the size
 **   of region 0 is reduced to zero bytes before region 2 threatens to
 **   encroach upon it.
 **
 ** LOG_PAD1 & LOG_PAD2 RECORDS
 **
 **   PAD1 and PAD2 records may appear in a log file at any point. They allow
 **   a process writing the log file to align the beginning of transactions with
 **   the beginning of disk sectors, which increases robustness.
 **
 ** RECORD FORMATS:
 **
 **   LOG_EOF:    * A single 0x00 byte.
 **
 **   LOG_PAD1:   * A single 0x01 byte.
 **
 **   LOG_PAD2:   * A single 0x02 byte, followed by
 **               * The number of unused bytes (N) as a varint,
 **               * An N byte block of unused space.
 **
 **   LOG_COMMIT: * A single 0x03 byte.
 **               * An 8-byte checksum.
 **
 **   LOG_JUMP:   * A single 0x04 byte.
 **               * Absolute file offset to jump to, encoded as a varint.
 **
 **   LOG_WRITE:  * A single 0x06 or 0x07 byte,
 **               * The number of bytes in the key, encoded as a varint,
 **               * The number of bytes in the value, encoded as a varint,
 **               * If the first byte was 0x07, an 8 byte checksum.
 **               * The key data,
 **               * The value data.
 **
 **   LOG_DELETE: * A single 0x08 or 0x09 byte,
 **               * The number of bytes in the key, encoded as a varint,
 **               * If the first byte was 0x09, an 8 byte checksum.
 **               * The key data.
 **
 **   Varints are as described in lsm_varint.c (SQLite 4 format).
 **
 ** CHECKSUMS:
 **
 **   The checksum is calculated using two 32-bit unsigned integers, s0 and
 **   s1. The initial value for both is 42. It is updated each time a record
 **   is written into the log file by treating the encoded (binary) record as
 **   an array of 32-bit little-endian integers. Then, if x[] is the integer
 **   array, updating the checksum accumulators as follows:
 **
 **     for i from 0 to n-1 step 2:
 **       s0 += x[i] + s1;
 **       s1 += x[i+1] + s0;
 **     endfor
 **
 **   If the record is not an even multiple of 8-bytes in size it is padded
 **   with zeroes to make it so before the checksum is updated.
 **
 **   The checksum stored in a COMMIT, WRITE or DELETE is based on all bytes
 **   up to the start of the 8-byte checksum itself, including the COMMIT,
 **   WRITE or DELETE fields that appear before the checksum in the record.
 **
 ** VARINT FORMAT
 **
 ** See lsm_varint.c.
 */

 #ifndef _LSM_INT_H
 # include "lsmInt.h"
 #endif

 /*
 ** Log record types. Each value is the first byte of a record of that type
 ** as it appears in the log file (see the RECORD FORMATS section above).
 */
 #define LSM_LOG_EOF          0x00
 #define LSM_LOG_PAD1         0x01
 #define LSM_LOG_PAD2         0x02
 #define LSM_LOG_COMMIT       0x03
 #define LSM_LOG_JUMP         0x04

 /*
 ** WRITE, DELETE and DRANGE records each come in two flavours: a plain one
 ** and a *_CKSUM one that embeds an 8-byte checksum after the length
 ** varints. The *_CKSUM value is always the plain value with the
 ** least-significant bit set; lsmLogWrite() asserts this and relies on it
 ** when composing the type byte.
 */
 #define LSM_LOG_WRITE        0x06
 #define LSM_LOG_WRITE_CKSUM  0x07

 #define LSM_LOG_DELETE       0x08
 #define LSM_LOG_DELETE_CKSUM 0x09

 /* lsmLogWrite() serializes DRANGE records with the same layout it uses for
 ** WRITE records: key length, value length, optional checksum, key, value. */
 #define LSM_LOG_DRANGE       0x0A
 #define LSM_LOG_DRANGE_CKSUM 0x0B

 /* Require a checksum every 32KB. lsmLogWrite() embeds a checksum in a
 ** record whenever the bytes already buffered plus the new record would
 ** exceed this value. */
 #define LSM_CKSUM_MAXDATA (32*1024)

 /* Do not wrap a log file smaller than this in bytes. lsmLogBegin() only
 ** wraps around to the start of the file when region 2 begins at or beyond
 ** this offset. */
 #define LSM_MIN_LOGWRAP      (128*1024)

 /*
 ** State used to append records to the log file during a write
 ** transaction. The object is set up by lsmLogBegin() and its final state
 ** is copied into the shared log header by lsmLogEnd() on commit.
 **
 ** szSector:
 **   Commit records must be aligned to end on szSector boundaries. If
 **   the safety-mode is set to NORMAL or OFF, this value is 1. Otherwise,
 **   if the safety-mode is set to FULL, it is the size of the file-system
 **   sectors as reported by lsmFsSectorSize().
 **
 ** jump:
 **   A region of the log file that still holds live data and must not be
 **   overwritten. jumpIfRequired() writes an LSM_LOG_JUMP record and skips
 **   past it when the write position gets too close to jump.iStart.
 **
 ** iRegion1End, iRegion2Start:
 **   Set by jumpIfRequired() when a jump is taken. lsmLogEnd() uses them to
 **   split the data written by the transaction into regions 1 and 2.
 **
 ** buf:
 **   Must remain the last member. When lsmLogBegin() reuses an existing
 **   LogWriter it zeroes every byte that precedes buf and keeps the buffer
 **   allocation itself.
 */
 struct LogWriter {
   u32 cksum0;                     /* Checksum 0 at offset iOff */
   u32 cksum1;                     /* Checksum 1 at offset iOff */
   int iCksumBuf;                  /* Bytes of buf that have been checksummed */
   i64 iOff;                       /* Offset at start of buffer buf */
   int szSector;                   /* Sector size for this transaction */
   LogRegion jump;                 /* Avoid writing to this region */
   i64 iRegion1End;                /* End of first region written by trans */
   i64 iRegion2Start;              /* Start of second regions written by trans */
   LsmString buf;                  /* Buffer containing data not yet written */
 };

 /*
 ** Decode the four bytes starting at aIn as an unsigned 32-bit integer
 ** stored least-significant byte first, and return the decoded value.
 */
 static u32 getU32le(u8 *aIn){
   u32 v = (u32)aIn[0];
   v |= (u32)aIn[1] << 8;
   v |= (u32)aIn[2] << 16;
   v |= (u32)aIn[3] << 24;
   return v;
 }


 /*
 ** Fold the n bytes at z into the running checksum pair (*pCksum0,
 ** *pCksum1). The input is consumed in 8-byte groups, each decoded as two
 ** little-endian 32-bit words. A final group of fewer than 8 bytes is
 ** zero-extended to 8 bytes before it is folded in. The buffer does not
 ** need to be aligned or padded by the caller.
 */
 static void logCksumUnaligned(
   char *z,                        /* Input buffer */
   int n,                          /* Size of input buffer in bytes */
   u32 *pCksum0,                   /* IN/OUT: Checksum value 1 */
   u32 *pCksum1                    /* IN/OUT: Checksum value 2 */
 ){
   u8 *aData = (u8 *)z;
   u32 s0 = *pCksum0;
   u32 s1 = *pCksum1;
   int nTail = n % 8;              /* Bytes in the trailing partial group */
   int nWhole = n - nTail;         /* Bytes covered by complete groups */
   int iByte = 0;

   assert( n>0 );

   /* Complete 8-byte groups. */
   while( iByte<nWhole ){
     s0 += getU32le(&aData[iByte]) + s1;
     s1 += getU32le(&aData[iByte+4]) + s0;
     iByte += 8;
   }

   /* Trailing partial group, padded out with zero bytes. */
   if( nTail!=0 ){
     u8 aLast[8] = {0, 0, 0, 0, 0, 0, 0, 0};
     assert( nTail<8 && nTail>0 );
     memcpy(aLast, &aData[nWhole], nTail);
     s0 += getU32le(aLast) + s1;
     s1 += getU32le(&aLast[4]) + s0;
   }

   *pCksum0 = s0;
   *pCksum1 = s1;
 }

 /*
 ** Advance the running checksum in pLog so that it covers the first nBuf
 ** bytes of the write buffer pLog->buf. Bytes before pLog->iCksumBuf have
 ** already been accounted for; only the bytes between that point and nBuf
 ** are added. nBuf must be a multiple of 8 unless it is the full buffer
 ** length.
 */
 static void logUpdateCksum(LogWriter *pLog, int nBuf){
   int iDone = pLog->iCksumBuf;    /* Bytes already checksummed */

   assert( (iDone % 8)==0 );
   assert( iDone<=nBuf );
   assert( (nBuf % 8)==0 || nBuf==pLog->buf.n );

   if( iDone<nBuf ){
     logCksumUnaligned(
         &pLog->buf.z[iDone], nBuf-iDone, &pLog->cksum0, &pLog->cksum1
     );
   }
   pLog->iCksumBuf = nBuf;
 }

 /*
 ** Return the offset of the first byte of the sector (of size
 ** pLog->szSector) that contains file offset iOff.
 */
 static i64 firstByteOnSector(LogWriter *pLog, i64 iOff){
   i64 szSector = pLog->szSector;
   return iOff - (iOff % szSector);
 }
 /*
 ** Return the offset of the last byte of the sector (of size
 ** pLog->szSector) that contains file offset iOff.
 */
 static i64 lastByteOnSector(LogWriter *pLog, i64 iOff){
   i64 iFirst = firstByteOnSector(pLog, iOff);
   return iFirst + pLog->szSector - 1;
 }

 /*
 ** Reclaim log file space if it is possible to do so. Space becomes
 ** reclaimable once a snapshot newer than the one recorded in the in-memory
 ** log header (pDb->treehdr.log) has been synced into the database header.
 **
 ** Nothing is reclaimed while any connection holds a read-only transaction.
 ** Returns LSM_OK or an LSM error code.
 */
 static int logReclaimSpace(lsm_db *pDb){
   int rc;
   int iMeta;
   int bRotrans;                   /* True if there exists some ro-trans */
   int iRegion;
   DbLog *pLog;
   i64 iSyncedId;
   i64 iSnapshotId = 0;
   i64 iOff = 0;

   /* A read-only transaction held by some other connection blocks any
   ** reclamation of log space.  */
   rc = lsmDetectRoTrans(pDb, &bRotrans);
   if( rc!=LSM_OK || bRotrans ) return rc;

   iMeta = (int)pDb->pShmhdr->iMetaPage;
   if( iMeta!=1 && iMeta!=2 ) return rc;
   pLog = &pDb->treehdr.log;

   /* The id read here may be stale because of a race (see the comments
   ** above lsmFsReadSyncedId()), so it is used only as a cheap filter: if
   ** it matches the id already recorded, there is nothing to do. A match
   ** failure is confirmed with lsmCheckpointSynced() below, whose results
   ** can be trusted.  */
   rc = lsmFsReadSyncedId(pDb, iMeta, &iSyncedId);
   if( rc!=LSM_OK || pLog->iSnapshotId==iSyncedId ) return rc;

   rc = lsmCheckpointSynced(pDb, &iSnapshotId, &iOff, 0);
   if( rc!=LSM_OK || pLog->iSnapshotId>=iSnapshotId ) return rc;

   /* Empty every region that precedes the one containing iOff, then move
   ** the start of that region up to iOff.  */
   iRegion = 0;
   while( iRegion<3 ){
     LogRegion *p = &pLog->aRegion[iRegion];
     if( iOff>=p->iStart && iOff<=p->iEnd ) break;
     p->iStart = 0;
     p->iEnd = 0;
     iRegion++;
   }
   assert( iRegion<3 );
   pLog->aRegion[iRegion].iStart = iOff;
   pLog->iSnapshotId = iSnapshotId;

   return rc;
 }

 /*
 ** This function is called when a write-transaction is first opened. It
 ** is assumed that the caller is holding the client-mutex when it is
 ** called.
 **
 ** Before returning, this function makes sure the LogWriter object that
 ** will be used to write to the log file during the write transaction
 ** exists - allocating it on first use, or resetting the one already
 ** attached to pDb->pLogWriter - and positions it at the offset where the
 ** first record of the transaction is to be written.
 **
 ** If pDb->bUseLog is zero this function does nothing and returns LSM_OK.
 ** Otherwise LSM_OK is returned if no error occurs, or an LSM error code
 ** if one does. If opening the log, allocating the writer or reclaiming
 ** log space fails, lsmLogClose() is called before the error is returned.
 */
 int lsmLogBegin(lsm_db *pDb){
   int rc = LSM_OK;
   LogWriter *pNew;
   LogRegion *aReg;

   if( pDb->bUseLog==0 ) return LSM_OK;

   /* If the log file has not yet been opened, open it now. Also allocate
   ** the LogWriter structure, if it has not already been allocated.  */
   rc = lsmFsOpenLog(pDb, 0);
   if( pDb->pLogWriter==0 ){
     /* rc is passed by reference to the allocator - presumably so that no
     ** allocation is attempted if lsmFsOpenLog() already failed (confirm
     ** against lsmMallocZeroRc()).  */
     pNew = lsmMallocZeroRc(pDb->pEnv, sizeof(LogWriter), &rc);
     if( pNew ){
       lsmStringInit(&pNew->buf, pDb->pEnv);
       rc = lsmStringExtend(&pNew->buf, 2);
     }
     pDb->pLogWriter = pNew;
   }else{
     /* Reuse the existing object. Zero every field that precedes the
     ** buffer and reset the buffer length, keeping its allocation. The
     ** assert() checks that buf really is the last member of LogWriter.  */
     pNew = pDb->pLogWriter;
     assert( (u8 *)(&pNew[1])==(u8 *)(&((&pNew->buf)[1])) );
     memset(pNew, 0, ((u8 *)&pNew->buf) - (u8 *)pNew);
     pNew->buf.n = 0;
   }

   if( rc==LSM_OK ){
     /* The following call detects whether or not a new snapshot has been
     ** synced into the database file. If so, it updates the contents of
     ** the pDb->treehdr.log structure to reclaim any space in the log
     ** file that is no longer required.
     **
     ** TODO: Calling this every transaction is overkill. And since the
     ** call has to read and checksum a snapshot from the database file,
     ** it is expensive. It would be better to figure out a way so that
     ** this is only called occasionally - say for every 32KB written to
     ** the log file.
     */
     rc = logReclaimSpace(pDb);
   }
   if( rc!=LSM_OK ){
     lsmLogClose(pDb);
     return rc;
   }

   /* Set the effective sector-size for this transaction. Sectors are assumed
   ** to be one byte in size if the safety-mode is OFF or NORMAL, or as
   ** reported by lsmFsSectorSize if it is FULL.  */
   if( pDb->eSafety==LSM_SAFETY_FULL ){
     pNew->szSector = lsmFsSectorSize(pDb->pFS);
     assert( pNew->szSector>0 );
   }else{
     pNew->szSector = 1;
   }

   /* There are now three scenarios:
   **
   **   1) Regions 0 and 1 are both zero bytes in size and region 2 begins
   **      at a file offset greater than LSM_MIN_LOGWRAP. In this case, wrap
   **      around to the start and write data into the start of the log file.
   **
   **   2) Region 1 is zero bytes in size and region 2 occurs earlier in the
   **      file than region 0. In this case, append data to region 2, but
   **      remember to jump over region 1 if required.
   **
   **   3) Region 2 is the last in the file. Append to it.
   */
   aReg = &pDb->treehdr.log.aRegion[0];

   assert( aReg[0].iEnd==0 || aReg[0].iEnd>aReg[0].iStart );
   assert( aReg[1].iEnd==0 || aReg[1].iEnd>aReg[1].iStart );

   /* Continue the running checksum from where the shared header left it. */
   pNew->cksum0 = pDb->treehdr.log.cksum0;
   pNew->cksum1 = pDb->treehdr.log.cksum1;

   if( aReg[0].iEnd==0 && aReg[1].iEnd==0 && aReg[2].iStart>=LSM_MIN_LOGWRAP ){
     /* Case 1. Wrap around to the start of the file. Write an LSM_LOG_JUMP
     ** into the log file in this case. Pad it out to 8 bytes using a PAD2
     ** record so that the checksums can be updated immediately.
     **
     ** The 8 bytes are a PAD2 record with 4 bytes of unused space (6 bytes
     ** in total) followed by a 2 byte JUMP record whose target is
     ** presumably the single-byte varint encoding of offset 0.  */
     u8 aJump[] = {
       LSM_LOG_PAD2, 0x04, 0x00, 0x00, 0x00, 0x00, LSM_LOG_JUMP, 0x00
     };

     /* NOTE(review): the return value of lsmStringBinAppend() is not
     ** checked here. If the append can fail, the region bookkeeping below
     ** is still advanced by 8 bytes - confirm whether that is intended.  */
     lsmStringBinAppend(&pNew->buf, aJump, sizeof(aJump));
     logUpdateCksum(pNew, pNew->buf.n);
     rc = lsmFsWriteLog(pDb->pFS, aReg[2].iEnd, &pNew->buf);
     pNew->iCksumBuf = pNew->buf.n = 0;

     /* The old region 2 (now including the jump record) becomes region 0
     ** and is the region to be jumped over. The new region 2 is empty and
     ** starts at offset 0; pNew->iOff is not assigned in this branch.  */
     aReg[2].iEnd += 8;
     pNew->jump = aReg[0] = aReg[2];
     aReg[2].iStart = aReg[2].iEnd = 0;
   }else if( aReg[1].iEnd==0 && aReg[2].iEnd<aReg[0].iEnd ){
     /* Case 2. Region 2 lies before region 0, so region 0 must be jumped
     ** over if region 2 grows far enough to reach it.  */
     pNew->iOff = aReg[2].iEnd;
     pNew->jump = aReg[0];
   }else{
     /* Case 3. Nothing follows region 2, so no jump region is needed. */
     assert( aReg[2].iStart>=aReg[0].iEnd && aReg[2].iStart>=aReg[1].iEnd );
     pNew->iOff = aReg[2].iEnd;
   }

   /* If there is a jump region, widen it to sector boundaries: move the
   ** start down to the first byte of its sector (unless that would place
   ** it at or before the current write offset) and move the end up to the
   ** last byte of its sector.  */
   if( pNew->jump.iStart ){
     i64 iRound;
     assert( pNew->jump.iStart>pNew->iOff );

     iRound = firstByteOnSector(pNew, pNew->jump.iStart);
     if( iRound>pNew->iOff ) pNew->jump.iStart = iRound;
     pNew->jump.iEnd = lastByteOnSector(pNew, pNew->jump.iEnd);
   }

   assert( pDb->pLogWriter==pNew );
   return rc;
 }

 /*
 ** This function is called when a write-transaction is being closed.
 ** Parameter bCommit is true if the transaction is being committed,
 ** or false otherwise. The caller must hold the client-mutex to call
 ** this function.
 **
 ** If there is no LogWriter attached to the handle, or if the transaction
 ** is not being committed, this function does nothing. Otherwise the end
 ** of region 2 and the running checksum in the shared log state
 ** (pDb->treehdr.log) are updated from the LogWriter. If the transaction
 ** jumped over part of the log, the data written before the jump becomes
 ** region 1 and region 2 restarts at the jump target.
 **
 ** The LogWriter object itself is left attached to the database handle.
 */
 void lsmLogEnd(lsm_db *pDb, int bCommit){
   LogWriter *pWriter = pDb->pLogWriter;
   DbLog *pShared;

   if( pWriter==0 || !bCommit ) return;

   pShared = &pDb->treehdr.log;
   pShared->aRegion[2].iEnd = pWriter->iOff;
   pShared->cksum0 = pWriter->cksum0;
   pShared->cksum1 = pWriter->cksum1;

   /* A non-zero iRegion1End means the transaction had to jump over some
   ** other part of the log.  */
   if( pWriter->iRegion1End==0 ) return;

   assert( pShared->aRegion[1].iEnd==0 );
   assert( pShared->aRegion[2].iStart<pWriter->iRegion1End );
   pShared->aRegion[1].iStart = pShared->aRegion[2].iStart;
   pShared->aRegion[1].iEnd = pWriter->iRegion1End;
   pShared->aRegion[2].iStart = pWriter->iRegion2Start;
 }

 /*
 ** Check whether a record of nReq bytes can be appended without running
 ** into the jump region (pLog->jump). If it cannot, append an LSM_LOG_JUMP
 ** record (preceded by padding so the buffer is a multiple of 8 bytes),
 ** flush the buffer to disk and move the write offset to the first byte
 ** past the jump region. In that case *pbJump is set to 1 if pbJump is not
 ** NULL; it is left untouched otherwise.
 **
 ** Returns LSM_OK on success or an LSM error code if appending to the
 ** buffer or writing to the log file fails.
 */
 static int jumpIfRequired(
   lsm_db *pDb,
   LogWriter *pLog,
   int nReq,
   int *pbJump
 ){
   int rc;                       /* Return code */
   i64 iCur;                     /* File offset of next byte to be appended */
   i64 iJump;                    /* Offset to jump to */
   u8 aJump[10];                 /* Encoded jump record */
   int nJump;                    /* Valid bytes in aJump[] */
   int nRem;                     /* (buffer + jump record) size modulo 8 */

   /* A jump is needed only if the jump region lies ahead of the current
   ** position and there is not enough room before it for the new record
   ** plus the largest possible JUMP record with up to 7 bytes of padding
   ** (a total of 17 bytes).  */
   iCur = pLog->iOff + pLog->buf.n;
   if( pLog->jump.iStart<=iCur ) return LSM_OK;
   if( pLog->jump.iStart>=(iCur + (nReq + 17)) ) return LSM_OK;

   /* Serialize the JUMP record */
   iJump = pLog->jump.iEnd+1;
   aJump[0] = LSM_LOG_JUMP;
   nJump = 1 + lsmVarintPut64(&aJump[1], iJump);

   /* Pad the buffer so that it is a multiple of 8 bytes in size once the
   ** JUMP record has been appended. Not strictly required, but it keeps
   ** the running checksum simple to maintain.  */
   nRem = (pLog->buf.n + nJump) % 8;
   if( nRem!=0 ){
     u8 aPad[7] = {0,0,0,0,0,0,0};
     int nPad = 8 - nRem;
     if( nPad==1 ){
       aPad[0] = LSM_LOG_PAD1;
     }else{
       aPad[0] = LSM_LOG_PAD2;
       aPad[1] = (u8)(nPad-2);
     }
     rc = lsmStringBinAppend(&pLog->buf, aPad, nPad);
     if( rc!=LSM_OK ) return rc;
   }

   /* Append the JUMP record, write the buffer out and bring the checksum
   ** up to date. The next write to the log file (assuming there is no
   ** transaction rollback) will be to offset iJump, just past the jump
   ** region.  */
   rc = lsmStringBinAppend(&pLog->buf, aJump, nJump);
   if( rc!=LSM_OK ) return rc;
   assert( (pLog->buf.n % 8)==0 );
   rc = lsmFsWriteLog(pDb->pFS, pLog->iOff, &pLog->buf);
   if( rc!=LSM_OK ) return rc;
   logUpdateCksum(pLog, pLog->buf.n);

   pLog->iRegion1End = (pLog->iOff + pLog->buf.n);
   pLog->iRegion2Start = iJump;
   pLog->iOff = iJump;
   pLog->iCksumBuf = pLog->buf.n = 0;
   if( pbJump ) *pbJump = 1;

   return LSM_OK;
 }

 /*
 ** Bring the running checksum up to date with everything currently in the
 ** write buffer, append the two 32-bit checksum values (8 bytes) to the
 ** buffer and write the buffer to the log file at the current offset.
 **
 ** The caller must already have made room for the 8 extra bytes. The write
 ** offset is advanced and the buffer emptied whether or not the write
 ** succeeds. Returns the result of lsmFsWriteLog().
 */
 static int logCksumAndFlush(lsm_db *pDb){
   LogWriter *pLog = pDb->pLogWriter;
   u8 *aOut;                       /* Where the checksum is stored */
   int rc;                         /* Return code */

   /* Calculate the checksum value. Append it to the buffer. */
   logUpdateCksum(pLog, pLog->buf.n);
   aOut = (u8 *)&pLog->buf.z[pLog->buf.n];
   lsmPutU32(aOut, pLog->cksum0);
   lsmPutU32(&aOut[4], pLog->cksum1);
   pLog->buf.n += 8;

   /* Write the contents of the buffer to disk. */
   rc = lsmFsWriteLog(pDb->pFS, pLog->iOff, &pLog->buf);
   pLog->iOff += pLog->buf.n;
   pLog->buf.n = 0;
   pLog->iCksumBuf = 0;

   return rc;
 }

 /*
 ** Append a record of type eType, followed by its 8-byte checksum, to the
 ** log buffer and write the buffer to disk. Only LSM_LOG_COMMIT is
 ** currently accepted as eType (this is asserted).
 **
 ** If the effective sector size is greater than 1, padding records are
 ** added first so that the COMMIT record ends exactly on a sector
 ** boundary. If the safety-mode is FULL the log file is synced after the
 ** write.
 **
 ** Returns LSM_OK on success, or an LSM error code if jumping over the
 ** jump region, extending the buffer, writing or syncing fails.
 */
 static int logFlush(lsm_db *pDb, int eType){
   int rc;
   int nReq;
   LogWriter *pLog = pDb->pLogWriter;

   assert( eType==LSM_LOG_COMMIT );
   assert( pLog );

   /* Commit record is always 9 bytes in size. When sector alignment is in
   ** effect, allow for up to a sector of padding as well.  */
   nReq = 9;
   if( eType==LSM_LOG_COMMIT && pLog->szSector>1 ) nReq += pLog->szSector + 17;
   rc = jumpIfRequired(pDb, pLog, nReq, 0);

   /* Do not continue if the jump failed. Previously this return code was
   ** overwritten by the lsmStringExtend() calls below, so a failed write of
   ** the JUMP record went unreported and the COMMIT record was written
   ** anyway.  */
   if( rc!=LSM_OK ) return rc;

   /* If this is a COMMIT, add padding to the log so that the COMMIT record
   ** is aligned against the end of a disk sector. In other words, add padding
   ** so that the first byte following the COMMIT record lies on a different
   ** sector.  */
   if( eType==LSM_LOG_COMMIT && pLog->szSector>1 ){
     int nPad;                     /* Bytes of padding to add */

     /* Determine the value of nPad. */
     nPad = ((pLog->iOff + pLog->buf.n + 9) % pLog->szSector);
     if( nPad ) nPad = pLog->szSector - nPad;
     rc = lsmStringExtend(&pLog->buf, nPad);
     if( rc!=LSM_OK ) return rc;

     /* Emit padding records until nPad bytes have been added. A single
     ** remaining byte is a PAD1 record; otherwise a PAD2 record with up to
     ** 200 bytes of filler is written.  */
     while( nPad ){
       if( nPad==1 ){
         pLog->buf.z[pLog->buf.n++] = LSM_LOG_PAD1;
         nPad = 0;
       }else{
         int n = LSM_MIN(200, nPad-2);
         pLog->buf.z[pLog->buf.n++] = LSM_LOG_PAD2;
         pLog->buf.z[pLog->buf.n++] = (char)n;
         nPad -= 2;
         memset(&pLog->buf.z[pLog->buf.n], 0x2B, n);
         pLog->buf.n += n;
         nPad -= n;
       }
     }
   }

   /* Make sure there is room in the log-buffer to add the COMMIT record
   ** (type byte plus 8-byte checksum). Then add the first byte of it.  */
   rc = lsmStringExtend(&pLog->buf, 9);
   if( rc!=LSM_OK ) return rc;
   pLog->buf.z[pLog->buf.n++] = (char)eType;
   memset(&pLog->buf.z[pLog->buf.n], 0, 8);

   rc = logCksumAndFlush(pDb);

   /* If this is a commit and synchronous=full, sync the log to disk. */
   if( rc==LSM_OK && eType==LSM_LOG_COMMIT && pDb->eSafety==LSM_SAFETY_FULL ){
     rc = lsmFsSyncLog(pDb->pFS);
   }
   return rc;
 }

 /*
 ** Append a record to the database log. Parameter eType must be one of
 ** LSM_WRITE, LSM_DELETE or LSM_DRANGE. For LSM_DELETE the record holds
 ** only a key and nVal must be negative; for the other two types it holds
 ** both a key and a value and nVal must be zero or greater.
 **
 ** If logging is disabled (pDb->bUseLog==0) this is a no-op that returns
 ** LSM_OK. Otherwise LSM_OK is returned on success, or an LSM error code.
 */
 int lsmLogWrite(
   lsm_db *pDb,                    /* Database handle */
   int eType,
   void *pKey, int nKey,           /* Database key to write to log */
   void *pVal, int nVal            /* Database value (or nVal<0) to write */
 ){
   int rc;
   LogWriter *pLog;                /* Log object to write to */
   int nReq;                       /* Bytes of space required in log */
   int bCksum = 0;                 /* True to embed a checksum in this record */
   int bHasVal = (eType!=LSM_LOG_DELETE);  /* True if record carries a value */
   u8 *aOut;                       /* Write cursor within the log buffer */

   assert( eType==LSM_WRITE || eType==LSM_DELETE || eType==LSM_DRANGE );
   assert( LSM_LOG_WRITE==LSM_WRITE );
   assert( LSM_LOG_DELETE==LSM_DELETE );
   assert( LSM_LOG_DRANGE==LSM_DRANGE );
   assert( (eType==LSM_LOG_DELETE)==(nVal<0) );

   if( pDb->bUseLog==0 ) return LSM_OK;
   pLog = pDb->pLogWriter;

   /* Work out the space needed on the assumption that the record embeds a
   ** checksum, whether or not it ends up doing so.  */
   nReq = 1 + lsmVarintLen32(nKey) + 8 + nKey;
   if( bHasVal ) nReq += lsmVarintLen32(nVal) + nVal;

   /* Jump over the jump region if required. A checksum is embedded in the
   ** record if either (a) the jump is taken, or (b) adding this record
   ** would put more than LSM_CKSUM_MAXDATA bytes in the buffer.  */
   rc = jumpIfRequired(pDb, pLog, nReq, &bCksum);
   if( (pLog->buf.n+nReq) > LSM_CKSUM_MAXDATA ) bCksum = 1;

   if( rc==LSM_OK ) rc = lsmStringExtend(&pLog->buf, nReq);
   if( rc!=LSM_OK ) return rc;

   /* Record header: the type byte followed by either 1 (for DELETE) or
   ** 2 (for other types) varints. The low bit of the type byte selects
   ** the checksummed variant.  */
   aOut = (u8 *)&pLog->buf.z[pLog->buf.n];
   assert( LSM_LOG_WRITE_CKSUM == (LSM_LOG_WRITE | 0x0001) );
   assert( LSM_LOG_DELETE_CKSUM == (LSM_LOG_DELETE | 0x0001) );
   assert( LSM_LOG_DRANGE_CKSUM == (LSM_LOG_DRANGE | 0x0001) );
   *(aOut++) = (u8)eType | (u8)bCksum;
   aOut += lsmVarintPut32(aOut, nKey);
   if( bHasVal ) aOut += lsmVarintPut32(aOut, nVal);

   /* The embedded checksum covers everything up to and including the
   ** header. Writing it flushes and empties the buffer, so the key and
   ** value then go at the start of the buffer.  */
   if( bCksum ){
     pLog->buf.n = (aOut - (u8 *)pLog->buf.z);
     rc = logCksumAndFlush(pDb);
     aOut = (u8 *)&pLog->buf.z[pLog->buf.n];
   }

   /* Record payload: the key, then the value if there is one. */
   memcpy(aOut, pKey, nKey);
   aOut += nKey;
   if( bHasVal ){
     memcpy(aOut, pVal, nVal);
     aOut += nVal;
   }
   pLog->buf.n = aOut - (u8 *)pLog->buf.z;
   assert( pLog->buf.n<=pLog->buf.nAlloc );

   return rc;
 }

 /*
 ** Append an LSM_LOG_COMMIT record to the database log and flush it. If
 ** logging is disabled for this handle, do nothing and return LSM_OK.
 */
 int lsmLogCommit(lsm_db *pDb){
   return pDb->bUseLog ? logFlush(pDb, LSM_LOG_COMMIT) : LSM_OK;
 }

 /*
 ** Record the current log position in *pMark so that a later call to
 ** lsmLogSeek() can "rewind" the LogWriter to it. This is used when
 ** rolling back savepoint transactions.
 **
 ** The running checksum is first advanced over every complete 8-byte group
 ** in the write buffer. The mark then stores the resulting checksum, the
 ** current file offset, and a copy of the buffered bytes that follow the
 ** last complete group. Does nothing if logging is disabled.
 */
 void lsmLogTell(
   lsm_db *pDb,                    /* Database handle */
   LogMark *pMark                  /* Populate this object with current offset */
 ){
   LogWriter *pLog;
   int nAligned;                   /* Buffer size rounded down to 8 bytes */

   if( pDb->bUseLog==0 ) return;
   pLog = pDb->pLogWriter;

   nAligned = pLog->buf.n & 0xFFFFFFF8;
   logUpdateCksum(pLog, nAligned);
   assert( pLog->iCksumBuf==nAligned );

   pMark->iOff = pLog->iOff + pLog->buf.n;
   pMark->cksum0 = pLog->cksum0;
   pMark->cksum1 = pLog->cksum1;

   /* Save the bytes not yet covered by the checksum. */
   pMark->nBuf = pLog->buf.n - nAligned;
   memcpy(pMark->aBuf, &pLog->buf.z[nAligned], pMark->nBuf);
 }

 /*
 ** Seek (rewind) back to the log file offset stored by an earlier call to
 ** lsmLogTell() in *pMark. Does nothing if logging is disabled.
 **
 ** If the 8-byte-aligned mark offset still lies within the current write
 ** buffer, the buffer is simply truncated. Otherwise the buffer is rebuilt
 ** from the bytes saved in the mark and the write offset is moved back.
 ** In both cases the running checksum is restored from the mark.
 */
 void lsmLogSeek(
   lsm_db *pDb,                    /* Database handle */
   LogMark *pMark                  /* Object containing log offset to seek to */
 ){
   LogWriter *pLog;

   if( pDb->bUseLog==0 ) return;
   pLog = pDb->pLogWriter;

   assert( pMark->iOff<=pLog->iOff+pLog->buf.n );

   /* Round the mark offset down to a multiple of 8 by subtracting its low
   ** three bits. The previous "& 0xFFFFFFF8" used a 32-bit mask, which
   ** also cleared the upper 32 bits of a 64-bit offset; for a log
   ** position of 4GiB or more the test then failed and the else branch
   ** threw away buffered data that had not been written.  */
   if( (pMark->iOff - (pMark->iOff & 7))>=pLog->iOff ){
     /* The mark lies inside the current buffer: truncate the buffer. */
     pLog->buf.n = (int)(pMark->iOff - pLog->iOff);
     pLog->iCksumBuf = (pLog->buf.n & 0xFFFFFFF8);
   }else{
     /* The mark precedes the buffer: reload the saved unaligned tail. */
     pLog->buf.n = pMark->nBuf;
     memcpy(pLog->buf.z, pMark->aBuf, pMark->nBuf);
     pLog->iCksumBuf = 0;
     pLog->iOff = pMark->iOff - pMark->nBuf;
   }
   pLog->cksum0 = pMark->cksum0;
   pLog->cksum1 = pMark->cksum1;

   /* Clear the jump bookkeeping fields whose value is below the mark
   ** offset.  */
   if( pMark->iOff > pLog->iRegion1End ) pLog->iRegion1End = 0;
   if( pMark->iOff > pLog->iRegion2Start ) pLog->iRegion2Start = 0;
 }

 /*
 ** This function does the work for an lsm_info(LOG_STRUCTURE) request.
 **
 ** On success *pzVal is set to a string allocated by lsmMallocPrintf()
 ** holding six space-separated integers: the start and end offsets of log
 ** regions 0, 1 and 2, in that order, and LSM_OK is returned. On failure
 ** *pzVal is set to NULL and an LSM error code is returned.
 */
 int lsmInfoLogStructure(lsm_db *pDb, char **pzVal){
   int rc = LSM_OK;
   char *zVal = 0;
   DbLog *pLog;

   /* With no read or write transaction open, load the latest tree-header
   ** from shared-memory and, if necessary, update it based on the contents
   ** of the database header.
   **
   ** No locks are taken here - these are passive read operations only.
   */
   if( pDb->pCsr==0 && pDb->nTransOpen==0 ){
     rc = lsmTreeLoadHeader(pDb, 0);
     if( rc==LSM_OK ) rc = logReclaimSpace(pDb);
   }
   if( rc!=LSM_OK ){
     *pzVal = 0;
     return rc;
   }

   pLog = &pDb->treehdr.log;
   zVal = lsmMallocPrintf(pDb->pEnv,
       "%d %d %d %d %d %d",
       (int)pLog->aRegion[0].iStart, (int)pLog->aRegion[0].iEnd,
       (int)pLog->aRegion[1].iStart, (int)pLog->aRegion[1].iEnd,
       (int)pLog->aRegion[2].iStart, (int)pLog->aRegion[2].iEnd
   );
   if( zVal==0 ) rc = LSM_NOMEM_BKPT;

   *pzVal = zVal;
   return rc;
 }

 /*************************************************************************
 ** Begin code for log recovery.
 */

 /*
 ** State used to read the log file back sequentially. Data is pulled from
 ** the file into buf in fixed-size chunks by logReaderBlob(), which also
 ** maintains the running checksum (cksum0, cksum1) over the bytes consumed
 ** so far.
 */
 typedef struct LogReader LogReader;
 struct LogReader {
   FileSystem *pFS;                /* File system to read from */
   i64 iOff;                       /* File offset at end of buf content */
   int iBuf;                       /* Current read offset in buf */
   LsmString buf;                  /* Buffer containing file content */

   int iCksumBuf;                  /* Offset in buf corresponding to cksum[01] */
   u32 cksum0;                     /* Checksum 0 at offset iCksumBuf */
   u32 cksum1;                     /* Checksum 1 at offset iCksumBuf */
 };

 /*
 ** Read the next nBlob bytes from the log via reader p.
 **
 ** If *pRc is not LSM_OK when this function is called, it does nothing.
 ** Otherwise, if the whole blob is already available contiguously in the
 ** reader's internal buffer and ppBlob is not NULL, *ppBlob is set to point
 ** directly into that buffer (presumably only valid until the buffer is
 ** next refilled). If not, the bytes are accumulated in *pBuf - which is
 ** emptied first - and, if ppBlob is not NULL, *ppBlob is set to pBuf->z
 ** once all nBlob bytes have been copied.
 **
 ** Whenever the internal buffer has been completely consumed it is
 ** refilled: the running checksum is advanced over the whole 8-byte groups
 ** that have not yet been checksummed, any remaining bytes (fewer than 8)
 ** are carried over to the start of the buffer, and lsmFsReadLog() is
 ** called for the next LOG_READ_SIZE bytes of the file.
 **
 ** Any error is returned through *pRc.
 */
 static void logReaderBlob(
   LogReader *p,                   /* Log reader object */
   LsmString *pBuf,                /* Dynamic storage, if required */
   int nBlob,                      /* Number of bytes to read */
   u8 **ppBlob,                    /* OUT: Pointer to blob read */
   int *pRc                        /* IN/OUT: Error code */
 ){
   static const int LOG_READ_SIZE = 512;
   int rc = *pRc;                  /* Return code */
   int nReq = nBlob;               /* Bytes required */

   while( rc==LSM_OK && nReq>0 ){
     int nAvail;                   /* Bytes of data available in p->buf */
     if( p->buf.n==p->iBuf ){
       /* The buffer is exhausted. Checksum what can be checksummed, carry
       ** the rest forward, then read more data from the file.  */
       int nCksum;                 /* Total bytes requiring checksum */
       int nCarry = 0;             /* Trailing bytes (<8) carried to next buffer */

       nCksum = p->iBuf - p->iCksumBuf;
       if( nCksum>0 ){
         nCarry = nCksum % 8;
         nCksum = ((nCksum / 8) * 8);
         if( nCksum>0 ){
           logCksumUnaligned(
               &p->buf.z[p->iCksumBuf], nCksum, &p->cksum0, &p->cksum1
           );
         }
       }
       /* Move the not-yet-checksummed tail to the front of the buffer so
       ** that it is checksummed together with the data read next.  */
       if( nCarry>0 ) memcpy(p->buf.z, &p->buf.z[p->iBuf-nCarry], nCarry);
       p->buf.n = nCarry;
       p->iBuf = nCarry;

       /* NOTE(review): presumably lsmFsReadLog() appends the bytes it reads
       ** to p->buf after the nCarry bytes kept above - confirm.  */
       rc = lsmFsReadLog(p->pFS, p->iOff, LOG_READ_SIZE, &p->buf);
       if( rc!=LSM_OK ) break;
       p->iCksumBuf = 0;
       p->iOff += LOG_READ_SIZE;
     }

     nAvail = p->buf.n - p->iBuf;
     if( ppBlob && nReq==nBlob && nBlob<=nAvail ){
       /* Fast path: nothing copied yet and the whole blob is in the
       ** buffer, so hand back a pointer into the buffer itself.  */
       *ppBlob = (u8 *)&p->buf.z[p->iBuf];
       p->iBuf += nBlob;
       nReq = 0;
     }else{
       /* Slow path: copy as much as is available into pBuf. pBuf is reset
       ** to empty before the first chunk is appended.  */
       int nCopy = LSM_MIN(nAvail, nReq);
       if( nBlob==nReq ){
         pBuf->n = 0;
       }
       rc = lsmStringBinAppend(pBuf, (u8 *)&p->buf.z[p->iBuf], nCopy);
       nReq -= nCopy;
       p->iBuf += nCopy;
       if( nReq==0 && ppBlob ){
         *ppBlob = (u8*)pBuf->z;
       }
     }
   }

   *pRc = rc;
 }

 /*
 ** Read a 32-bit varint from the log at the reader's current position and
 ** store the decoded value in *piVal. This is a no-op if *pRc is not LSM_OK
 ** on entry. pBuf provides scratch storage that is used only when the
 ** reader's buffer still holds unread data.
 */
 static void logReaderVarint(
   LogReader *p,
   LsmString *pBuf,
   int *piVal,                     /* OUT: Value read from log */
   int *pRc                        /* IN/OUT: Error code */
 ){
   u8 *aData;                      /* Pointer to the encoded varint */

   if( *pRc!=LSM_OK ) return;

   if( p->iBuf!=p->buf.n ){
     /* Unread data is buffered. Its first byte gives the size of the
     ** encoded varint, so read exactly that many bytes. */
     int nVarint = lsmVarintSize(p->buf.z[p->iBuf]);
     logReaderBlob(p, pBuf, nVarint, &aData, pRc);
     if( *pRc==LSM_OK ){
       lsmVarintGet32(aData, piVal);
     }
     return;
   }

   /* The buffer is exhausted, so the size of the varint cannot be peeked.
   ** Request 10 bytes instead, decode, then move the read position back
   ** over the bytes that were not part of the varint. */
   logReaderBlob(p, 0, 10, &aData, pRc);
   if( *pRc==LSM_OK ){
     int nUsed = lsmVarintGet32(aData, piVal);
     p->iBuf -= (10 - nUsed);
   }
 }

 /*
 ** Read a single byte from the log into *pByte. If the underlying read does
 ** not produce a pointer (for example because *pRc already holds an error),
 ** *pByte is left unchanged.
 */
 static void logReaderByte(LogReader *p, u8 *pByte, int *pRc){
   u8 *aByte = 0;                  /* Set by logReaderBlob() on success */

   logReaderBlob(p, 0, 1, &aByte, pRc);
   if( aByte!=0 ){
     *pByte = aByte[0];
   }
 }

 /*
 ** Read an 8-byte checksum from the log and compare it against the running
 ** checksum held by the reader. *pbEof is set to 1 if the two differ and to
 ** 0 if they match. This is a no-op if *pRc is not LSM_OK on entry.
 */
 static void logReaderCksum(LogReader *p, LsmString *pBuf, int *pbEof, int *pRc){
   u8 *aStored = 0;                /* The 8 checksum bytes read from the log */
   int nPending;                   /* Consumed bytes not yet checksummed */

   if( *pRc!=LSM_OK ) return;

   /* Bring the in-memory (expected) checksum up to date with everything
   ** consumed so far. */
   nPending = p->iBuf - p->iCksumBuf;
   assert( nPending>=0 );
   logCksumUnaligned(&p->buf.z[p->iCksumBuf], nPending, &p->cksum0, &p->cksum1);

   /* Advance iCksumBuf past the stored checksum before reading it, so that
   ** a buffer refill inside logReaderBlob() does not fold the checksum
   ** bytes themselves into the running checksum. */
   p->iCksumBuf = p->iBuf + 8;
   logReaderBlob(p, pBuf, 8, &aStored, pRc);
   assert( aStored || *pRc );

   /* Compare the stored values with the expected ones. A mismatch is
   ** reported to the caller through *pbEof. */
   if( aStored ){
     u32 stored0 = lsmGetU32(aStored);
     u32 stored1 = lsmGetU32(&aStored[4]);
     *pbEof = !(stored0==p->cksum0 && stored1==p->cksum1);
     p->iCksumBuf = p->iBuf;
   }
 }

 /*
 ** Prepare LogReader p to read the log of database pDb, starting at the
 ** beginning of region 2 of pLog and seeded with the checksum values stored
 ** in pLog. If bInitBuf is true the reader's buffer is initialized first;
 ** otherwise it is simply reset to empty.
 */
 static void logReaderInit(
   lsm_db *pDb,                    /* Database handle */
   DbLog *pLog,                    /* Log object associated with pDb */
   int bInitBuf,                   /* True if p->buf is uninitialized */
   LogReader *p                    /* Initialize this LogReader object */
 ){
   /* Buffer state: empty, with nothing consumed and nothing checksummed */
   if( bInitBuf ){
     lsmStringInit(&p->buf, pDb->pEnv);
   }
   p->buf.n = 0;
   p->iBuf = 0;
   p->iCksumBuf = 0;

   /* Source file, starting offset and initial checksum values */
   p->pFS = pDb->pFS;
   p->iOff = pLog->aRegion[2].iStart;
   p->cksum0 = pLog->cksum0;
   p->cksum1 = pLog->cksum1;
 }

 /*
 ** Called once the header of a LOG_DELETE or LOG_WRITE record has been
 ** read. nByte is the combined size of the key and value that follow that
 ** header. Returns true if, given where the record sits relative to the
 ** last checksummed offset, the record ought to have carried a checksum:
 ** that is, if the un-checksummed span up to the end of the record would
 ** exceed LSM_CKSUM_MAXDATA bytes.
 */
 static int logRequireCksum(LogReader *p, int nByte){
   int nSpan = p->iBuf + nByte - p->iCksumBuf;
   return ( nSpan>LSM_CKSUM_MAXDATA );
 }

 /*
 ** Recover the contents of the log file.
 **
 ** The log is opened, the in-memory tree is initialized and the log offset
 ** stored in the checkpoint is loaded. If the log was opened (bOpen), it is
 ** then scanned in two passes:
 **
 **   Pass 0: Counts the LSM_LOG_COMMIT records whose checksums verify.
 **   Pass 1: Replays write/delete records into the in-memory tree, stopping
 **           after the number of commits counted by pass 0.
 **
 ** A scan stops at the first checksum mismatch, at a record that should
 ** have carried a checksum but did not, at an unrecognized record type, or
 ** on error.
 **
 ** Afterwards the DbLog object is updated with the end offset and checksum
 ** reached by the reader, and lsmFinishRecovery() is invoked whether or not
 ** an error occurred.
 **
 ** Returns LSM_OK on success, or the first error code encountered.
 */
 int lsmLogRecover(lsm_db *pDb){
   LsmString buf1;                 /* Key buffer */
   LsmString buf2;                 /* Value buffer */
   LogReader reader;               /* Log reader object */
   int rc = LSM_OK;                /* Return code */
   int nCommit = 0;                /* Number of transactions to recover */
   int iPass;
   int nJump = 0;                  /* Number of LSM_LOG_JUMP records in pass 0 */
   DbLog *pLog;
   int bOpen;

   rc = lsmFsOpenLog(pDb, &bOpen);
   if( rc!=LSM_OK ) return rc;

   rc = lsmTreeInit(pDb);
   if( rc!=LSM_OK ) return rc;

   pLog = &pDb->treehdr.log;
   lsmCheckpointLogoffset(pDb->pShmhdr->aSnap2, pLog);

   logReaderInit(pDb, pLog, 1, &reader);
   lsmStringInit(&buf1, pDb->pEnv);
   lsmStringInit(&buf2, pDb->pEnv);

   /* The outer for() loop runs at most twice. The first iteration is to
   ** count the number of committed transactions in the log. The second
   ** iterates through those transactions and updates the in-memory tree
   ** structure with their contents.  */
   if( bOpen ){
     for(iPass=0; iPass<2 && rc==LSM_OK; iPass++){
       int bEof = 0;

       while( rc==LSM_OK && !bEof ){
         u8 eType = 0;
         logReaderByte(&reader, &eType, &rc);

         switch( eType ){
           case LSM_LOG_PAD1:
             break;

           case LSM_LOG_PAD2: {
             /* Initialized so that a failed varint read does not leave an
             ** indeterminate value to be passed on below. */
             int nPad = 0;
             logReaderVarint(&reader, &buf1, &nPad, &rc);
             logReaderBlob(&reader, &buf1, nPad, 0, &rc);
             break;
           }

           case LSM_LOG_DRANGE:
           case LSM_LOG_DRANGE_CKSUM:
           case LSM_LOG_WRITE:
           case LSM_LOG_WRITE_CKSUM: {
             /* logReaderBlob() does not write its output pointer when asked
             ** for zero bytes, and logReaderVarint() does not write its
             ** output on error. Initialize everything so that no
             ** indeterminate value is ever read or passed to the tree. */
             int nKey = 0;
             int nVal = 0;
             u8 *aVal = 0;
             logReaderVarint(&reader, &buf1, &nKey, &rc);
             logReaderVarint(&reader, &buf2, &nVal, &rc);

             if( eType==LSM_LOG_WRITE_CKSUM || eType==LSM_LOG_DRANGE_CKSUM ){
               logReaderCksum(&reader, &buf1, &bEof, &rc);
             }else{
               bEof = logRequireCksum(&reader, nKey+nVal);
             }
             if( bEof ) break;

             logReaderBlob(&reader, &buf1, nKey, 0, &rc);
             logReaderBlob(&reader, &buf2, nVal, &aVal, &rc);
             if( iPass==1 && rc==LSM_OK ){
               if( eType==LSM_LOG_WRITE || eType==LSM_LOG_WRITE_CKSUM ){
                 rc = lsmTreeInsert(pDb, (u8 *)buf1.z, nKey, aVal, nVal);
               }else{
                 rc = lsmTreeDelete(pDb, (u8 *)buf1.z, nKey, aVal, nVal);
               }
             }
             break;
           }

           case LSM_LOG_DELETE:
           case LSM_LOG_DELETE_CKSUM: {
             /* See the note on initialization in the WRITE/DRANGE case. */
             int nKey = 0; u8 *aKey = 0;
             logReaderVarint(&reader, &buf1, &nKey, &rc);

             if( eType==LSM_LOG_DELETE_CKSUM ){
               logReaderCksum(&reader, &buf1, &bEof, &rc);
             }else{
               bEof = logRequireCksum(&reader, nKey);
             }
             if( bEof ) break;

             logReaderBlob(&reader, &buf1, nKey, &aKey, &rc);
             if( iPass==1 && rc==LSM_OK ){
               rc = lsmTreeInsert(pDb, aKey, nKey, NULL, -1);
             }
             break;
           }

           case LSM_LOG_COMMIT:
             logReaderCksum(&reader, &buf1, &bEof, &rc);
             if( bEof==0 ){
               /* In pass 0 nCommit counts up from zero. Before pass 1 it is
               ** negated, so in pass 1 it counts up towards zero and the
               ** scan stops once the last counted commit is replayed. */
               nCommit++;
               assert( nCommit>0 || iPass==1 );
               if( nCommit==0 ) bEof = 1;
             }
             break;

           case LSM_LOG_JUMP: {
             int iOff = 0;
             logReaderVarint(&reader, &buf1, &iOff, &rc);
             if( rc==LSM_OK ){
               if( iPass==1 ){
                 if( pLog->aRegion[2].iStart==0 ){
                   assert( pLog->aRegion[1].iStart==0 );
                   pLog->aRegion[1].iEnd = reader.iOff;
                 }else{
                   assert( pLog->aRegion[0].iStart==0 );
                   pLog->aRegion[0].iStart = pLog->aRegion[2].iStart;
                   pLog->aRegion[0].iEnd = reader.iOff-reader.buf.n+reader.iBuf;
                 }
                 pLog->aRegion[2].iStart = iOff;
               }else{
                 /* Pass 0: stop scanning at the third jump record seen */
                 if( (nJump++)==2 ){
                   bEof = 1;
                 }
               }

               /* Continue reading from the jump target. Truncating buf.n to
               ** iBuf discards any buffered bytes beyond the jump record so
               ** that the next read refills from the new offset. */
               reader.iOff = iOff;
               reader.buf.n = reader.iBuf;
             }
             break;
           }

           default:
             /* Including LSM_LOG_EOF */
             bEof = 1;
             break;
         }
       }

       if( rc==LSM_OK && iPass==0 ){
         if( nCommit==0 ){
           if( pLog->aRegion[2].iStart==0 ){
             /* Nothing to recover. Setting iPass to 1 ends the for() loop
             ** once it is incremented. */
             iPass = 1;
           }else{
             /* No commits found from the checkpointed offset. Reset the
             ** start offset to zero and set iPass to -1 so that the for()
             ** loop increment re-runs pass 0 from the start of the log. */
             pLog->aRegion[2].iStart = 0;
             iPass = -1;
             lsmCheckpointZeroLogoffset(pDb);
           }
         }
         logReaderInit(pDb, pLog, 0, &reader);
         nCommit = nCommit * -1;
       }
     }
   }

   /* Initialize DbLog object */
   if( rc==LSM_OK ){
     /* reader.iOff is the file offset of the end of the buffered content,
     ** so this expression is the file offset of the current read position. */
     pLog->aRegion[2].iEnd = reader.iOff - reader.buf.n + reader.iBuf;
     pLog->cksum0 = reader.cksum0;
     pLog->cksum1 = reader.cksum1;
   }

   /* lsmFinishRecovery() is always called, but its return code is only used
   ** if no earlier error has occurred. */
   if( rc==LSM_OK ){
     rc = lsmFinishRecovery(pDb);
   }else{
     lsmFinishRecovery(pDb);
   }

   if( pDb->bRoTrans ){
     lsmFsCloseLog(pDb);
   }

   lsmStringClear(&buf1);
   lsmStringClear(&buf2);
   lsmStringClear(&reader.buf);
   return rc;
 }

 /*
 ** Free the log writer object attached to database handle db, if there is
 ** one, together with its buffer, and clear the handle's pointer to it.
 */
 void lsmLogClose(lsm_db *db){
   if( db->pLogWriter==0 ) return;

   lsmFree(db->pEnv, db->pLogWriter->buf.z);
   lsmFree(db->pEnv, db->pLogWriter);
   db->pLogWriter = 0;
 }