| /* |
| ** 2011-08-18 |
| ** |
| ** The author disclaims copyright to this source code. In place of |
| ** a legal notice, here is a blessing: |
| ** |
| ** May you do good and not evil. |
| ** May you find forgiveness for yourself and forgive others. |
| ** May you share freely, never taking more than you give. |
| ** |
| ************************************************************************* |
| ** |
| ** This file contains the implementation of an in-memory tree structure. |
| ** |
| ** Technically the tree is a B-tree of order 4 (in the Knuth sense - each |
| ** node may have up to 4 children). Keys are stored within B-tree nodes by |
| ** reference. This may be slightly slower than a conventional red-black |
| ** tree, but it is simpler. It is also an easier structure to modify to |
| ** create a version that supports nested transaction rollback. |
| ** |
| ** This tree does not currently support a delete operation. One is not |
| ** required. When LSM deletes a key from a database, it inserts a DELETE |
| ** marker into the data structure. As a result, although the value associated |
| ** with a key stored in the in-memory tree structure may be modified, no |
| ** keys are ever removed. |
| */ |
| |
| /* |
| ** MVCC NOTES |
| ** |
| ** The in-memory tree structure supports SQLite-style MVCC. This means |
| ** that while one client is writing to the tree structure, other clients |
| ** may still be querying an older snapshot of the tree. |
| ** |
| ** One way to implement this is to use an append-only b-tree. In this |
| ** case instead of modifying nodes in-place, a copy of the node is made |
| ** and the required modifications made to the copy. The parent of the |
| ** node is then modified (to update the pointer so that it points to |
| ** the new copy), which causes a copy of the parent to be made, and so on. |
| ** This means that each time the tree is written to a new root node is |
| ** created. A snapshot is identified by the root node that it uses. |
| ** |
| ** The problem with the above is that each time the tree is written to, |
| ** a copy of the node structure modified and all of its ancestor nodes |
| ** is made. This may prove excessive with large tree structures. |
| ** |
| ** To reduce this overhead, the data structure used for a tree node is |
| ** designed so that it may be edited in place exactly once without |
| ** affecting existing users. In other words, the node structure is capable |
| ** of storing two separate versions of the node at the same time. |
| ** When a node is to be edited, if the node structure already contains |
| ** two versions, a copy is made as in the append-only approach. Or, if |
| ** it only contains a single version, it is edited in place. |
| ** |
| ** This reduces the overhead so that, roughly, one new node structure |
| ** must be allocated for each write (on top of those allocations that |
| ** would have been required by a non-MVCC tree). Logic: Assume that at |
| ** any time, 50% of nodes in the tree already contain 2 versions. When |
| ** a new entry is written to a node, there is a 50% chance that a copy |
| ** of the node will be required. And a 25% chance that a copy of its |
| ** parent is required. And so on. |
| ** |
| ** ROLLBACK |
| ** |
| ** The in-memory tree also supports transaction and sub-transaction |
| ** rollback. In order to rollback to point in time X, the following is |
| ** necessary: |
| ** |
| ** 1. All memory allocated since X must be freed, and |
| ** 2. All "v2" data adding to nodes that existed at X should be zeroed. |
| ** 3. The root node must be restored to its X value. |
| ** |
| ** The Mempool object used to allocate memory for the tree supports |
| ** operation (1) - see the lsmPoolMark() and lsmPoolRevert() functions. |
| ** |
| ** To support (2), all nodes that have v2 data are part of a singly linked |
| ** list, sorted by the age of the v2 data (nodes that have had data added |
| ** most recently are at the end of the list). So to zero all v2 data added |
| ** since X, the linked list is traversed from the first node added following |
| ** X onwards. |
| ** |
| */ |
| |
| #ifndef _LSM_INT_H |
| # include "lsmInt.h" |
| #endif |
| |
| #include <string.h> |
| |
| #define MAX_DEPTH 32 |
| |
| typedef struct TreeKey TreeKey; |
| typedef struct TreeNode TreeNode; |
| typedef struct TreeLeaf TreeLeaf; |
| typedef struct NodeVersion NodeVersion; |
| |
| struct TreeOld { |
| u32 iShmid; /* Last shared-memory chunk in use by old */ |
| u32 iRoot; /* Offset of root node in shm file */ |
| u32 nHeight; /* Height of tree structure */ |
| }; |
| |
| #if 0 |
| /* |
| ** assert() that a TreeKey.flags value is sane. Usage: |
| ** |
| ** assert( lsmAssertFlagsOk(pTreeKey->flags) ); |
| */ |
| static int lsmAssertFlagsOk(u8 keyflags){ |
| /* At least one flag must be set. Otherwise, what is this key doing? */ |
| assert( keyflags!=0 ); |
| |
| /* The POINT_DELETE and INSERT flags cannot both be set. */ |
| assert( (keyflags & LSM_POINT_DELETE)==0 || (keyflags & LSM_INSERT)==0 ); |
| |
| /* If both the START_DELETE and END_DELETE flags are set, then the INSERT |
| ** flag must also be set. In other words - the three DELETE flags cannot |
| ** all be set */ |
| assert( (keyflags & LSM_END_DELETE)==0 |
| || (keyflags & LSM_START_DELETE)==0 |
| || (keyflags & LSM_POINT_DELETE)==0 |
| ); |
| |
| return 1; |
| } |
| #endif |
| static int assert_delete_ranges_match(lsm_db *); |
| static int treeCountEntries(lsm_db *db); |
| |
| /* |
| ** Container for a key-value pair. Within the *-shm file, each key/value |
| ** pair is stored in a single allocation (which may not actually be |
| ** contiguous in memory). Layout is the TreeKey structure, followed by |
| ** the nKey bytes of key blob, followed by the nValue bytes of value blob |
| ** (if nValue is non-negative). |
| */ |
| struct TreeKey { |
| int nKey; /* Size of pKey in bytes */ |
| int nValue; /* Size of pValue. Or negative. */ |
| u8 flags; /* Various LSM_XXX flags */ |
| }; |
| |
| #define TKV_KEY(p) ((void *)&(p)[1]) |
| #define TKV_VAL(p) ((void *)(((u8 *)&(p)[1]) + (p)->nKey)) |
| |
| /* |
| ** A single tree node. A node structure may contain up to 3 key/value |
| ** pairs. Internal (non-leaf) nodes have up to 4 children. |
| ** |
| ** TODO: Update the format of this to be more compact. Get it working |
| ** first though... |
| */ |
| struct TreeNode { |
| u32 aiKeyPtr[3]; /* Array of pointers to TreeKey objects */ |
| |
| /* The following fields are present for interior nodes only, not leaves. */ |
| u32 aiChildPtr[4]; /* Array of pointers to child nodes */ |
| |
| /* The extra child pointer slot. */ |
| u32 iV2; /* Transaction number of v2 */ |
| u8 iV2Child; /* apChild[] entry replaced by pV2Ptr */ |
| u32 iV2Ptr; /* Substitute pointer */ |
| }; |
| |
| struct TreeLeaf { |
| u32 aiKeyPtr[3]; /* Array of pointers to TreeKey objects */ |
| }; |
| |
| typedef struct TreeBlob TreeBlob; |
| struct TreeBlob { |
| int n; |
| u8 *a; |
| }; |
| |
| /* |
| ** Cursor for searching a tree structure. |
| ** |
| ** If a cursor does not point to any element (a.k.a. EOF), then the |
| ** TreeCursor.iNode variable is set to a negative value. Otherwise, the |
| ** cursor currently points to key aiCell[iNode] on node apTreeNode[iNode]. |
| ** |
| ** Entries in the apTreeNode[] and aiCell[] arrays contain the node and |
| ** index of the TreeNode.apChild[] pointer followed to descend to the |
| ** current element. Hence apTreeNode[0] always contains the root node of |
| ** the tree. |
| */ |
| struct TreeCursor { |
| lsm_db *pDb; /* Database handle for this cursor */ |
| TreeRoot *pRoot; /* Root node and height of tree to access */ |
| int iNode; /* Cursor points at apTreeNode[iNode] */ |
| TreeNode *apTreeNode[MAX_DEPTH];/* Current position in tree */ |
| u8 aiCell[MAX_DEPTH]; /* Current position in tree */ |
| TreeKey *pSave; /* Saved key */ |
| TreeBlob blob; /* Dynamic storage for a key */ |
| }; |
| |
| /* |
| ** A value guaranteed to be larger than the largest possible transaction |
| ** id (TreeHeader.iTransId). |
| */ |
| #define WORKING_VERSION (1<<30) |
| |
| static int tblobGrow(lsm_db *pDb, TreeBlob *p, int n, int *pRc){ |
| if( n>p->n ){ |
| lsmFree(pDb->pEnv, p->a); |
| p->a = lsmMallocRc(pDb->pEnv, n, pRc); |
| p->n = n; |
| } |
| return (p->a==0); |
| } |
| static void tblobFree(lsm_db *pDb, TreeBlob *p){ |
| lsmFree(pDb->pEnv, p->a); |
| } |
| |
| |
| /*********************************************************************** |
| ** Start of IntArray methods. */ |
| /* |
| ** Append value iVal to the contents of IntArray *p. Return LSM_OK if |
| ** successful, or LSM_NOMEM if an OOM condition is encountered. |
| */ |
| static int intArrayAppend(lsm_env *pEnv, IntArray *p, u32 iVal){ |
| assert( p->nArray<=p->nAlloc ); |
| if( p->nArray>=p->nAlloc ){ |
| u32 *aNew; |
| int nNew = p->nArray ? p->nArray*2 : 128; |
| aNew = lsmRealloc(pEnv, p->aArray, nNew*sizeof(u32)); |
| if( !aNew ) return LSM_NOMEM_BKPT; |
| p->aArray = aNew; |
| p->nAlloc = nNew; |
| } |
| |
| p->aArray[p->nArray++] = iVal; |
| return LSM_OK; |
| } |
| |
| /* |
| ** Zero the IntArray object. |
| */ |
| static void intArrayFree(lsm_env *pEnv, IntArray *p){ |
| p->nArray = 0; |
| } |
| |
| /* |
| ** Return the number of entries currently in the int-array object. |
| */ |
| static int intArraySize(IntArray *p){ |
| return p->nArray; |
| } |
| |
| /* |
| ** Return a copy of the iIdx'th entry in the int-array. |
| */ |
| static u32 intArrayEntry(IntArray *p, int iIdx){ |
| return p->aArray[iIdx]; |
| } |
| |
| /* |
| ** Truncate the int-array so that all but the first nVal values are |
| ** discarded. |
| */ |
| static void intArrayTruncate(IntArray *p, int nVal){ |
| p->nArray = nVal; |
| } |
| /* End of IntArray methods. |
| ***********************************************************************/ |
| |
| static int treeKeycmp(void *p1, int n1, void *p2, int n2){ |
| int res; |
| res = memcmp(p1, p2, LSM_MIN(n1, n2)); |
| if( res==0 ) res = (n1-n2); |
| return res; |
| } |
| |
| /* |
| ** The pointer passed as the first argument points to an interior node, |
| ** not a leaf. This function returns the offset of the iCell'th child |
| ** sub-tree of the node. |
| */ |
| static u32 getChildPtr(TreeNode *p, int iVersion, int iCell){ |
| assert( iVersion>=0 ); |
| assert( iCell>=0 && iCell<=array_size(p->aiChildPtr) ); |
| if( p->iV2 && p->iV2<=(u32)iVersion && iCell==p->iV2Child ) return p->iV2Ptr; |
| return p->aiChildPtr[iCell]; |
| } |
| |
| /* |
| ** Given an offset within the *-shm file, return the associated chunk number. |
| */ |
| static int treeOffsetToChunk(u32 iOff){ |
| assert( LSM_SHM_CHUNK_SIZE==(1<<15) ); |
| return (int)(iOff>>15); |
| } |
| |
| #define treeShmptrUnsafe(pDb, iPtr) \ |
| (&((u8*)((pDb)->apShm[(iPtr)>>15]))[(iPtr) & (LSM_SHM_CHUNK_SIZE-1)]) |
| |
| /* |
| ** Return a pointer to the mapped memory location associated with *-shm |
| ** file offset iPtr. |
| */ |
| static void *treeShmptr(lsm_db *pDb, u32 iPtr){ |
| |
| assert( (iPtr>>15)<(u32)pDb->nShm ); |
| assert( pDb->apShm[iPtr>>15] ); |
| |
| return iPtr ? treeShmptrUnsafe(pDb, iPtr) : 0; |
| } |
| |
| static ShmChunk * treeShmChunk(lsm_db *pDb, int iChunk){ |
| return (ShmChunk *)(pDb->apShm[iChunk]); |
| } |
| |
| static ShmChunk * treeShmChunkRc(lsm_db *pDb, int iChunk, int *pRc){ |
| assert( *pRc==LSM_OK ); |
| if( iChunk<pDb->nShm || LSM_OK==(*pRc = lsmShmCacheChunks(pDb, iChunk+1)) ){ |
| return (ShmChunk *)(pDb->apShm[iChunk]); |
| } |
| return 0; |
| } |
| |
| |
| #ifndef NDEBUG |
| static void assertIsWorkingChild( |
| lsm_db *db, |
| TreeNode *pNode, |
| TreeNode *pParent, |
| int iCell |
| ){ |
| TreeNode *p; |
| u32 iPtr = getChildPtr(pParent, WORKING_VERSION, iCell); |
| p = treeShmptr(db, iPtr); |
| assert( p==pNode ); |
| } |
| #else |
| # define assertIsWorkingChild(w,x,y,z) |
| #endif |
| |
| /* Values for the third argument to treeShmkey(). */ |
| #define TKV_LOADKEY 1 |
| #define TKV_LOADVAL 2 |
| |
| static TreeKey *treeShmkey( |
| lsm_db *pDb, /* Database handle */ |
| u32 iPtr, /* Shmptr to TreeKey struct */ |
| int eLoad, /* Either zero or a TREEKEY_LOADXXX value */ |
| TreeBlob *pBlob, /* Used if dynamic memory is required */ |
| int *pRc /* IN/OUT: Error code */ |
| ){ |
| TreeKey *pRet; |
| |
| assert( eLoad==TKV_LOADKEY || eLoad==TKV_LOADVAL ); |
| pRet = (TreeKey *)treeShmptr(pDb, iPtr); |
| if( pRet ){ |
| int nReq; /* Bytes of space required at pRet */ |
| int nAvail; /* Bytes of space available at pRet */ |
| |
| nReq = sizeof(TreeKey) + pRet->nKey; |
| if( eLoad==TKV_LOADVAL && pRet->nValue>0 ){ |
| nReq += pRet->nValue; |
| } |
| assert( LSM_SHM_CHUNK_SIZE==(1<<15) ); |
| nAvail = LSM_SHM_CHUNK_SIZE - (iPtr & (LSM_SHM_CHUNK_SIZE-1)); |
| |
| if( nAvail<nReq ){ |
| if( tblobGrow(pDb, pBlob, nReq, pRc)==0 ){ |
| int nLoad = 0; |
| while( *pRc==LSM_OK ){ |
| ShmChunk *pChunk; |
| void *p = treeShmptr(pDb, iPtr); |
| int n = LSM_MIN(nAvail, nReq-nLoad); |
| |
| memcpy(&pBlob->a[nLoad], p, n); |
| nLoad += n; |
| if( nLoad==nReq ) break; |
| |
| pChunk = treeShmChunk(pDb, treeOffsetToChunk(iPtr)); |
| assert( pChunk ); |
| iPtr = (pChunk->iNext * LSM_SHM_CHUNK_SIZE) + LSM_SHM_CHUNK_HDR; |
| nAvail = LSM_SHM_CHUNK_SIZE - LSM_SHM_CHUNK_HDR; |
| } |
| } |
| pRet = (TreeKey *)(pBlob->a); |
| } |
| } |
| |
| return pRet; |
| } |
| |
| #if defined(LSM_DEBUG) && defined(LSM_EXPENSIVE_ASSERT) |
| void assert_leaf_looks_ok(TreeNode *pNode){ |
| assert( pNode->apKey[1] ); |
| } |
| |
| void assert_node_looks_ok(TreeNode *pNode, int nHeight){ |
| if( pNode ){ |
| assert( pNode->apKey[1] ); |
| if( nHeight>1 ){ |
| int i; |
| assert( getChildPtr(pNode, WORKING_VERSION, 1) ); |
| assert( getChildPtr(pNode, WORKING_VERSION, 2) ); |
| for(i=0; i<4; i++){ |
| assert_node_looks_ok(getChildPtr(pNode, WORKING_VERSION, i), nHeight-1); |
| } |
| } |
| } |
| } |
| |
| /* |
| ** Run various assert() statements to check that the working-version of the |
| ** tree is correct in the following respects: |
| ** |
| ** * todo... |
| */ |
| void assert_tree_looks_ok(int rc, Tree *pTree){ |
| } |
| #else |
| # define assert_tree_looks_ok(x,y) |
| #endif |
| |
| void lsmFlagsToString(int flags, char *zFlags){ |
| |
| zFlags[0] = (flags & LSM_END_DELETE) ? ']' : '.'; |
| |
| /* Only one of LSM_POINT_DELETE, LSM_INSERT and LSM_SEPARATOR should ever |
| ** be set. If this is not true, write a '?' to the output. */ |
| switch( flags & (LSM_POINT_DELETE|LSM_INSERT|LSM_SEPARATOR) ){ |
| case 0: zFlags[1] = '.'; break; |
| case LSM_POINT_DELETE: zFlags[1] = '-'; break; |
| case LSM_INSERT: zFlags[1] = '+'; break; |
| case LSM_SEPARATOR: zFlags[1] = '^'; break; |
| default: zFlags[1] = '?'; break; |
| } |
| |
| zFlags[2] = (flags & LSM_SYSTEMKEY) ? '*' : '.'; |
| zFlags[3] = (flags & LSM_START_DELETE) ? '[' : '.'; |
| zFlags[4] = '\0'; |
| } |
| |
| #ifdef LSM_DEBUG |
| |
| /* |
| ** Pointer pBlob points to a buffer containing a blob of binary data |
| ** nBlob bytes long. Append the contents of this blob to *pStr, with |
| ** each octet represented by a 2-digit hexadecimal number. For example, |
| ** if the input blob is three bytes in size and contains {0x01, 0x44, 0xFF}, |
| ** then "0144ff" is appended to *pStr. |
| */ |
| static void lsmAppendStrBlob(LsmString *pStr, void *pBlob, int nBlob){ |
| int i; |
| lsmStringExtend(pStr, nBlob*2); |
| if( pStr->nAlloc==0 ) return; |
| for(i=0; i<nBlob; i++){ |
| u8 c = ((u8*)pBlob)[i]; |
| if( c>='a' && c<='z' ){ |
| pStr->z[pStr->n++] = c; |
| }else if( c!=0 || nBlob==1 || i!=(nBlob-1) ){ |
| pStr->z[pStr->n++] = "0123456789abcdef"[(c>>4)&0xf]; |
| pStr->z[pStr->n++] = "0123456789abcdef"[c&0xf]; |
| } |
| } |
| pStr->z[pStr->n] = 0; |
| } |
| |
| #if 0 /* NOT USED */ |
| /* |
| ** Append nIndent space (0x20) characters to string *pStr. |
| */ |
| static void lsmAppendIndent(LsmString *pStr, int nIndent){ |
| int i; |
| lsmStringExtend(pStr, nIndent); |
| for(i=0; i<nIndent; i++) lsmStringAppend(pStr, " ", 1); |
| } |
| #endif |
| |
| static void strAppendFlags(LsmString *pStr, u8 flags){ |
| char zFlags[8]; |
| |
| lsmFlagsToString(flags, zFlags); |
| zFlags[4] = ':'; |
| |
| lsmStringAppend(pStr, zFlags, 5); |
| } |
| |
| void dump_node_contents( |
| lsm_db *pDb, |
| u32 iNode, /* Print out the contents of this node */ |
| char *zPath, /* Path from root to this node */ |
| int nPath, /* Number of bytes in zPath */ |
| int nHeight /* Height: (0==leaf) (1==parent-of-leaf) */ |
| ){ |
| const char *zSpace = " "; |
| int i; |
| int rc = LSM_OK; |
| LsmString s; |
| TreeNode *pNode; |
| TreeBlob b = {0, 0}; |
| |
| pNode = (TreeNode *)treeShmptr(pDb, iNode); |
| |
| if( nHeight==0 ){ |
| /* Append the nIndent bytes of space to string s. */ |
| lsmStringInit(&s, pDb->pEnv); |
| |
| /* Append each key to string s. */ |
| for(i=0; i<3; i++){ |
| u32 iPtr = pNode->aiKeyPtr[i]; |
| if( iPtr ){ |
| TreeKey *pKey = treeShmkey(pDb, pNode->aiKeyPtr[i],TKV_LOADKEY, &b,&rc); |
| strAppendFlags(&s, pKey->flags); |
| lsmAppendStrBlob(&s, TKV_KEY(pKey), pKey->nKey); |
| lsmStringAppend(&s, " ", -1); |
| } |
| } |
| |
| printf("% 6d %.*sleaf%.*s: %s\n", |
| iNode, nPath, zPath, 20-nPath-4, zSpace, s.z |
| ); |
| lsmStringClear(&s); |
| }else{ |
| for(i=0; i<4 && nHeight>0; i++){ |
| u32 iPtr = getChildPtr(pNode, pDb->treehdr.root.iTransId, i); |
| zPath[nPath] = (char)(i+'0'); |
| zPath[nPath+1] = '/'; |
| |
| if( iPtr ){ |
| dump_node_contents(pDb, iPtr, zPath, nPath+2, nHeight-1); |
| } |
| if( i!=3 && pNode->aiKeyPtr[i] ){ |
| TreeKey *pKey = treeShmkey(pDb, pNode->aiKeyPtr[i], TKV_LOADKEY,&b,&rc); |
| lsmStringInit(&s, pDb->pEnv); |
| strAppendFlags(&s, pKey->flags); |
| lsmAppendStrBlob(&s, TKV_KEY(pKey), pKey->nKey); |
| printf("% 6d %.*s%.*s: %s\n", |
| iNode, nPath+1, zPath, 20-nPath-1, zSpace, s.z); |
| lsmStringClear(&s); |
| } |
| } |
| } |
| |
| tblobFree(pDb, &b); |
| } |
| |
| void dump_tree_contents(lsm_db *pDb, const char *zCaption){ |
| char zPath[64]; |
| TreeRoot *p = &pDb->treehdr.root; |
| printf("\n%s\n", zCaption); |
| zPath[0] = '/'; |
| if( p->iRoot ){ |
| dump_node_contents(pDb, p->iRoot, zPath, 1, p->nHeight-1); |
| } |
| fflush(stdout); |
| } |
| |
| #endif |
| |
| /* |
| ** Initialize a cursor object, the space for which has already been |
| ** allocated. |
| */ |
| static void treeCursorInit(lsm_db *pDb, int bOld, TreeCursor *pCsr){ |
| memset(pCsr, 0, sizeof(TreeCursor)); |
| pCsr->pDb = pDb; |
| if( bOld ){ |
| pCsr->pRoot = &pDb->treehdr.oldroot; |
| }else{ |
| pCsr->pRoot = &pDb->treehdr.root; |
| } |
| pCsr->iNode = -1; |
| } |
| |
| /* |
| ** Return a pointer to the mapping of the TreeKey object that the cursor |
| ** is pointing to. |
| */ |
| static TreeKey *csrGetKey(TreeCursor *pCsr, TreeBlob *pBlob, int *pRc){ |
| TreeKey *pRet; |
| lsm_db *pDb = pCsr->pDb; |
| u32 iPtr = pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[pCsr->aiCell[pCsr->iNode]]; |
| |
| assert( iPtr ); |
| pRet = (TreeKey*)treeShmptrUnsafe(pDb, iPtr); |
| if( !(pRet->flags & LSM_CONTIGUOUS) ){ |
| pRet = treeShmkey(pDb, iPtr, TKV_LOADVAL, pBlob, pRc); |
| } |
| |
| return pRet; |
| } |
| |
| /* |
| ** Save the current position of tree cursor pCsr. |
| */ |
| int lsmTreeCursorSave(TreeCursor *pCsr){ |
| int rc = LSM_OK; |
| if( pCsr && pCsr->pSave==0 ){ |
| int iNode = pCsr->iNode; |
| if( iNode>=0 ){ |
| pCsr->pSave = csrGetKey(pCsr, &pCsr->blob, &rc); |
| } |
| pCsr->iNode = -1; |
| } |
| return rc; |
| } |
| |
| /* |
| ** Restore the position of a saved tree cursor. |
| */ |
| static int treeCursorRestore(TreeCursor *pCsr, int *pRes){ |
| int rc = LSM_OK; |
| if( pCsr->pSave ){ |
| TreeKey *pKey = pCsr->pSave; |
| pCsr->pSave = 0; |
| if( pRes ){ |
| rc = lsmTreeCursorSeek(pCsr, TKV_KEY(pKey), pKey->nKey, pRes); |
| } |
| } |
| return rc; |
| } |
| |
| /* |
| ** Allocate nByte bytes of space within the *-shm file. If successful, |
| ** return LSM_OK and set *piPtr to the offset within the file at which |
| ** the allocated space is located. |
| */ |
| static u32 treeShmalloc(lsm_db *pDb, int bAlign, int nByte, int *pRc){ |
| u32 iRet = 0; |
| if( *pRc==LSM_OK ){ |
| const static int CHUNK_SIZE = LSM_SHM_CHUNK_SIZE; |
| const static int CHUNK_HDR = LSM_SHM_CHUNK_HDR; |
| u32 iWrite; /* Current write offset */ |
| u32 iEof; /* End of current chunk */ |
| int iChunk; /* Current chunk */ |
| |
| assert( nByte <= (CHUNK_SIZE-CHUNK_HDR) ); |
| |
| /* Check if there is enough space on the current chunk to fit the |
| ** new allocation. If not, link in a new chunk and put the new |
| ** allocation at the start of it. */ |
| iWrite = pDb->treehdr.iWrite; |
| if( bAlign ){ |
| iWrite = (iWrite + 3) & ~0x0003; |
| assert( (iWrite % 4)==0 ); |
| } |
| |
| assert( iWrite ); |
| iChunk = treeOffsetToChunk(iWrite-1); |
| iEof = (iChunk+1) * CHUNK_SIZE; |
| assert( iEof>=iWrite && (iEof-iWrite)<(u32)CHUNK_SIZE ); |
| if( (iWrite+nByte)>iEof ){ |
| ShmChunk *pHdr; /* Header of chunk just finished (iChunk) */ |
| ShmChunk *pFirst; /* Header of chunk treehdr.iFirst */ |
| ShmChunk *pNext; /* Header of new chunk */ |
| int iNext = 0; /* Next chunk */ |
| int rc = LSM_OK; |
| |
| pFirst = treeShmChunk(pDb, pDb->treehdr.iFirst); |
| |
| assert( shm_sequence_ge(pDb->treehdr.iUsedShmid, pFirst->iShmid) ); |
| assert( (pDb->treehdr.iNextShmid+1-pDb->treehdr.nChunk)==pFirst->iShmid ); |
| |
| /* Check if the chunk at the start of the linked list is still in |
| ** use. If not, reuse it. If so, allocate a new chunk by appending |
| ** to the *-shm file. */ |
| if( pDb->treehdr.iUsedShmid!=pFirst->iShmid ){ |
| int bInUse; |
| rc = lsmTreeInUse(pDb, pFirst->iShmid, &bInUse); |
| if( rc!=LSM_OK ){ |
| *pRc = rc; |
| return 0; |
| } |
| if( bInUse==0 ){ |
| iNext = pDb->treehdr.iFirst; |
| pDb->treehdr.iFirst = pFirst->iNext; |
| assert( pDb->treehdr.iFirst ); |
| } |
| } |
| if( iNext==0 ) iNext = pDb->treehdr.nChunk++; |
| |
| /* Set the header values for the new chunk */ |
| pNext = treeShmChunkRc(pDb, iNext, &rc); |
| if( pNext ){ |
| pNext->iNext = 0; |
| pNext->iShmid = (pDb->treehdr.iNextShmid++); |
| }else{ |
| *pRc = rc; |
| return 0; |
| } |
| |
| /* Set the header values for the chunk just finished */ |
| pHdr = (ShmChunk *)treeShmptr(pDb, iChunk*CHUNK_SIZE); |
| pHdr->iNext = iNext; |
| |
| /* Advance to the next chunk */ |
| iWrite = iNext * CHUNK_SIZE + CHUNK_HDR; |
| } |
| |
| /* Allocate space at iWrite. */ |
| iRet = iWrite; |
| pDb->treehdr.iWrite = iWrite + nByte; |
| pDb->treehdr.root.nByte += nByte; |
| } |
| return iRet; |
| } |
| |
| /* |
| ** Allocate and zero nByte bytes of space within the *-shm file. |
| */ |
| static void *treeShmallocZero(lsm_db *pDb, int nByte, u32 *piPtr, int *pRc){ |
| u32 iPtr; |
| void *p; |
| iPtr = treeShmalloc(pDb, 1, nByte, pRc); |
| p = treeShmptr(pDb, iPtr); |
| if( p ){ |
| assert( *pRc==LSM_OK ); |
| memset(p, 0, nByte); |
| *piPtr = iPtr; |
| } |
| return p; |
| } |
| |
| static TreeNode *newTreeNode(lsm_db *pDb, u32 *piPtr, int *pRc){ |
| return treeShmallocZero(pDb, sizeof(TreeNode), piPtr, pRc); |
| } |
| |
| static TreeLeaf *newTreeLeaf(lsm_db *pDb, u32 *piPtr, int *pRc){ |
| return treeShmallocZero(pDb, sizeof(TreeLeaf), piPtr, pRc); |
| } |
| |
| static TreeKey *newTreeKey( |
| lsm_db *pDb, |
| u32 *piPtr, |
| void *pKey, int nKey, /* Key data */ |
| void *pVal, int nVal, /* Value data (or nVal<0 for delete) */ |
| int *pRc |
| ){ |
| TreeKey *p; |
| u32 iPtr; |
| u32 iEnd; |
| int nRem; |
| u8 *a; |
| int n; |
| |
| /* Allocate space for the TreeKey structure itself */ |
| *piPtr = iPtr = treeShmalloc(pDb, 1, sizeof(TreeKey), pRc); |
| p = treeShmptr(pDb, iPtr); |
| if( *pRc ) return 0; |
| p->nKey = nKey; |
| p->nValue = nVal; |
| |
| /* Allocate and populate the space required for the key and value. */ |
| n = nRem = nKey; |
| a = (u8 *)pKey; |
| while( a ){ |
| while( nRem>0 ){ |
| u8 *aAlloc; |
| int nAlloc; |
| u32 iWrite; |
| |
| iWrite = (pDb->treehdr.iWrite & (LSM_SHM_CHUNK_SIZE-1)); |
| iWrite = LSM_MAX(iWrite, LSM_SHM_CHUNK_HDR); |
| nAlloc = LSM_MIN((LSM_SHM_CHUNK_SIZE-iWrite), (u32)nRem); |
| |
| aAlloc = treeShmptr(pDb, treeShmalloc(pDb, 0, nAlloc, pRc)); |
| if( aAlloc==0 ) break; |
| memcpy(aAlloc, &a[n-nRem], nAlloc); |
| nRem -= nAlloc; |
| } |
| a = pVal; |
| n = nRem = nVal; |
| pVal = 0; |
| } |
| |
| iEnd = iPtr + sizeof(TreeKey) + nKey + LSM_MAX(0, nVal); |
| if( (iPtr & ~(LSM_SHM_CHUNK_SIZE-1))!=(iEnd & ~(LSM_SHM_CHUNK_SIZE-1)) ){ |
| p->flags = 0; |
| }else{ |
| p->flags = LSM_CONTIGUOUS; |
| } |
| |
| if( *pRc ) return 0; |
| #if 0 |
| printf("store: %d %s\n", (int)iPtr, (char *)pKey); |
| #endif |
| return p; |
| } |
| |
| static TreeNode *copyTreeNode( |
| lsm_db *pDb, |
| TreeNode *pOld, |
| u32 *piNew, |
| int *pRc |
| ){ |
| TreeNode *pNew; |
| |
| pNew = newTreeNode(pDb, piNew, pRc); |
| if( pNew ){ |
| memcpy(pNew->aiKeyPtr, pOld->aiKeyPtr, sizeof(pNew->aiKeyPtr)); |
| memcpy(pNew->aiChildPtr, pOld->aiChildPtr, sizeof(pNew->aiChildPtr)); |
| if( pOld->iV2 ) pNew->aiChildPtr[pOld->iV2Child] = pOld->iV2Ptr; |
| } |
| return pNew; |
| } |
| |
| static TreeNode *copyTreeLeaf( |
| lsm_db *pDb, |
| TreeLeaf *pOld, |
| u32 *piNew, |
| int *pRc |
| ){ |
| TreeLeaf *pNew; |
| pNew = newTreeLeaf(pDb, piNew, pRc); |
| if( pNew ){ |
| memcpy(pNew, pOld, sizeof(TreeLeaf)); |
| } |
| return (TreeNode *)pNew; |
| } |
| |
| /* |
| ** The tree cursor passed as the second argument currently points to an |
| ** internal node (not a leaf). Specifically, to a sub-tree pointer. This |
| ** function replaces the sub-tree that the cursor currently points to |
| ** with sub-tree pNew. |
| ** |
| ** The sub-tree may be replaced either by writing the "v2 data" on the |
| ** internal node, or by allocating a new TreeNode structure and then |
| ** calling this function on the parent of the internal node. |
| */ |
| static int treeUpdatePtr(lsm_db *pDb, TreeCursor *pCsr, u32 iNew){ |
| int rc = LSM_OK; |
| if( pCsr->iNode<0 ){ |
| /* iNew is the new root node */ |
| pDb->treehdr.root.iRoot = iNew; |
| }else{ |
| /* If this node already has version 2 content, allocate a copy and |
| ** update the copy with the new pointer value. Otherwise, store the |
| ** new pointer as v2 data within the current node structure. */ |
| |
| TreeNode *p; /* The node to be modified */ |
| int iChildPtr; /* apChild[] entry to modify */ |
| |
| p = pCsr->apTreeNode[pCsr->iNode]; |
| iChildPtr = pCsr->aiCell[pCsr->iNode]; |
| |
| if( p->iV2 ){ |
| /* The "allocate new TreeNode" option */ |
| u32 iCopy; |
| TreeNode *pCopy; |
| pCopy = copyTreeNode(pDb, p, &iCopy, &rc); |
| if( pCopy ){ |
| assert( rc==LSM_OK ); |
| pCopy->aiChildPtr[iChildPtr] = iNew; |
| pCsr->iNode--; |
| rc = treeUpdatePtr(pDb, pCsr, iCopy); |
| } |
| }else{ |
| /* The "v2 data" option */ |
| u32 iPtr; |
| assert( pDb->treehdr.root.iTransId>0 ); |
| |
| if( pCsr->iNode ){ |
| iPtr = getChildPtr( |
| pCsr->apTreeNode[pCsr->iNode-1], |
| pDb->treehdr.root.iTransId, pCsr->aiCell[pCsr->iNode-1] |
| ); |
| }else{ |
| iPtr = pDb->treehdr.root.iRoot; |
| } |
| rc = intArrayAppend(pDb->pEnv, &pDb->rollback, iPtr); |
| |
| if( rc==LSM_OK ){ |
| p->iV2 = pDb->treehdr.root.iTransId; |
| p->iV2Child = (u8)iChildPtr; |
| p->iV2Ptr = iNew; |
| } |
| } |
| } |
| |
| return rc; |
| } |
| |
| /* |
| ** Cursor pCsr points at a node that is part of pTree. This function |
| ** inserts a new key and optionally child node pointer into that node. |
| ** |
| ** The position into which the new key and pointer are inserted is |
| ** determined by the iSlot parameter. The new key will be inserted to |
| ** the left of the key currently stored in apKey[iSlot]. Or, if iSlot is |
| ** greater than the index of the rightmost key in the node. |
| ** |
| ** Pointer pLeftPtr points to a child tree that contains keys that are |
| ** smaller than pTreeKey. |
| */ |
| static int treeInsert( |
| lsm_db *pDb, /* Database handle */ |
| TreeCursor *pCsr, /* Cursor indicating path to insert at */ |
| u32 iLeftPtr, /* Left child pointer */ |
| u32 iTreeKey, /* Location of key to insert */ |
| u32 iRightPtr, /* Right child pointer */ |
| int iSlot /* Position to insert key into */ |
| ){ |
| int rc = LSM_OK; |
| TreeNode *pNode = pCsr->apTreeNode[pCsr->iNode]; |
| |
| /* Check if the node is currently full. If so, split pNode in two and |
| ** call this function recursively to add a key to the parent. Otherwise, |
| ** insert the new key directly into pNode. */ |
| assert( pNode->aiKeyPtr[1] ); |
| if( pNode->aiKeyPtr[0] && pNode->aiKeyPtr[2] ){ |
| u32 iLeft; TreeNode *pLeft; /* New left-hand sibling node */ |
| u32 iRight; TreeNode *pRight; /* New right-hand sibling node */ |
| |
| pLeft = newTreeNode(pDb, &iLeft, &rc); |
| pRight = newTreeNode(pDb, &iRight, &rc); |
| if( rc ) return rc; |
| |
| pLeft->aiChildPtr[1] = getChildPtr(pNode, WORKING_VERSION, 0); |
| pLeft->aiKeyPtr[1] = pNode->aiKeyPtr[0]; |
| pLeft->aiChildPtr[2] = getChildPtr(pNode, WORKING_VERSION, 1); |
| |
| pRight->aiChildPtr[1] = getChildPtr(pNode, WORKING_VERSION, 2); |
| pRight->aiKeyPtr[1] = pNode->aiKeyPtr[2]; |
| pRight->aiChildPtr[2] = getChildPtr(pNode, WORKING_VERSION, 3); |
| |
| if( pCsr->iNode==0 ){ |
| /* pNode is the root of the tree. Grow the tree by one level. */ |
| u32 iRoot; TreeNode *pRoot; /* New root node */ |
| |
| pRoot = newTreeNode(pDb, &iRoot, &rc); |
| pRoot->aiKeyPtr[1] = pNode->aiKeyPtr[1]; |
| pRoot->aiChildPtr[1] = iLeft; |
| pRoot->aiChildPtr[2] = iRight; |
| |
| pDb->treehdr.root.iRoot = iRoot; |
| pDb->treehdr.root.nHeight++; |
| }else{ |
| |
| pCsr->iNode--; |
| rc = treeInsert(pDb, pCsr, |
| iLeft, pNode->aiKeyPtr[1], iRight, pCsr->aiCell[pCsr->iNode] |
| ); |
| } |
| |
| assert( pLeft->iV2==0 ); |
| assert( pRight->iV2==0 ); |
| switch( iSlot ){ |
| case 0: |
| pLeft->aiKeyPtr[0] = iTreeKey; |
| pLeft->aiChildPtr[0] = iLeftPtr; |
| if( iRightPtr ) pLeft->aiChildPtr[1] = iRightPtr; |
| break; |
| case 1: |
| pLeft->aiChildPtr[3] = (iRightPtr ? iRightPtr : pLeft->aiChildPtr[2]); |
| pLeft->aiKeyPtr[2] = iTreeKey; |
| pLeft->aiChildPtr[2] = iLeftPtr; |
| break; |
| case 2: |
| pRight->aiKeyPtr[0] = iTreeKey; |
| pRight->aiChildPtr[0] = iLeftPtr; |
| if( iRightPtr ) pRight->aiChildPtr[1] = iRightPtr; |
| break; |
| case 3: |
| pRight->aiChildPtr[3] = (iRightPtr ? iRightPtr : pRight->aiChildPtr[2]); |
| pRight->aiKeyPtr[2] = iTreeKey; |
| pRight->aiChildPtr[2] = iLeftPtr; |
| break; |
| } |
| |
| }else{ |
| TreeNode *pNew; |
| u32 *piKey; |
| u32 *piChild; |
| u32 iStore = 0; |
| u32 iNew = 0; |
| int i; |
| |
| /* Allocate a new version of node pNode. */ |
| pNew = newTreeNode(pDb, &iNew, &rc); |
| if( rc ) return rc; |
| |
| piKey = pNew->aiKeyPtr; |
| piChild = pNew->aiChildPtr; |
| |
| for(i=0; i<iSlot; i++){ |
| if( pNode->aiKeyPtr[i] ){ |
| *(piKey++) = pNode->aiKeyPtr[i]; |
| *(piChild++) = getChildPtr(pNode, WORKING_VERSION, i); |
| } |
| } |
| |
| *piKey++ = iTreeKey; |
| *piChild++ = iLeftPtr; |
| |
| iStore = iRightPtr; |
| for(i=iSlot; i<3; i++){ |
| if( pNode->aiKeyPtr[i] ){ |
| *(piKey++) = pNode->aiKeyPtr[i]; |
| *(piChild++) = iStore ? iStore : getChildPtr(pNode, WORKING_VERSION, i); |
| iStore = 0; |
| } |
| } |
| |
| if( iStore ){ |
| *piChild = iStore; |
| }else{ |
| *piChild = getChildPtr(pNode, WORKING_VERSION, |
| (pNode->aiKeyPtr[2] ? 3 : 2) |
| ); |
| } |
| pCsr->iNode--; |
| rc = treeUpdatePtr(pDb, pCsr, iNew); |
| } |
| |
| return rc; |
| } |
| |
| static int treeInsertLeaf( |
| lsm_db *pDb, /* Database handle */ |
| TreeCursor *pCsr, /* Cursor structure */ |
| u32 iTreeKey, /* Key pointer to insert */ |
| int iSlot /* Insert key to the left of this */ |
| ){ |
| int rc = LSM_OK; /* Return code */ |
| TreeNode *pLeaf = pCsr->apTreeNode[pCsr->iNode]; |
| TreeLeaf *pNew; |
| u32 iNew; |
| |
| assert( iSlot>=0 && iSlot<=4 ); |
| assert( pCsr->iNode>0 ); |
| assert( pLeaf->aiKeyPtr[1] ); |
| |
| pCsr->iNode--; |
| |
| pNew = newTreeLeaf(pDb, &iNew, &rc); |
| if( pNew ){ |
| if( pLeaf->aiKeyPtr[0] && pLeaf->aiKeyPtr[2] ){ |
| /* The leaf is full. Split it in two. */ |
| TreeLeaf *pRight; |
| u32 iRight; |
| pRight = newTreeLeaf(pDb, &iRight, &rc); |
| if( pRight ){ |
| assert( rc==LSM_OK ); |
| pNew->aiKeyPtr[1] = pLeaf->aiKeyPtr[0]; |
| pRight->aiKeyPtr[1] = pLeaf->aiKeyPtr[2]; |
| switch( iSlot ){ |
| case 0: pNew->aiKeyPtr[0] = iTreeKey; break; |
| case 1: pNew->aiKeyPtr[2] = iTreeKey; break; |
| case 2: pRight->aiKeyPtr[0] = iTreeKey; break; |
| case 3: pRight->aiKeyPtr[2] = iTreeKey; break; |
| } |
| |
| rc = treeInsert(pDb, pCsr, iNew, pLeaf->aiKeyPtr[1], iRight, |
| pCsr->aiCell[pCsr->iNode] |
| ); |
| } |
| }else{ |
| int iOut = 0; |
| int i; |
| for(i=0; i<4; i++){ |
| if( i==iSlot ) pNew->aiKeyPtr[iOut++] = iTreeKey; |
| if( i<3 && pLeaf->aiKeyPtr[i] ){ |
| pNew->aiKeyPtr[iOut++] = pLeaf->aiKeyPtr[i]; |
| } |
| } |
| rc = treeUpdatePtr(pDb, pCsr, iNew); |
| } |
| } |
| |
| return rc; |
| } |
| |
| void lsmTreeMakeOld(lsm_db *pDb){ |
| |
| /* A write transaction must be open. Otherwise the code below that |
| ** assumes (pDb->pClient->iLogOff) is current may malfunction. |
| ** |
| ** Update: currently this assert fails due to lsm_flush(), which does |
| ** not set nTransOpen. |
| */ |
| assert( /* pDb->nTransOpen>0 && */ pDb->iReader>=0 ); |
| |
| if( pDb->treehdr.iOldShmid==0 ){ |
| pDb->treehdr.iOldLog = (pDb->treehdr.log.aRegion[2].iEnd << 1); |
| pDb->treehdr.iOldLog |= (~(pDb->pClient->iLogOff) & (i64)0x0001); |
| |
| pDb->treehdr.oldcksum0 = pDb->treehdr.log.cksum0; |
| pDb->treehdr.oldcksum1 = pDb->treehdr.log.cksum1; |
| pDb->treehdr.iOldShmid = pDb->treehdr.iNextShmid-1; |
| memcpy(&pDb->treehdr.oldroot, &pDb->treehdr.root, sizeof(TreeRoot)); |
| |
| pDb->treehdr.root.iTransId = 1; |
| pDb->treehdr.root.iRoot = 0; |
| pDb->treehdr.root.nHeight = 0; |
| pDb->treehdr.root.nByte = 0; |
| } |
| } |
| |
| void lsmTreeDiscardOld(lsm_db *pDb){ |
| assert( lsmShmAssertLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL) |
| || lsmShmAssertLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_EXCL) |
| ); |
| pDb->treehdr.iUsedShmid = pDb->treehdr.iOldShmid; |
| pDb->treehdr.iOldShmid = 0; |
| } |
| |
| int lsmTreeHasOld(lsm_db *pDb){ |
| return pDb->treehdr.iOldShmid!=0; |
| } |
| |
| /* |
| ** This function is called during recovery to initialize the |
| ** tree header. Only the database connections private copy of the tree-header |
| ** is initialized here - it will be copied into shared memory if log file |
| ** recovery is successful. |
| */ |
| int lsmTreeInit(lsm_db *pDb){ |
| ShmChunk *pOne; |
| int rc = LSM_OK; |
| |
| memset(&pDb->treehdr, 0, sizeof(TreeHeader)); |
| pDb->treehdr.root.iTransId = 1; |
| pDb->treehdr.iFirst = 1; |
| pDb->treehdr.nChunk = 2; |
| pDb->treehdr.iWrite = LSM_SHM_CHUNK_SIZE + LSM_SHM_CHUNK_HDR; |
| pDb->treehdr.iNextShmid = 2; |
| pDb->treehdr.iUsedShmid = 1; |
| |
| pOne = treeShmChunkRc(pDb, 1, &rc); |
| if( pOne ){ |
| pOne->iNext = 0; |
| pOne->iShmid = 1; |
| } |
| return rc; |
| } |
| |
| static void treeHeaderChecksum( |
| TreeHeader *pHdr, |
| u32 *aCksum |
| ){ |
| u32 cksum1 = 0x12345678; |
| u32 cksum2 = 0x9ABCDEF0; |
| u32 *a = (u32 *)pHdr; |
| int i; |
| |
| assert( (offsetof(TreeHeader, aCksum) + sizeof(u32)*2)==sizeof(TreeHeader) ); |
| assert( (sizeof(TreeHeader) % (sizeof(u32)*2))==0 ); |
| |
| for(i=0; i<(offsetof(TreeHeader, aCksum) / sizeof(u32)); i+=2){ |
| cksum1 += a[i]; |
| cksum2 += (cksum1 + a[i+1]); |
| } |
| aCksum[0] = cksum1; |
| aCksum[1] = cksum2; |
| } |
| |
| /* |
| ** Return true if the checksum stored in TreeHeader object *pHdr is |
| ** consistent with the contents of its other fields. |
| */ |
| static int treeHeaderChecksumOk(TreeHeader *pHdr){ |
| u32 aCksum[2]; |
| treeHeaderChecksum(pHdr, aCksum); |
| return (0==memcmp(aCksum, pHdr->aCksum, sizeof(aCksum))); |
| } |
| |
| /* |
| ** This type is used by functions lsmTreeRepair() and treeSortByShmid() to |
| ** make relinking the linked list of shared-memory chunks easier. |
| */ |
| typedef struct ShmChunkLoc ShmChunkLoc; |
| struct ShmChunkLoc { |
| ShmChunk *pShm; |
| u32 iLoc; |
| }; |
| |
| /* |
| ** This function checks that the linked list of shared memory chunks |
| ** that starts at chunk db->treehdr.iFirst: |
| ** |
| ** 1) Includes all chunks in the shared-memory region, and |
| ** 2) Links them together in order of ascending shm-id. |
| ** |
| ** If no error occurs and the conditions above are met, LSM_OK is returned. |
| ** |
| ** If either of the conditions are untrue, LSM_CORRUPT is returned. Or, if |
| ** an error is encountered before the checks are completed, another LSM error |
| ** code (i.e. LSM_IOERR or LSM_NOMEM) may be returned. |
| */ |
| static int treeCheckLinkedList(lsm_db *db){ |
| int rc = LSM_OK; |
| int nVisit = 0; |
| ShmChunk *p; |
| |
| p = treeShmChunkRc(db, db->treehdr.iFirst, &rc); |
| while( rc==LSM_OK && p ){ |
| if( p->iNext ){ |
| if( p->iNext>=db->treehdr.nChunk ){ |
| rc = LSM_CORRUPT_BKPT; |
| }else{ |
| ShmChunk *pNext = treeShmChunkRc(db, p->iNext, &rc); |
| if( rc==LSM_OK ){ |
| if( pNext->iShmid!=p->iShmid+1 ){ |
| rc = LSM_CORRUPT_BKPT; |
| } |
| p = pNext; |
| } |
| } |
| }else{ |
| p = 0; |
| } |
| nVisit++; |
| } |
| |
| if( rc==LSM_OK && (u32)nVisit!=db->treehdr.nChunk-1 ){ |
| rc = LSM_CORRUPT_BKPT; |
| } |
| return rc; |
| } |
| |
| /* |
| ** Iterate through the current in-memory tree. If there are any v2-pointers |
| ** with transaction ids larger than db->treehdr.iTransId, zero them. |
| */ |
| static int treeRepairPtrs(lsm_db *db){ |
| int rc = LSM_OK; |
| |
| if( db->treehdr.root.nHeight>1 ){ |
| TreeCursor csr; /* Cursor used to iterate through tree */ |
| u32 iTransId = db->treehdr.root.iTransId; |
| |
| /* Initialize the cursor structure. Also decrement the nHeight variable |
| ** in the tree-header. This will prevent the cursor from visiting any |
| ** leaf nodes. */ |
| db->treehdr.root.nHeight--; |
| treeCursorInit(db, 0, &csr); |
| |
| rc = lsmTreeCursorEnd(&csr, 0); |
| while( rc==LSM_OK && lsmTreeCursorValid(&csr) ){ |
| TreeNode *pNode = csr.apTreeNode[csr.iNode]; |
| if( pNode->iV2>iTransId ){ |
| pNode->iV2Child = 0; |
| pNode->iV2Ptr = 0; |
| pNode->iV2 = 0; |
| } |
| rc = lsmTreeCursorNext(&csr); |
| } |
| tblobFree(csr.pDb, &csr.blob); |
| |
| db->treehdr.root.nHeight++; |
| } |
| |
| return rc; |
| } |
| |
| static int treeRepairList(lsm_db *db){ |
| int rc = LSM_OK; |
| int i; |
| ShmChunk *p; |
| ShmChunk *pMin = 0; |
| u32 iMin = 0; |
| |
| /* Iterate through all shm chunks. Find the smallest shm-id present in |
| ** the shared-memory region. */ |
| for(i=1; rc==LSM_OK && (u32)i<db->treehdr.nChunk; i++){ |
| p = treeShmChunkRc(db, i, &rc); |
| if( p && (pMin==0 || shm_sequence_ge(pMin->iShmid, p->iShmid)) ){ |
| pMin = p; |
| iMin = i; |
| } |
| } |
| |
| /* Fix the shm-id values on any chunks with a shm-id greater than or |
| ** equal to treehdr.iNextShmid. Then do a merge-sort of all chunks to |
| ** fix the ShmChunk.iNext pointers. |
| */ |
| if( rc==LSM_OK ){ |
| int nSort; |
| int nByte; |
| u32 iPrevShmid; |
| ShmChunkLoc *aSort; |
| |
| /* Allocate space for a merge sort. */ |
| nSort = 1; |
| while( (u32)nSort < (db->treehdr.nChunk-1) ) nSort = nSort * 2; |
| nByte = sizeof(ShmChunkLoc) * nSort * 2; |
| aSort = lsmMallocZeroRc(db->pEnv, nByte, &rc); |
| iPrevShmid = pMin->iShmid; |
| |
| /* Fix all shm-ids, if required. */ |
| if( rc==LSM_OK ){ |
| iPrevShmid = pMin->iShmid-1; |
| for(i=1; (u32)i<db->treehdr.nChunk; i++){ |
| p = treeShmChunk(db, i); |
| aSort[i-1].pShm = p; |
| aSort[i-1].iLoc = i; |
| if( (u32)i!=db->treehdr.iFirst ){ |
| if( shm_sequence_ge(p->iShmid, db->treehdr.iNextShmid) ){ |
| p->iShmid = iPrevShmid--; |
| } |
| } |
| } |
| if( iMin!=db->treehdr.iFirst ){ |
| p = treeShmChunk(db, db->treehdr.iFirst); |
| p->iShmid = iPrevShmid; |
| } |
| } |
| |
| if( rc==LSM_OK ){ |
| ShmChunkLoc *aSpace = &aSort[nSort]; |
| for(i=0; i<nSort; i++){ |
| if( aSort[i].pShm ){ |
| assert( shm_sequence_ge(aSort[i].pShm->iShmid, iPrevShmid) ); |
| assert( aSpace[aSort[i].pShm->iShmid - iPrevShmid].pShm==0 ); |
| aSpace[aSort[i].pShm->iShmid - iPrevShmid] = aSort[i]; |
| } |
| } |
| |
| if( aSpace[nSort-1].pShm ) aSpace[nSort-1].pShm->iNext = 0; |
| for(i=0; i<nSort-1; i++){ |
| if( aSpace[i].pShm ){ |
| aSpace[i].pShm->iNext = aSpace[i+1].iLoc; |
| } |
| } |
| |
| rc = treeCheckLinkedList(db); |
| lsmFree(db->pEnv, aSort); |
| } |
| } |
| |
| return rc; |
| } |
| |
| /* |
| ** This function is called as part of opening a write-transaction if the |
| ** writer-flag is already set - indicating that the previous writer |
| ** failed before ending its transaction. |
| */ |
| int lsmTreeRepair(lsm_db *db){ |
| int rc = LSM_OK; |
| TreeHeader hdr; |
| ShmHeader *pHdr = db->pShmhdr; |
| |
| /* Ensure that the two tree-headers are consistent. Copy one over the other |
| ** if necessary. Prefer the data from a tree-header for which the checksum |
| ** computes. Or, if they both compute, prefer tree-header-1. */ |
| if( memcmp(&pHdr->hdr1, &pHdr->hdr2, sizeof(TreeHeader)) ){ |
| if( treeHeaderChecksumOk(&pHdr->hdr1) ){ |
| memcpy(&pHdr->hdr2, &pHdr->hdr1, sizeof(TreeHeader)); |
| }else{ |
| memcpy(&pHdr->hdr1, &pHdr->hdr2, sizeof(TreeHeader)); |
| } |
| } |
| |
| /* Save the connections current copy of the tree-header. It will be |
| ** restored before returning. */ |
| memcpy(&hdr, &db->treehdr, sizeof(TreeHeader)); |
| |
| /* Walk the tree. Zero any v2 pointers with a transaction-id greater than |
| ** the transaction-id currently in the tree-headers. */ |
| rc = treeRepairPtrs(db); |
| |
| /* Repair the linked list of shared-memory chunks. */ |
| if( rc==LSM_OK ){ |
| rc = treeRepairList(db); |
| } |
| |
| memcpy(&db->treehdr, &hdr, sizeof(TreeHeader)); |
| return rc; |
| } |
| |
| static void treeOverwriteKey(lsm_db *db, TreeCursor *pCsr, u32 iKey, int *pRc){ |
| if( *pRc==LSM_OK ){ |
| TreeRoot *p = &db->treehdr.root; |
| TreeNode *pNew; |
| u32 iNew; |
| TreeNode *pNode = pCsr->apTreeNode[pCsr->iNode]; |
| int iCell = pCsr->aiCell[pCsr->iNode]; |
| |
| /* Create a copy of this node */ |
| if( (pCsr->iNode>0 && (u32)pCsr->iNode==(p->nHeight-1)) ){ |
| pNew = copyTreeLeaf(db, (TreeLeaf *)pNode, &iNew, pRc); |
| }else{ |
| pNew = copyTreeNode(db, pNode, &iNew, pRc); |
| } |
| |
| if( pNew ){ |
| /* Modify the value in the new version */ |
| pNew->aiKeyPtr[iCell] = iKey; |
| |
| /* Change the pointer in the parent (if any) to point at the new |
| ** TreeNode */ |
| pCsr->iNode--; |
| treeUpdatePtr(db, pCsr, iNew); |
| } |
| } |
| } |
| |
| static int treeNextIsEndDelete(lsm_db *db, TreeCursor *pCsr){ |
| int iNode = pCsr->iNode; |
| int iCell = pCsr->aiCell[iNode]+1; |
| |
| /* Cursor currently points to a leaf node. */ |
| assert( (u32)pCsr->iNode==(db->treehdr.root.nHeight-1) ); |
| |
| while( iNode>=0 ){ |
| TreeNode *pNode = pCsr->apTreeNode[iNode]; |
| if( iCell<3 && pNode->aiKeyPtr[iCell] ){ |
| int rc = LSM_OK; |
| TreeKey *pKey = treeShmptr(db, pNode->aiKeyPtr[iCell]); |
| assert( rc==LSM_OK ); |
| return ((pKey->flags & LSM_END_DELETE) ? 1 : 0); |
| } |
| iNode--; |
| iCell = pCsr->aiCell[iNode]; |
| } |
| |
| return 0; |
| } |
| |
| static int treePrevIsStartDelete(lsm_db *db, TreeCursor *pCsr){ |
| int iNode = pCsr->iNode; |
| |
| /* Cursor currently points to a leaf node. */ |
| assert( (u32)pCsr->iNode==(db->treehdr.root.nHeight-1) ); |
| |
| while( iNode>=0 ){ |
| TreeNode *pNode = pCsr->apTreeNode[iNode]; |
| int iCell = pCsr->aiCell[iNode]-1; |
| if( iCell>=0 && pNode->aiKeyPtr[iCell] ){ |
| int rc = LSM_OK; |
| TreeKey *pKey = treeShmptr(db, pNode->aiKeyPtr[iCell]); |
| assert( rc==LSM_OK ); |
| return ((pKey->flags & LSM_START_DELETE) ? 1 : 0); |
| } |
| iNode--; |
| } |
| |
| return 0; |
| } |
| |
| |
| static int treeInsertEntry( |
| lsm_db *pDb, /* Database handle */ |
| int flags, /* Flags associated with entry */ |
| void *pKey, /* Pointer to key data */ |
| int nKey, /* Size of key data in bytes */ |
| void *pVal, /* Pointer to value data (or NULL) */ |
| int nVal /* Bytes in value data (or -ve for delete) */ |
| ){ |
| int rc = LSM_OK; /* Return Code */ |
| TreeKey *pTreeKey; /* New key-value being inserted */ |
| u32 iTreeKey; |
| TreeRoot *p = &pDb->treehdr.root; |
| TreeCursor csr; /* Cursor to seek to pKey/nKey */ |
| int res = 0; /* Result of seek operation on csr */ |
| |
| assert( nVal>=0 || pVal==0 ); |
| assert_tree_looks_ok(LSM_OK, pTree); |
| assert( flags==LSM_INSERT || flags==LSM_POINT_DELETE |
| || flags==LSM_START_DELETE || flags==LSM_END_DELETE |
| ); |
| assert( (flags & LSM_CONTIGUOUS)==0 ); |
| #if 0 |
| dump_tree_contents(pDb, "before"); |
| #endif |
| |
| if( p->iRoot ){ |
| TreeKey *pRes; /* Key at end of seek operation */ |
| treeCursorInit(pDb, 0, &csr); |
| |
| /* Seek to the leaf (or internal node) that the new key belongs on */ |
| rc = lsmTreeCursorSeek(&csr, pKey, nKey, &res); |
| pRes = csrGetKey(&csr, &csr.blob, &rc); |
| if( rc!=LSM_OK ) return rc; |
| assert( pRes ); |
| |
| if( flags==LSM_START_DELETE ){ |
| /* When inserting a start-delete-range entry, if the key that |
| ** occurs immediately before the new entry is already a START_DELETE, |
| ** then the new entry is not required. */ |
| if( (res<=0 && (pRes->flags & LSM_START_DELETE)) |
| || (res>0 && treePrevIsStartDelete(pDb, &csr)) |
| ){ |
| goto insert_entry_out; |
| } |
| }else if( flags==LSM_END_DELETE ){ |
| /* When inserting an start-delete-range entry, if the key that |
| ** occurs immediately after the new entry is already an END_DELETE, |
| ** then the new entry is not required. */ |
| if( (res<0 && treeNextIsEndDelete(pDb, &csr)) |
| || (res>=0 && (pRes->flags & LSM_END_DELETE)) |
| ){ |
| goto insert_entry_out; |
| } |
| } |
| |
| if( res==0 && (flags & (LSM_END_DELETE|LSM_START_DELETE)) ){ |
| if( pRes->flags & LSM_INSERT ){ |
| nVal = pRes->nValue; |
| pVal = TKV_VAL(pRes); |
| } |
| flags = flags | pRes->flags; |
| } |
| |
| if( flags & (LSM_INSERT|LSM_POINT_DELETE) ){ |
| if( (res<0 && (pRes->flags & LSM_START_DELETE)) |
| || (res>0 && (pRes->flags & LSM_END_DELETE)) |
| ){ |
| flags = flags | (LSM_END_DELETE|LSM_START_DELETE); |
| }else if( res==0 ){ |
| flags = flags | (pRes->flags & (LSM_END_DELETE|LSM_START_DELETE)); |
| } |
| } |
| }else{ |
| memset(&csr, 0, sizeof(TreeCursor)); |
| } |
| |
| /* Allocate and populate a new key-value pair structure */ |
| pTreeKey = newTreeKey(pDb, &iTreeKey, pKey, nKey, pVal, nVal, &rc); |
| if( rc!=LSM_OK ) return rc; |
| assert( pTreeKey->flags==0 || pTreeKey->flags==LSM_CONTIGUOUS ); |
| pTreeKey->flags |= flags; |
| |
| if( p->iRoot==0 ){ |
| /* The tree is completely empty. Add a new root node and install |
| ** (pKey/nKey) as the middle entry. Even though it is a leaf at the |
| ** moment, use newTreeNode() to allocate the node (i.e. allocate enough |
| ** space for the fields used by interior nodes). This is because the |
| ** treeInsert() routine may convert this node to an interior node. */ |
| TreeNode *pRoot = newTreeNode(pDb, &p->iRoot, &rc); |
| if( rc==LSM_OK ){ |
| assert( p->nHeight==0 ); |
| pRoot->aiKeyPtr[1] = iTreeKey; |
| p->nHeight = 1; |
| } |
| }else{ |
| if( res==0 ){ |
| /* The search found a match within the tree. */ |
| treeOverwriteKey(pDb, &csr, iTreeKey, &rc); |
| }else{ |
| /* The cursor now points to the leaf node into which the new entry should |
| ** be inserted. There may or may not be a free slot within the leaf for |
| ** the new key-value pair. |
| ** |
| ** iSlot is set to the index of the key within pLeaf that the new key |
| ** should be inserted to the left of (or to a value 1 greater than the |
| ** index of the rightmost key if the new key is larger than all keys |
| ** currently stored in the node). |
| */ |
| int iSlot = csr.aiCell[csr.iNode] + (res<0); |
| if( csr.iNode==0 ){ |
| rc = treeInsert(pDb, &csr, 0, iTreeKey, 0, iSlot); |
| }else{ |
| rc = treeInsertLeaf(pDb, &csr, iTreeKey, iSlot); |
| } |
| } |
| } |
| |
| #if 0 |
| dump_tree_contents(pDb, "after"); |
| #endif |
| insert_entry_out: |
| tblobFree(pDb, &csr.blob); |
| assert_tree_looks_ok(rc, pTree); |
| return rc; |
| } |
| |
| /* |
| ** Insert a new entry into the in-memory tree. |
| ** |
| ** If the value of the 5th parameter, nVal, is negative, then a delete-marker |
| ** is inserted into the tree. In this case the value pointer, pVal, must be |
| ** NULL. |
| */ |
| int lsmTreeInsert( |
| lsm_db *pDb, /* Database handle */ |
| void *pKey, /* Pointer to key data */ |
| int nKey, /* Size of key data in bytes */ |
| void *pVal, /* Pointer to value data (or NULL) */ |
| int nVal /* Bytes in value data (or -ve for delete) */ |
| ){ |
| int flags; |
| if( nVal<0 ){ |
| flags = LSM_POINT_DELETE; |
| }else{ |
| flags = LSM_INSERT; |
| } |
| |
| return treeInsertEntry(pDb, flags, pKey, nKey, pVal, nVal); |
| } |
| |
| static int treeDeleteEntry(lsm_db *db, TreeCursor *pCsr, u32 iNewptr){ |
| TreeRoot *p = &db->treehdr.root; |
| TreeNode *pNode = pCsr->apTreeNode[pCsr->iNode]; |
| int iSlot = pCsr->aiCell[pCsr->iNode]; |
| int bLeaf; |
| int rc = LSM_OK; |
| |
| assert( pNode->aiKeyPtr[1] ); |
| assert( pNode->aiKeyPtr[iSlot] ); |
| assert( iSlot==0 || iSlot==1 || iSlot==2 ); |
| assert( ((u32)pCsr->iNode==(db->treehdr.root.nHeight-1))==(iNewptr==0) ); |
| |
| bLeaf = ((u32)pCsr->iNode==(p->nHeight-1) && p->nHeight>1); |
| |
| if( pNode->aiKeyPtr[0] || pNode->aiKeyPtr[2] ){ |
| /* There are currently at least 2 keys on this node. So just create |
| ** a new copy of the node with one of the keys removed. If the node |
| ** happens to be the root node of the tree, allocate an entire |
| ** TreeNode structure instead of just a TreeLeaf. */ |
| TreeNode *pNew; |
| u32 iNew; |
| |
| if( bLeaf ){ |
| pNew = (TreeNode *)newTreeLeaf(db, &iNew, &rc); |
| }else{ |
| pNew = newTreeNode(db, &iNew, &rc); |
| } |
| if( pNew ){ |
| int i; |
| int iOut = 1; |
| for(i=0; i<4; i++){ |
| if( i==iSlot ){ |
| i++; |
| if( bLeaf==0 ) pNew->aiChildPtr[iOut] = iNewptr; |
| if( i<3 ) pNew->aiKeyPtr[iOut] = pNode->aiKeyPtr[i]; |
| iOut++; |
| }else if( bLeaf || p->nHeight==1 ){ |
| if( i<3 && pNode->aiKeyPtr[i] ){ |
| pNew->aiKeyPtr[iOut++] = pNode->aiKeyPtr[i]; |
| } |
| }else{ |
| if( getChildPtr(pNode, WORKING_VERSION, i) ){ |
| pNew->aiChildPtr[iOut] = getChildPtr(pNode, WORKING_VERSION, i); |
| if( i<3 ) pNew->aiKeyPtr[iOut] = pNode->aiKeyPtr[i]; |
| iOut++; |
| } |
| } |
| } |
| assert( iOut<=4 ); |
| assert( bLeaf || pNew->aiChildPtr[0]==0 ); |
| pCsr->iNode--; |
| rc = treeUpdatePtr(db, pCsr, iNew); |
| } |
| |
| }else if( pCsr->iNode==0 ){ |
| /* Removing the only key in the root node. iNewptr is the new root. */ |
| assert( iSlot==1 ); |
| db->treehdr.root.iRoot = iNewptr; |
| db->treehdr.root.nHeight--; |
| |
| }else{ |
| /* There is only one key on this node and the node is not the root |
| ** node. Find a peer for this node. Then redistribute the contents of |
| ** the peer and the parent cell between the parent and either one or |
| ** two new nodes. */ |
| TreeNode *pParent; /* Parent tree node */ |
| int iPSlot; |
| u32 iPeer; /* Pointer to peer leaf node */ |
| int iDir; |
| TreeNode *pPeer; /* The peer leaf node */ |
| TreeNode *pNew1; u32 iNew1; /* First new leaf node */ |
| |
| assert( iSlot==1 ); |
| |
| pParent = pCsr->apTreeNode[pCsr->iNode-1]; |
| iPSlot = pCsr->aiCell[pCsr->iNode-1]; |
| |
| if( iPSlot>0 && getChildPtr(pParent, WORKING_VERSION, iPSlot-1) ){ |
| iDir = -1; |
| }else{ |
| iDir = +1; |
| } |
| iPeer = getChildPtr(pParent, WORKING_VERSION, iPSlot+iDir); |
| pPeer = (TreeNode *)treeShmptr(db, iPeer); |
| assertIsWorkingChild(db, pNode, pParent, iPSlot); |
| |
| /* Allocate the first new leaf node. This is always required. */ |
| if( bLeaf ){ |
| pNew1 = (TreeNode *)newTreeLeaf(db, &iNew1, &rc); |
| }else{ |
| pNew1 = (TreeNode *)newTreeNode(db, &iNew1, &rc); |
| } |
| |
| if( pPeer->aiKeyPtr[0] && pPeer->aiKeyPtr[2] ){ |
| /* Peer node is completely full. This means that two new leaf nodes |
| ** and a new parent node are required. */ |
| |
| TreeNode *pNew2; u32 iNew2; /* Second new leaf node */ |
| TreeNode *pNewP; u32 iNewP; /* New parent node */ |
| |
| if( bLeaf ){ |
| pNew2 = (TreeNode *)newTreeLeaf(db, &iNew2, &rc); |
| }else{ |
| pNew2 = (TreeNode *)newTreeNode(db, &iNew2, &rc); |
| } |
| pNewP = copyTreeNode(db, pParent, &iNewP, &rc); |
| |
| if( iDir==-1 ){ |
| pNew1->aiKeyPtr[1] = pPeer->aiKeyPtr[0]; |
| if( bLeaf==0 ){ |
| pNew1->aiChildPtr[1] = getChildPtr(pPeer, WORKING_VERSION, 0); |
| pNew1->aiChildPtr[2] = getChildPtr(pPeer, WORKING_VERSION, 1); |
| } |
| |
| pNewP->aiChildPtr[iPSlot-1] = iNew1; |
| pNewP->aiKeyPtr[iPSlot-1] = pPeer->aiKeyPtr[1]; |
| pNewP->aiChildPtr[iPSlot] = iNew2; |
| |
| pNew2->aiKeyPtr[0] = pPeer->aiKeyPtr[2]; |
| pNew2->aiKeyPtr[1] = pParent->aiKeyPtr[iPSlot-1]; |
| if( bLeaf==0 ){ |
| pNew2->aiChildPtr[0] = getChildPtr(pPeer, WORKING_VERSION, 2); |
| pNew2->aiChildPtr[1] = getChildPtr(pPeer, WORKING_VERSION, 3); |
| pNew2->aiChildPtr[2] = iNewptr; |
| } |
| }else{ |
| pNew1->aiKeyPtr[1] = pParent->aiKeyPtr[iPSlot]; |
| if( bLeaf==0 ){ |
| pNew1->aiChildPtr[1] = iNewptr; |
| pNew1->aiChildPtr[2] = getChildPtr(pPeer, WORKING_VERSION, 0); |
| } |
| |
| pNewP->aiChildPtr[iPSlot] = iNew1; |
| pNewP->aiKeyPtr[iPSlot] = pPeer->aiKeyPtr[0]; |
| pNewP->aiChildPtr[iPSlot+1] = iNew2; |
| |
| pNew2->aiKeyPtr[0] = pPeer->aiKeyPtr[1]; |
| pNew2->aiKeyPtr[1] = pPeer->aiKeyPtr[2]; |
| if( bLeaf==0 ){ |
| pNew2->aiChildPtr[0] = getChildPtr(pPeer, WORKING_VERSION, 1); |
| pNew2->aiChildPtr[1] = getChildPtr(pPeer, WORKING_VERSION, 2); |
| pNew2->aiChildPtr[2] = getChildPtr(pPeer, WORKING_VERSION, 3); |
| } |
| } |
| assert( pCsr->iNode>=1 ); |
| pCsr->iNode -= 2; |
| if( rc==LSM_OK ){ |
| assert( pNew1->aiKeyPtr[1] && pNew2->aiKeyPtr[1] ); |
| rc = treeUpdatePtr(db, pCsr, iNewP); |
| } |
| }else{ |
| int iKOut = 0; |
| int iPOut = 0; |
| int i; |
| |
| pCsr->iNode--; |
| |
| if( iDir==1 ){ |
| pNew1->aiKeyPtr[iKOut++] = pParent->aiKeyPtr[iPSlot]; |
| if( bLeaf==0 ) pNew1->aiChildPtr[iPOut++] = iNewptr; |
| } |
| for(i=0; i<3; i++){ |
| if( pPeer->aiKeyPtr[i] ){ |
| pNew1->aiKeyPtr[iKOut++] = pPeer->aiKeyPtr[i]; |
| } |
| } |
| if( bLeaf==0 ){ |
| for(i=0; i<4; i++){ |
| if( getChildPtr(pPeer, WORKING_VERSION, i) ){ |
| pNew1->aiChildPtr[iPOut++] = getChildPtr(pPeer, WORKING_VERSION, i); |
| } |
| } |
| } |
| if( iDir==-1 ){ |
| iPSlot--; |
| pNew1->aiKeyPtr[iKOut++] = pParent->aiKeyPtr[iPSlot]; |
| if( bLeaf==0 ) pNew1->aiChildPtr[iPOut++] = iNewptr; |
| pCsr->aiCell[pCsr->iNode] = (u8)iPSlot; |
| } |
| |
| rc = treeDeleteEntry(db, pCsr, iNew1); |
| } |
| } |
| |
| return rc; |
| } |
| |
| /* |
| ** Delete a range of keys from the tree structure (i.e. the lsm_delete_range() |
| ** function, not lsm_delete()). |
| ** |
| ** This is a two step process: |
| ** |
| ** 1) Remove all entries currently stored in the tree that have keys |
| ** that fall into the deleted range. |
| ** |
| ** TODO: There are surely good ways to optimize this step - removing |
| ** a range of keys from a b-tree. But for now, this function removes |
| ** them one at a time using the usual approach. |
| ** |
| ** 2) Unless the largest key smaller than or equal to (pKey1/nKey1) is |
| ** already marked as START_DELETE, insert a START_DELETE key. |
| ** Similarly, unless the smallest key greater than or equal to |
| ** (pKey2/nKey2) is already START_END, insert a START_END key. |
| */ |
| int lsmTreeDelete( |
| lsm_db *db, |
| void *pKey1, int nKey1, /* Start of range */ |
| void *pKey2, int nKey2 /* End of range */ |
| ){ |
| int rc = LSM_OK; |
| int bDone = 0; |
| TreeRoot *p = &db->treehdr.root; |
| TreeBlob blob = {0, 0}; |
| |
| /* The range must be sensible - that (key1 < key2). */ |
| assert( treeKeycmp(pKey1, nKey1, pKey2, nKey2)<0 ); |
| assert( assert_delete_ranges_match(db) ); |
| |
| #if 0 |
| static int nCall = 0; |
| printf("\n"); |
| nCall++; |
| printf("%d delete %s .. %s\n", nCall, (char *)pKey1, (char *)pKey2); |
| dump_tree_contents(db, "before delete"); |
| #endif |
| |
| /* Step 1. This loop runs until the tree contains no keys within the |
| ** range being deleted. Or until an error occurs. */ |
| while( bDone==0 && rc==LSM_OK ){ |
| int res; |
| TreeCursor csr; /* Cursor to seek to first key in range */ |
| void *pDel; int nDel; /* Key to (possibly) delete this iteration */ |
| #ifndef NDEBUG |
| int nEntry = treeCountEntries(db); |
| #endif |
| |
| /* Seek the cursor to the first entry in the tree greater than pKey1. */ |
| treeCursorInit(db, 0, &csr); |
| lsmTreeCursorSeek(&csr, pKey1, nKey1, &res); |
| if( res<=0 && lsmTreeCursorValid(&csr) ) lsmTreeCursorNext(&csr); |
| |
| /* If there is no such entry, or if it is greater than pKey2, then the |
| ** tree now contains no keys in the range being deleted. In this case |
| ** break out of the loop. */ |
| bDone = 1; |
| if( lsmTreeCursorValid(&csr) ){ |
| lsmTreeCursorKey(&csr, 0, &pDel, &nDel); |
| if( treeKeycmp(pDel, nDel, pKey2, nKey2)<0 ) bDone = 0; |
| } |
| |
| if( bDone==0 ){ |
| if( (u32)csr.iNode==(p->nHeight-1) ){ |
| /* The element to delete already lies on a leaf node */ |
| rc = treeDeleteEntry(db, &csr, 0); |
| }else{ |
| /* 1. Overwrite the current key with a copy of the next key in the |
| ** tree (key N). |
| ** |
| ** 2. Seek to key N (cursor will stop at the internal node copy of |
| ** N). Move to the next key (original copy of N). Delete |
| ** this entry. |
| */ |
| u32 iKey; |
| TreeKey *pKey; |
| int iNode = csr.iNode; |
| lsmTreeCursorNext(&csr); |
| assert( (u32)csr.iNode==(p->nHeight-1) ); |
| |
| iKey = csr.apTreeNode[csr.iNode]->aiKeyPtr[csr.aiCell[csr.iNode]]; |
| lsmTreeCursorPrev(&csr); |
| |
| treeOverwriteKey(db, &csr, iKey, &rc); |
| pKey = treeShmkey(db, iKey, TKV_LOADKEY, &blob, &rc); |
| if( pKey ){ |
| rc = lsmTreeCursorSeek(&csr, TKV_KEY(pKey), pKey->nKey, &res); |
| } |
| if( rc==LSM_OK ){ |
| assert( res==0 && csr.iNode==iNode ); |
| rc = lsmTreeCursorNext(&csr); |
| if( rc==LSM_OK ){ |
| rc = treeDeleteEntry(db, &csr, 0); |
| } |
| } |
| } |
| } |
| |
| /* Clean up any memory allocated by the cursor. */ |
| tblobFree(db, &csr.blob); |
| #if 0 |
| dump_tree_contents(db, "ddd delete"); |
| #endif |
| assert( bDone || treeCountEntries(db)==(nEntry-1) ); |
| } |
| |
| #if 0 |
| dump_tree_contents(db, "during delete"); |
| #endif |
| |
| /* Now insert the START_DELETE and END_DELETE keys. */ |
| if( rc==LSM_OK ){ |
| rc = treeInsertEntry(db, LSM_START_DELETE, pKey1, nKey1, 0, -1); |
| } |
| #if 0 |
| dump_tree_contents(db, "during delete 2"); |
| #endif |
| if( rc==LSM_OK ){ |
| rc = treeInsertEntry(db, LSM_END_DELETE, pKey2, nKey2, 0, -1); |
| } |
| |
| #if 0 |
| dump_tree_contents(db, "after delete"); |
| #endif |
| |
| tblobFree(db, &blob); |
| assert( assert_delete_ranges_match(db) ); |
| return rc; |
| } |
| |
| /* |
| ** Return, in bytes, the amount of memory currently used by the tree |
| ** structure. |
| */ |
| int lsmTreeSize(lsm_db *pDb){ |
| return pDb->treehdr.root.nByte; |
| } |
| |
| /* |
| ** Open a cursor on the in-memory tree pTree. |
| */ |
| int lsmTreeCursorNew(lsm_db *pDb, int bOld, TreeCursor **ppCsr){ |
| TreeCursor *pCsr; |
| *ppCsr = pCsr = lsmMalloc(pDb->pEnv, sizeof(TreeCursor)); |
| if( pCsr ){ |
| treeCursorInit(pDb, bOld, pCsr); |
| return LSM_OK; |
| } |
| return LSM_NOMEM_BKPT; |
| } |
| |
| /* |
| ** Close an in-memory tree cursor. |
| */ |
| void lsmTreeCursorDestroy(TreeCursor *pCsr){ |
| if( pCsr ){ |
| tblobFree(pCsr->pDb, &pCsr->blob); |
| lsmFree(pCsr->pDb->pEnv, pCsr); |
| } |
| } |
| |
| void lsmTreeCursorReset(TreeCursor *pCsr){ |
| if( pCsr ){ |
| pCsr->iNode = -1; |
| pCsr->pSave = 0; |
| } |
| } |
| |
| #ifndef NDEBUG |
| static int treeCsrCompare(TreeCursor *pCsr, void *pKey, int nKey, int *pRc){ |
| TreeKey *p; |
| int cmp = 0; |
| assert( pCsr->iNode>=0 ); |
| p = csrGetKey(pCsr, &pCsr->blob, pRc); |
| if( p ){ |
| cmp = treeKeycmp(TKV_KEY(p), p->nKey, pKey, nKey); |
| } |
| return cmp; |
| } |
| #endif |
| |
| |
| /* |
| ** Attempt to seek the cursor passed as the first argument to key (pKey/nKey) |
| ** in the tree structure. If an exact match for the key is found, leave the |
| ** cursor pointing to it and set *pRes to zero before returning. If an |
| ** exact match cannot be found, do one of the following: |
| ** |
| ** * Leave the cursor pointing to the smallest element in the tree that |
| ** is larger than the key and set *pRes to +1, or |
| ** |
| ** * Leave the cursor pointing to the largest element in the tree that |
| ** is smaller than the key and set *pRes to -1, or |
| ** |
| ** * If the tree is empty, leave the cursor at EOF and set *pRes to -1. |
| */ |
| int lsmTreeCursorSeek(TreeCursor *pCsr, void *pKey, int nKey, int *pRes){ |
| int rc = LSM_OK; /* Return code */ |
| lsm_db *pDb = pCsr->pDb; |
| TreeRoot *pRoot = pCsr->pRoot; |
| u32 iNodePtr; /* Location of current node in search */ |
| |
| /* Discard any saved position data */ |
| treeCursorRestore(pCsr, 0); |
| |
| iNodePtr = pRoot->iRoot; |
| if( iNodePtr==0 ){ |
| /* Either an error occurred or the tree is completely empty. */ |
| assert( rc!=LSM_OK || pRoot->iRoot==0 ); |
| *pRes = -1; |
| pCsr->iNode = -1; |
| }else{ |
| TreeBlob b = {0, 0}; |
| int res = 0; /* Result of comparison function */ |
| int iNode = -1; |
| while( iNodePtr ){ |
| TreeNode *pNode; /* Node at location iNodePtr */ |
| int iTest; /* Index of second key to test (0 or 2) */ |
| u32 iTreeKey; |
| TreeKey *pTreeKey; /* Key to compare against */ |
| |
| pNode = (TreeNode *)treeShmptrUnsafe(pDb, iNodePtr); |
| iNode++; |
| pCsr->apTreeNode[iNode] = pNode; |
| |
| /* Compare (pKey/nKey) with the key in the middle slot of B-tree node |
| ** pNode. The middle slot is never empty. If the comparison is a match, |
| ** then the search is finished. Break out of the loop. */ |
| pTreeKey = (TreeKey*)treeShmptrUnsafe(pDb, pNode->aiKeyPtr[1]); |
| if( !(pTreeKey->flags & LSM_CONTIGUOUS) ){ |
| pTreeKey = treeShmkey(pDb, pNode->aiKeyPtr[1], TKV_LOADKEY, &b, &rc); |
| if( rc!=LSM_OK ) break; |
| } |
| res = treeKeycmp((void *)&pTreeKey[1], pTreeKey->nKey, pKey, nKey); |
| if( res==0 ){ |
| pCsr->aiCell[iNode] = 1; |
| break; |
| } |
| |
| /* Based on the results of the previous comparison, compare (pKey/nKey) |
| ** to either the left or right key of the B-tree node, if such a key |
| ** exists. */ |
| iTest = (res>0 ? 0 : 2); |
| iTreeKey = pNode->aiKeyPtr[iTest]; |
| if( iTreeKey ){ |
| pTreeKey = (TreeKey*)treeShmptrUnsafe(pDb, iTreeKey); |
| if( !(pTreeKey->flags & LSM_CONTIGUOUS) ){ |
| pTreeKey = treeShmkey(pDb, iTreeKey, TKV_LOADKEY, &b, &rc); |
| if( rc ) break; |
| } |
| res = treeKeycmp((void *)&pTreeKey[1], pTreeKey->nKey, pKey, nKey); |
| if( res==0 ){ |
| pCsr->aiCell[iNode] = (u8)iTest; |
| break; |
| } |
| }else{ |
| iTest = 1; |
| } |
| |
| if( (u32)iNode<(pRoot->nHeight-1) ){ |
| iNodePtr = getChildPtr(pNode, pRoot->iTransId, iTest + (res<0)); |
| }else{ |
| iNodePtr = 0; |
| } |
| pCsr->aiCell[iNode] = (u8)(iTest + (iNodePtr && (res<0))); |
| } |
| |
| *pRes = res; |
| pCsr->iNode = iNode; |
| tblobFree(pDb, &b); |
| } |
| |
| /* assert() that *pRes has been set properly */ |
| #ifndef NDEBUG |
| if( rc==LSM_OK && lsmTreeCursorValid(pCsr) ){ |
| int cmp = treeCsrCompare(pCsr, pKey, nKey, &rc); |
| assert( rc!=LSM_OK || *pRes==cmp || (*pRes ^ cmp)>0 ); |
| } |
| #endif |
| |
| return rc; |
| } |
| |
| int lsmTreeCursorNext(TreeCursor *pCsr){ |
| #ifndef NDEBUG |
| TreeKey *pK1; |
| TreeBlob key1 = {0, 0}; |
| #endif |
| lsm_db *pDb = pCsr->pDb; |
| TreeRoot *pRoot = pCsr->pRoot; |
| const int iLeaf = pRoot->nHeight-1; |
| int iCell; |
| int rc = LSM_OK; |
| TreeNode *pNode; |
| |
| /* Restore the cursor position, if required */ |
| int iRestore = 0; |
| treeCursorRestore(pCsr, &iRestore); |
| if( iRestore>0 ) return LSM_OK; |
| |
| /* Save a pointer to the current key. This is used in an assert() at the |
| ** end of this function - to check that the 'next' key really is larger |
| ** than the current key. */ |
| #ifndef NDEBUG |
| pK1 = csrGetKey(pCsr, &key1, &rc); |
| if( rc!=LSM_OK ) return rc; |
| #endif |
| |
| assert( lsmTreeCursorValid(pCsr) ); |
| assert( pCsr->aiCell[pCsr->iNode]<3 ); |
| |
| pNode = pCsr->apTreeNode[pCsr->iNode]; |
| iCell = ++pCsr->aiCell[pCsr->iNode]; |
| |
| /* If the current node is not a leaf, and the current cell has sub-tree |
| ** associated with it, descend to the left-most key on the left-most |
| ** leaf of the sub-tree. */ |
| if( pCsr->iNode<iLeaf && getChildPtr(pNode, pRoot->iTransId, iCell) ){ |
| do { |
| u32 iNodePtr; |
| pCsr->iNode++; |
| iNodePtr = getChildPtr(pNode, pRoot->iTransId, iCell); |
| pNode = (TreeNode *)treeShmptr(pDb, iNodePtr); |
| pCsr->apTreeNode[pCsr->iNode] = pNode; |
| iCell = pCsr->aiCell[pCsr->iNode] = (pNode->aiKeyPtr[0]==0); |
| }while( pCsr->iNode < iLeaf ); |
| } |
| |
| /* Otherwise, the next key is found by following pointer up the tree |
| ** until there is a key immediately to the right of the pointer followed |
| ** to reach the sub-tree containing the current key. */ |
| else if( iCell>=3 || pNode->aiKeyPtr[iCell]==0 ){ |
| while( (--pCsr->iNode)>=0 ){ |
| iCell = pCsr->aiCell[pCsr->iNode]; |
| if( iCell<3 && pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[iCell] ) break; |
| } |
| } |
| |
| #ifndef NDEBUG |
| if( pCsr->iNode>=0 ){ |
| TreeKey *pK2 = csrGetKey(pCsr, &pCsr->blob, &rc); |
| assert( rc||treeKeycmp(TKV_KEY(pK2),pK2->nKey,TKV_KEY(pK1),pK1->nKey)>=0 ); |
| } |
| tblobFree(pDb, &key1); |
| #endif |
| |
| return rc; |
| } |
| |
| int lsmTreeCursorPrev(TreeCursor *pCsr){ |
| #ifndef NDEBUG |
| TreeKey *pK1; |
| TreeBlob key1 = {0, 0}; |
| #endif |
| lsm_db *pDb = pCsr->pDb; |
| TreeRoot *pRoot = pCsr->pRoot; |
| const int iLeaf = pRoot->nHeight-1; |
| int iCell; |
| int rc = LSM_OK; |
| TreeNode *pNode; |
| |
| /* Restore the cursor position, if required */ |
| int iRestore = 0; |
| treeCursorRestore(pCsr, &iRestore); |
| if( iRestore<0 ) return LSM_OK; |
| |
| /* Save a pointer to the current key. This is used in an assert() at the |
| ** end of this function - to check that the 'next' key really is smaller |
| ** than the current key. */ |
| #ifndef NDEBUG |
| pK1 = csrGetKey(pCsr, &key1, &rc); |
| if( rc!=LSM_OK ) return rc; |
| #endif |
| |
| assert( lsmTreeCursorValid(pCsr) ); |
| pNode = pCsr->apTreeNode[pCsr->iNode]; |
| iCell = pCsr->aiCell[pCsr->iNode]; |
| assert( iCell>=0 && iCell<3 ); |
| |
| /* If the current node is not a leaf, and the current cell has sub-tree |
| ** associated with it, descend to the right-most key on the right-most |
| ** leaf of the sub-tree. */ |
| if( pCsr->iNode<iLeaf && getChildPtr(pNode, pRoot->iTransId, iCell) ){ |
| do { |
| u32 iNodePtr; |
| pCsr->iNode++; |
| iNodePtr = getChildPtr(pNode, pRoot->iTransId, iCell); |
| pNode = (TreeNode *)treeShmptr(pDb, iNodePtr); |
| if( rc!=LSM_OK ) break; |
| pCsr->apTreeNode[pCsr->iNode] = pNode; |
| iCell = 1 + (pNode->aiKeyPtr[2]!=0) + (pCsr->iNode < iLeaf); |
| pCsr->aiCell[pCsr->iNode] = (u8)iCell; |
| }while( pCsr->iNode < iLeaf ); |
| } |
| |
| /* Otherwise, the next key is found by following pointer up the tree until |
| ** there is a key immediately to the left of the pointer followed to reach |
| ** the sub-tree containing the current key. */ |
| else{ |
| do { |
| iCell = pCsr->aiCell[pCsr->iNode]-1; |
| if( iCell>=0 && pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[iCell] ) break; |
| }while( (--pCsr->iNode)>=0 ); |
| pCsr->aiCell[pCsr->iNode] = (u8)iCell; |
| } |
| |
| #ifndef NDEBUG |
| if( pCsr->iNode>=0 ){ |
| TreeKey *pK2 = csrGetKey(pCsr, &pCsr->blob, &rc); |
| assert( rc || treeKeycmp(TKV_KEY(pK2),pK2->nKey,TKV_KEY(pK1),pK1->nKey)<0 ); |
| } |
| tblobFree(pDb, &key1); |
| #endif |
| |
| return rc; |
| } |
| |
| /* |
| ** Move the cursor to the first (bLast==0) or last (bLast!=0) entry in the |
| ** in-memory tree. |
| */ |
| int lsmTreeCursorEnd(TreeCursor *pCsr, int bLast){ |
| lsm_db *pDb = pCsr->pDb; |
| TreeRoot *pRoot = pCsr->pRoot; |
| int rc = LSM_OK; |
| |
| u32 iNodePtr; |
| pCsr->iNode = -1; |
| |
| /* Discard any saved position data */ |
| treeCursorRestore(pCsr, 0); |
| |
| iNodePtr = pRoot->iRoot; |
| while( iNodePtr ){ |
| int iCell; |
| TreeNode *pNode; |
| |
| pNode = (TreeNode *)treeShmptr(pDb, iNodePtr); |
| if( rc ) break; |
| |
| if( bLast ){ |
| iCell = ((pNode->aiKeyPtr[2]==0) ? 2 : 3); |
| }else{ |
| iCell = ((pNode->aiKeyPtr[0]==0) ? 1 : 0); |
| } |
| pCsr->iNode++; |
| pCsr->apTreeNode[pCsr->iNode] = pNode; |
| |
| if( (u32)pCsr->iNode<pRoot->nHeight-1 ){ |
| iNodePtr = getChildPtr(pNode, pRoot->iTransId, iCell); |
| }else{ |
| iNodePtr = 0; |
| } |
| pCsr->aiCell[pCsr->iNode] = (u8)(iCell - (iNodePtr==0 && bLast)); |
| } |
| |
| return rc; |
| } |
| |
| int lsmTreeCursorFlags(TreeCursor *pCsr){ |
| int flags = 0; |
| if( pCsr && pCsr->iNode>=0 ){ |
| int rc = LSM_OK; |
| TreeKey *pKey = (TreeKey *)treeShmptrUnsafe(pCsr->pDb, |
| pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[pCsr->aiCell[pCsr->iNode]] |
| ); |
| assert( rc==LSM_OK ); |
| flags = (pKey->flags & ~LSM_CONTIGUOUS); |
| } |
| return flags; |
| } |
| |
| int lsmTreeCursorKey(TreeCursor *pCsr, int *pFlags, void **ppKey, int *pnKey){ |
| TreeKey *pTreeKey; |
| int rc = LSM_OK; |
| |
| assert( lsmTreeCursorValid(pCsr) ); |
| |
| pTreeKey = pCsr->pSave; |
| if( !pTreeKey ){ |
| pTreeKey = csrGetKey(pCsr, &pCsr->blob, &rc); |
| } |
| if( rc==LSM_OK ){ |
| *pnKey = pTreeKey->nKey; |
| if( pFlags ) *pFlags = pTreeKey->flags; |
| *ppKey = (void *)&pTreeKey[1]; |
| } |
| |
| return rc; |
| } |
| |
| int lsmTreeCursorValue(TreeCursor *pCsr, void **ppVal, int *pnVal){ |
| int res = 0; |
| int rc; |
| |
| rc = treeCursorRestore(pCsr, &res); |
| if( res==0 ){ |
| TreeKey *pTreeKey = csrGetKey(pCsr, &pCsr->blob, &rc); |
| if( rc==LSM_OK ){ |
| if( pTreeKey->flags & LSM_INSERT ){ |
| *pnVal = pTreeKey->nValue; |
| *ppVal = TKV_VAL(pTreeKey); |
| }else{ |
| *ppVal = 0; |
| *pnVal = -1; |
| } |
| } |
| }else{ |
| *ppVal = 0; |
| *pnVal = 0; |
| } |
| |
| return rc; |
| } |
| |
| /* |
| ** Return true if the cursor currently points to a valid entry. |
| */ |
| int lsmTreeCursorValid(TreeCursor *pCsr){ |
| return (pCsr && (pCsr->pSave || pCsr->iNode>=0)); |
| } |
| |
| /* |
| ** Store a mark in *pMark. Later on, a call to lsmTreeRollback() with a |
| ** pointer to the same TreeMark structure may be used to roll the tree |
| ** contents back to their current state. |
| */ |
| void lsmTreeMark(lsm_db *pDb, TreeMark *pMark){ |
| pMark->iRoot = pDb->treehdr.root.iRoot; |
| pMark->nHeight = pDb->treehdr.root.nHeight; |
| pMark->iWrite = pDb->treehdr.iWrite; |
| pMark->nChunk = pDb->treehdr.nChunk; |
| pMark->iNextShmid = pDb->treehdr.iNextShmid; |
| pMark->iRollback = intArraySize(&pDb->rollback); |
| } |
| |
| /* |
| ** Roll back to mark pMark. Structure *pMark should have been previously |
| ** populated by a call to lsmTreeMark(). |
| */ |
| void lsmTreeRollback(lsm_db *pDb, TreeMark *pMark){ |
| int iIdx; |
| int nIdx; |
| u32 iNext; |
| ShmChunk *pChunk; |
| u32 iChunk; |
| u32 iShmid; |
| |
| /* Revert all required v2 pointers. */ |
| nIdx = intArraySize(&pDb->rollback); |
| for(iIdx = pMark->iRollback; iIdx<nIdx; iIdx++){ |
| TreeNode *pNode; |
| pNode = treeShmptr(pDb, intArrayEntry(&pDb->rollback, iIdx)); |
| assert( pNode ); |
| pNode->iV2 = 0; |
| pNode->iV2Child = 0; |
| pNode->iV2Ptr = 0; |
| } |
| intArrayTruncate(&pDb->rollback, pMark->iRollback); |
| |
| /* Restore the free-chunk list. */ |
| assert( pMark->iWrite!=0 ); |
| iChunk = treeOffsetToChunk(pMark->iWrite-1); |
| pChunk = treeShmChunk(pDb, iChunk); |
| iNext = pChunk->iNext; |
| pChunk->iNext = 0; |
| |
| pChunk = treeShmChunk(pDb, pDb->treehdr.iFirst); |
| iShmid = pChunk->iShmid-1; |
| |
| while( iNext ){ |
| u32 iFree = iNext; /* Current chunk being rollback-freed */ |
| ShmChunk *pFree; /* Pointer to chunk iFree */ |
| |
| pFree = treeShmChunk(pDb, iFree); |
| iNext = pFree->iNext; |
| |
| if( iFree<pMark->nChunk ){ |
| pFree->iNext = pDb->treehdr.iFirst; |
| pFree->iShmid = iShmid--; |
| pDb->treehdr.iFirst = iFree; |
| } |
| } |
| |
| /* Restore the tree-header fields */ |
| pDb->treehdr.root.iRoot = pMark->iRoot; |
| pDb->treehdr.root.nHeight = pMark->nHeight; |
| pDb->treehdr.iWrite = pMark->iWrite; |
| pDb->treehdr.nChunk = pMark->nChunk; |
| pDb->treehdr.iNextShmid = pMark->iNextShmid; |
| } |
| |
| /* |
| ** Load the in-memory tree header from shared-memory into pDb->treehdr. |
| ** If the header cannot be loaded, return LSM_PROTOCOL. |
| ** |
| ** If the header is successfully loaded and parameter piRead is not NULL, |
| ** is is set to 1 if the header was loaded from ShmHeader.hdr1, or 2 if |
| ** the header was loaded from ShmHeader.hdr2. |
| */ |
| int lsmTreeLoadHeader(lsm_db *pDb, int *piRead){ |
| int nRem = LSM_ATTEMPTS_BEFORE_PROTOCOL; |
| while( (nRem--)>0 ){ |
| ShmHeader *pShm = pDb->pShmhdr; |
| |
| memcpy(&pDb->treehdr, &pShm->hdr1, sizeof(TreeHeader)); |
| if( treeHeaderChecksumOk(&pDb->treehdr) ){ |
| if( piRead ) *piRead = 1; |
| return LSM_OK; |
| } |
| memcpy(&pDb->treehdr, &pShm->hdr2, sizeof(TreeHeader)); |
| if( treeHeaderChecksumOk(&pDb->treehdr) ){ |
| if( piRead ) *piRead = 2; |
| return LSM_OK; |
| } |
| |
| lsmShmBarrier(pDb); |
| } |
| return LSM_PROTOCOL_BKPT; |
| } |
| |
| int lsmTreeLoadHeaderOk(lsm_db *pDb, int iRead){ |
| TreeHeader *p = (iRead==1) ? &pDb->pShmhdr->hdr1 : &pDb->pShmhdr->hdr2; |
| assert( iRead==1 || iRead==2 ); |
| return (0==memcmp(pDb->treehdr.aCksum, p->aCksum, sizeof(u32)*2)); |
| } |
| |
| /* |
| ** This function is called to conclude a transaction. If argument bCommit |
| ** is true, the transaction is committed. Otherwise it is rolled back. |
| */ |
| int lsmTreeEndTransaction(lsm_db *pDb, int bCommit){ |
| ShmHeader *pShm = pDb->pShmhdr; |
| |
| treeHeaderChecksum(&pDb->treehdr, pDb->treehdr.aCksum); |
| memcpy(&pShm->hdr2, &pDb->treehdr, sizeof(TreeHeader)); |
| lsmShmBarrier(pDb); |
| memcpy(&pShm->hdr1, &pDb->treehdr, sizeof(TreeHeader)); |
| pShm->bWriter = 0; |
| intArrayFree(pDb->pEnv, &pDb->rollback); |
| |
| return LSM_OK; |
| } |
| |
| #ifndef NDEBUG |
| static int assert_delete_ranges_match(lsm_db *db){ |
| int prev = 0; |
| TreeBlob blob = {0, 0}; |
| TreeCursor csr; /* Cursor used to iterate through tree */ |
| int rc; |
| |
| treeCursorInit(db, 0, &csr); |
| for( rc = lsmTreeCursorEnd(&csr, 0); |
| rc==LSM_OK && lsmTreeCursorValid(&csr); |
| rc = lsmTreeCursorNext(&csr) |
| ){ |
| TreeKey *pKey = csrGetKey(&csr, &blob, &rc); |
| if( rc!=LSM_OK ) break; |
| assert( ((prev&LSM_START_DELETE)==0)==((pKey->flags&LSM_END_DELETE)==0) ); |
| prev = pKey->flags; |
| } |
| |
| tblobFree(csr.pDb, &csr.blob); |
| tblobFree(csr.pDb, &blob); |
| |
| return 1; |
| } |
| |
| static int treeCountEntries(lsm_db *db){ |
| TreeCursor csr; /* Cursor used to iterate through tree */ |
| int rc; |
| int nEntry = 0; |
| |
| treeCursorInit(db, 0, &csr); |
| for( rc = lsmTreeCursorEnd(&csr, 0); |
| rc==LSM_OK && lsmTreeCursorValid(&csr); |
| rc = lsmTreeCursorNext(&csr) |
| ){ |
| nEntry++; |
| } |
| |
| tblobFree(csr.pDb, &csr.blob); |
| |
| return nEntry; |
| } |
| #endif |