| package bolt |
| |
| import ( |
| "fmt" |
| "hash/fnv" |
| "os" |
| "runtime" |
| "runtime/debug" |
| "strings" |
| "sync" |
| "time" |
| "unsafe" |
| ) |
| |
| // The smallest size that the mmap can be. |
| const minMmapSize = 1 << 22 // 4MB |
| |
| // The largest step that can be taken when remapping the mmap. |
| const maxMmapStep = 1 << 30 // 1GB |
| |
| // The data file format version. |
| const version = 2 |
| |
| // Represents a marker value to indicate that a file is a Bolt DB. |
| const magic uint32 = 0xED0CDAED |
| |
| // IgnoreNoSync specifies whether the NoSync field of a DB is ignored when |
| // syncing changes to a file. This is required as some operating systems, |
| // such as OpenBSD, do not have a unified buffer cache (UBC) and writes |
| // must be synchronzied using the msync(2) syscall. |
| const IgnoreNoSync = runtime.GOOS == "openbsd" |
| |
| // DB represents a collection of buckets persisted to a file on disk. |
| // All data access is performed through transactions which can be obtained through the DB. |
| // All the functions on DB will return a ErrDatabaseNotOpen if accessed before Open() is called. |
| type DB struct { |
| // When enabled, the database will perform a Check() after every commit. |
| // A panic is issued if the database is in an inconsistent state. This |
| // flag has a large performance impact so it should only be used for |
| // debugging purposes. |
| StrictMode bool |
| |
| // Setting the NoSync flag will cause the database to skip fsync() |
| // calls after each commit. This can be useful when bulk loading data |
| // into a database and you can restart the bulk load in the event of |
| // a system failure or database corruption. Do not set this flag for |
| // normal use. |
| // |
| // If the package global IgnoreNoSync constant is true, this value is |
| // ignored. See the comment on that constant for more details. |
| // |
| // THIS IS UNSAFE. PLEASE USE WITH CAUTION. |
| NoSync bool |
| |
| path string |
| file *os.File |
| dataref []byte |
| data *[maxMapSize]byte |
| datasz int |
| meta0 *meta |
| meta1 *meta |
| pageSize int |
| opened bool |
| rwtx *Tx |
| txs []*Tx |
| freelist *freelist |
| stats Stats |
| |
| rwlock sync.Mutex // Allows only one writer at a time. |
| metalock sync.Mutex // Protects meta page access. |
| mmaplock sync.RWMutex // Protects mmap access during remapping. |
| statlock sync.RWMutex // Protects stats access. |
| |
| ops struct { |
| writeAt func(b []byte, off int64) (n int, err error) |
| } |
| } |
| |
| // Path returns the path to currently open database file. |
| func (db *DB) Path() string { |
| return db.path |
| } |
| |
| // GoString returns the Go string representation of the database. |
| func (db *DB) GoString() string { |
| return fmt.Sprintf("bolt.DB{path:%q}", db.path) |
| } |
| |
| // String returns the string representation of the database. |
| func (db *DB) String() string { |
| return fmt.Sprintf("DB<%q>", db.path) |
| } |
| |
| // Open creates and opens a database at the given path. |
| // If the file does not exist then it will be created automatically. |
| // Passing in nil options will cause Bolt to open the database with the default options. |
| func Open(path string, mode os.FileMode, options *Options) (*DB, error) { |
| var db = &DB{opened: true} |
| |
| // Set default options if no options are provided. |
| if options == nil { |
| options = DefaultOptions |
| } |
| |
| // Open data file and separate sync handler for metadata writes. |
| db.path = path |
| |
| var err error |
| if db.file, err = os.OpenFile(db.path, os.O_RDWR|os.O_CREATE, mode); err != nil { |
| _ = db.close() |
| return nil, err |
| } |
| |
| // Lock file so that other processes using Bolt cannot use the database |
| // at the same time. This would cause corruption since the two processes |
| // would write meta pages and free pages separately. |
| if err := flock(db.file, options.Timeout); err != nil { |
| _ = db.close() |
| return nil, err |
| } |
| |
| // Default values for test hooks |
| db.ops.writeAt = db.file.WriteAt |
| |
| // Initialize the database if it doesn't exist. |
| if info, err := db.file.Stat(); err != nil { |
| return nil, fmt.Errorf("stat error: %s", err) |
| } else if info.Size() == 0 { |
| // Initialize new files with meta pages. |
| if err := db.init(); err != nil { |
| return nil, err |
| } |
| } else { |
| // Read the first meta page to determine the page size. |
| var buf [0x1000]byte |
| if _, err := db.file.ReadAt(buf[:], 0); err == nil { |
| m := db.pageInBuffer(buf[:], 0).meta() |
| if err := m.validate(); err != nil { |
| return nil, fmt.Errorf("meta0 error: %s", err) |
| } |
| db.pageSize = int(m.pageSize) |
| } |
| } |
| |
| // Memory map the data file. |
| if err := db.mmap(0); err != nil { |
| _ = db.close() |
| return nil, err |
| } |
| |
| // Read in the freelist. |
| db.freelist = newFreelist() |
| db.freelist.read(db.page(db.meta().freelist)) |
| |
| // Mark the database as opened and return. |
| return db, nil |
| } |
| |
| // mmap opens the underlying memory-mapped file and initializes the meta references. |
| // minsz is the minimum size that the new mmap can be. |
| func (db *DB) mmap(minsz int) error { |
| db.mmaplock.Lock() |
| defer db.mmaplock.Unlock() |
| |
| // Dereference all mmap references before unmapping. |
| if db.rwtx != nil { |
| db.rwtx.root.dereference() |
| } |
| |
| // Unmap existing data before continuing. |
| if err := db.munmap(); err != nil { |
| return err |
| } |
| |
| info, err := db.file.Stat() |
| if err != nil { |
| return fmt.Errorf("mmap stat error: %s", err) |
| } else if int(info.Size()) < db.pageSize*2 { |
| return fmt.Errorf("file size too small") |
| } |
| |
| // Ensure the size is at least the minimum size. |
| var size = int(info.Size()) |
| if size < minsz { |
| size = minsz |
| } |
| size = db.mmapSize(size) |
| |
| // Memory-map the data file as a byte slice. |
| if err := mmap(db, size); err != nil { |
| return err |
| } |
| |
| // Save references to the meta pages. |
| db.meta0 = db.page(0).meta() |
| db.meta1 = db.page(1).meta() |
| |
| // Validate the meta pages. |
| if err := db.meta0.validate(); err != nil { |
| return fmt.Errorf("meta0 error: %s", err) |
| } |
| if err := db.meta1.validate(); err != nil { |
| return fmt.Errorf("meta1 error: %s", err) |
| } |
| |
| return nil |
| } |
| |
| // munmap unmaps the data file from memory. |
| func (db *DB) munmap() error { |
| if err := munmap(db); err != nil { |
| return fmt.Errorf("unmap error: " + err.Error()) |
| } |
| return nil |
| } |
| |
| // mmapSize determines the appropriate size for the mmap given the current size |
| // of the database. The minimum size is 4MB and doubles until it reaches 1GB. |
| func (db *DB) mmapSize(size int) int { |
| if size <= minMmapSize { |
| return minMmapSize |
| } else if size < maxMmapStep { |
| size *= 2 |
| } else { |
| size += maxMmapStep |
| } |
| |
| // Ensure that the mmap size is a multiple of the page size. |
| if (size % db.pageSize) != 0 { |
| size = ((size / db.pageSize) + 1) * db.pageSize |
| } |
| |
| return size |
| } |
| |
| // init creates a new database file and initializes its meta pages. |
| func (db *DB) init() error { |
| // Set the page size to the OS page size. |
| db.pageSize = os.Getpagesize() |
| |
| // Create two meta pages on a buffer. |
| buf := make([]byte, db.pageSize*4) |
| for i := 0; i < 2; i++ { |
| p := db.pageInBuffer(buf[:], pgid(i)) |
| p.id = pgid(i) |
| p.flags = metaPageFlag |
| |
| // Initialize the meta page. |
| m := p.meta() |
| m.magic = magic |
| m.version = version |
| m.pageSize = uint32(db.pageSize) |
| m.version = version |
| m.freelist = 2 |
| m.root = bucket{root: 3} |
| m.pgid = 4 |
| m.txid = txid(i) |
| } |
| |
| // Write an empty freelist at page 3. |
| p := db.pageInBuffer(buf[:], pgid(2)) |
| p.id = pgid(2) |
| p.flags = freelistPageFlag |
| p.count = 0 |
| |
| // Write an empty leaf page at page 4. |
| p = db.pageInBuffer(buf[:], pgid(3)) |
| p.id = pgid(3) |
| p.flags = leafPageFlag |
| p.count = 0 |
| |
| // Write the buffer to our data file. |
| if _, err := db.ops.writeAt(buf, 0); err != nil { |
| return err |
| } |
| if err := fdatasync(db); err != nil { |
| return err |
| } |
| |
| return nil |
| } |
| |
| // Close releases all database resources. |
| // All transactions must be closed before closing the database. |
| func (db *DB) Close() error { |
| db.metalock.Lock() |
| defer db.metalock.Unlock() |
| return db.close() |
| } |
| |
| func (db *DB) close() error { |
| db.opened = false |
| |
| db.freelist = nil |
| db.path = "" |
| |
| // Clear ops. |
| db.ops.writeAt = nil |
| |
| // Close the mmap. |
| if err := db.munmap(); err != nil { |
| return err |
| } |
| |
| // Close file handles. |
| if db.file != nil { |
| // Unlock the file. |
| _ = funlock(db.file) |
| |
| // Close the file descriptor. |
| if err := db.file.Close(); err != nil { |
| return fmt.Errorf("db file close: %s", err) |
| } |
| db.file = nil |
| } |
| |
| return nil |
| } |
| |
| // Begin starts a new transaction. |
| // Multiple read-only transactions can be used concurrently but only one |
| // write transaction can be used at a time. Starting multiple write transactions |
| // will cause the calls to block and be serialized until the current write |
| // transaction finishes. |
| // |
| // IMPORTANT: You must close read-only transactions after you are finished or |
| // else the database will not reclaim old pages. |
| func (db *DB) Begin(writable bool) (*Tx, error) { |
| if writable { |
| return db.beginRWTx() |
| } |
| return db.beginTx() |
| } |
| |
| func (db *DB) beginTx() (*Tx, error) { |
| // Lock the meta pages while we initialize the transaction. We obtain |
| // the meta lock before the mmap lock because that's the order that the |
| // write transaction will obtain them. |
| db.metalock.Lock() |
| |
| // Obtain a read-only lock on the mmap. When the mmap is remapped it will |
| // obtain a write lock so all transactions must finish before it can be |
| // remapped. |
| db.mmaplock.RLock() |
| |
| // Exit if the database is not open yet. |
| if !db.opened { |
| db.mmaplock.RUnlock() |
| db.metalock.Unlock() |
| return nil, ErrDatabaseNotOpen |
| } |
| |
| // Create a transaction associated with the database. |
| t := &Tx{} |
| t.init(db) |
| |
| // Keep track of transaction until it closes. |
| db.txs = append(db.txs, t) |
| n := len(db.txs) |
| |
| // Unlock the meta pages. |
| db.metalock.Unlock() |
| |
| // Update the transaction stats. |
| db.statlock.Lock() |
| db.stats.TxN++ |
| db.stats.OpenTxN = n |
| db.statlock.Unlock() |
| |
| return t, nil |
| } |
| |
| func (db *DB) beginRWTx() (*Tx, error) { |
| // Obtain writer lock. This is released by the transaction when it closes. |
| // This enforces only one writer transaction at a time. |
| db.rwlock.Lock() |
| |
| // Once we have the writer lock then we can lock the meta pages so that |
| // we can set up the transaction. |
| db.metalock.Lock() |
| defer db.metalock.Unlock() |
| |
| // Exit if the database is not open yet. |
| if !db.opened { |
| db.rwlock.Unlock() |
| return nil, ErrDatabaseNotOpen |
| } |
| |
| // Create a transaction associated with the database. |
| t := &Tx{writable: true} |
| t.init(db) |
| db.rwtx = t |
| |
| // Free any pages associated with closed read-only transactions. |
| var minid txid = 0xFFFFFFFFFFFFFFFF |
| for _, t := range db.txs { |
| if t.meta.txid < minid { |
| minid = t.meta.txid |
| } |
| } |
| if minid > 0 { |
| db.freelist.release(minid - 1) |
| } |
| |
| return t, nil |
| } |
| |
| // removeTx removes a transaction from the database. |
| func (db *DB) removeTx(tx *Tx) { |
| // Release the read lock on the mmap. |
| db.mmaplock.RUnlock() |
| |
| // Use the meta lock to restrict access to the DB object. |
| db.metalock.Lock() |
| |
| // Remove the transaction. |
| for i, t := range db.txs { |
| if t == tx { |
| db.txs = append(db.txs[:i], db.txs[i+1:]...) |
| break |
| } |
| } |
| n := len(db.txs) |
| |
| // Unlock the meta pages. |
| db.metalock.Unlock() |
| |
| // Merge statistics. |
| db.statlock.Lock() |
| db.stats.OpenTxN = n |
| db.stats.TxStats.add(&tx.stats) |
| db.statlock.Unlock() |
| } |
| |
| // Update executes a function within the context of a read-write managed transaction. |
| // If no error is returned from the function then the transaction is committed. |
| // If an error is returned then the entire transaction is rolled back. |
| // Any error that is returned from the function or returned from the commit is |
| // returned from the Update() method. |
| // |
| // Attempting to manually commit or rollback within the function will cause a panic. |
| func (db *DB) Update(fn func(*Tx) error) error { |
| t, err := db.Begin(true) |
| if err != nil { |
| return err |
| } |
| |
| // Make sure the transaction rolls back in the event of a panic. |
| defer func() { |
| if t.db != nil { |
| t.rollback() |
| } |
| }() |
| |
| // Mark as a managed tx so that the inner function cannot manually commit. |
| t.managed = true |
| |
| // If an error is returned from the function then rollback and return error. |
| err = fn(t) |
| t.managed = false |
| if err != nil { |
| _ = t.Rollback() |
| return err |
| } |
| |
| return t.Commit() |
| } |
| |
| // View executes a function within the context of a managed read-only transaction. |
| // Any error that is returned from the function is returned from the View() method. |
| // |
| // Attempting to manually rollback within the function will cause a panic. |
| func (db *DB) View(fn func(*Tx) error) error { |
| t, err := db.Begin(false) |
| if err != nil { |
| return err |
| } |
| |
| // Make sure the transaction rolls back in the event of a panic. |
| defer func() { |
| if t.db != nil { |
| t.rollback() |
| } |
| }() |
| |
| // Mark as a managed tx so that the inner function cannot manually rollback. |
| t.managed = true |
| |
| // If an error is returned from the function then pass it through. |
| err = fn(t) |
| t.managed = false |
| if err != nil { |
| _ = t.Rollback() |
| return err |
| } |
| |
| if err := t.Rollback(); err != nil { |
| return err |
| } |
| |
| return nil |
| } |
| |
| // Stats retrieves ongoing performance stats for the database. |
| // This is only updated when a transaction closes. |
| func (db *DB) Stats() Stats { |
| db.statlock.RLock() |
| defer db.statlock.RUnlock() |
| return db.stats |
| } |
| |
| // This is for internal access to the raw data bytes from the C cursor, use |
| // carefully, or not at all. |
| func (db *DB) Info() *Info { |
| return &Info{uintptr(unsafe.Pointer(&db.data[0])), db.pageSize} |
| } |
| |
| // page retrieves a page reference from the mmap based on the current page size. |
| func (db *DB) page(id pgid) *page { |
| pos := id * pgid(db.pageSize) |
| return (*page)(unsafe.Pointer(&db.data[pos])) |
| } |
| |
| // pageInBuffer retrieves a page reference from a given byte array based on the current page size. |
| func (db *DB) pageInBuffer(b []byte, id pgid) *page { |
| return (*page)(unsafe.Pointer(&b[id*pgid(db.pageSize)])) |
| } |
| |
| // meta retrieves the current meta page reference. |
| func (db *DB) meta() *meta { |
| if db.meta0.txid > db.meta1.txid { |
| return db.meta0 |
| } |
| return db.meta1 |
| } |
| |
| // allocate returns a contiguous block of memory starting at a given page. |
| func (db *DB) allocate(count int) (*page, error) { |
| // Allocate a temporary buffer for the page. |
| buf := make([]byte, count*db.pageSize) |
| p := (*page)(unsafe.Pointer(&buf[0])) |
| p.overflow = uint32(count - 1) |
| |
| // Use pages from the freelist if they are available. |
| if p.id = db.freelist.allocate(count); p.id != 0 { |
| return p, nil |
| } |
| |
| // Resize mmap() if we're at the end. |
| p.id = db.rwtx.meta.pgid |
| var minsz = int((p.id+pgid(count))+1) * db.pageSize |
| if minsz >= db.datasz { |
| if err := db.mmap(minsz); err != nil { |
| return nil, fmt.Errorf("mmap allocate error: %s", err) |
| } |
| } |
| |
| // Move the page id high water mark. |
| db.rwtx.meta.pgid += pgid(count) |
| |
| return p, nil |
| } |
| |
| // Options represents the options that can be set when opening a database. |
| type Options struct { |
| // Timeout is the amount of time to wait to obtain a file lock. |
| // When set to zero it will wait indefinitely. This option is only |
| // available on Darwin and Linux. |
| Timeout time.Duration |
| } |
| |
| // DefaultOptions represent the options used if nil options are passed into Open(). |
| // No timeout is used which will cause Bolt to wait indefinitely for a lock. |
| var DefaultOptions = &Options{ |
| Timeout: 0, |
| } |
| |
| // Stats represents statistics about the database. |
| type Stats struct { |
| // Freelist stats |
| FreePageN int // total number of free pages on the freelist |
| PendingPageN int // total number of pending pages on the freelist |
| FreeAlloc int // total bytes allocated in free pages |
| FreelistInuse int // total bytes used by the freelist |
| |
| // Transaction stats |
| TxN int // total number of started read transactions |
| OpenTxN int // number of currently open read transactions |
| |
| TxStats TxStats // global, ongoing stats. |
| } |
| |
| // Sub calculates and returns the difference between two sets of database stats. |
| // This is useful when obtaining stats at two different points and time and |
| // you need the performance counters that occurred within that time span. |
| func (s *Stats) Sub(other *Stats) Stats { |
| if other == nil { |
| return *s |
| } |
| var diff Stats |
| diff.FreePageN = s.FreePageN |
| diff.PendingPageN = s.PendingPageN |
| diff.FreeAlloc = s.FreeAlloc |
| diff.FreelistInuse = s.FreelistInuse |
| diff.TxN = other.TxN - s.TxN |
| diff.TxStats = s.TxStats.Sub(&other.TxStats) |
| return diff |
| } |
| |
| func (s *Stats) add(other *Stats) { |
| s.TxStats.add(&other.TxStats) |
| } |
| |
| type Info struct { |
| Data uintptr |
| PageSize int |
| } |
| |
| type meta struct { |
| magic uint32 |
| version uint32 |
| pageSize uint32 |
| flags uint32 |
| root bucket |
| freelist pgid |
| pgid pgid |
| txid txid |
| checksum uint64 |
| } |
| |
| // validate checks the marker bytes and version of the meta page to ensure it matches this binary. |
| func (m *meta) validate() error { |
| if m.checksum != 0 && m.checksum != m.sum64() { |
| return ErrChecksum |
| } else if m.magic != magic { |
| return ErrInvalid |
| } else if m.version != version { |
| return ErrVersionMismatch |
| } |
| return nil |
| } |
| |
| // copy copies one meta object to another. |
| func (m *meta) copy(dest *meta) { |
| *dest = *m |
| } |
| |
| // write writes the meta onto a page. |
| func (m *meta) write(p *page) { |
| |
| _assert(m.root.root < m.pgid, "root bucket pgid (%d) above high water mark (%d)", m.root.root, m.pgid) |
| _assert(m.freelist < m.pgid, "freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid) |
| |
| // Page id is either going to be 0 or 1 which we can determine by the transaction ID. |
| p.id = pgid(m.txid % 2) |
| p.flags |= metaPageFlag |
| |
| // Calculate the checksum. |
| m.checksum = m.sum64() |
| |
| m.copy(p.meta()) |
| } |
| |
| // generates the checksum for the meta. |
| func (m *meta) sum64() uint64 { |
| var h = fnv.New64a() |
| _, _ = h.Write((*[unsafe.Offsetof(meta{}.checksum)]byte)(unsafe.Pointer(m))[:]) |
| return h.Sum64() |
| } |
| |
| // _assert will panic with a given formatted message if the given condition is false. |
| func _assert(condition bool, msg string, v ...interface{}) { |
| if !condition { |
| panic(fmt.Sprintf("assertion failed: "+msg, v...)) |
| } |
| } |
| |
| func warn(v ...interface{}) { |
| fmt.Fprintln(os.Stderr, v...) |
| } |
| |
| func warnf(msg string, v ...interface{}) { |
| fmt.Fprintf(os.Stderr, msg+"\n", v...) |
| } |
| |
| func printstack() { |
| stack := strings.Join(strings.Split(string(debug.Stack()), "\n")[2:], "\n") |
| fmt.Fprintln(os.Stderr, stack) |
| } |