blob: 870c8b1cc9b8d04ce2bddf98f39eb64372820c69 [file] [log] [blame]
khenaidoo59ce9dd2019-11-11 13:05:32 -05001package bbolt
2
3import (
4 "errors"
5 "fmt"
6 "hash/fnv"
7 "log"
8 "os"
9 "runtime"
10 "sort"
11 "sync"
12 "time"
13 "unsafe"
14)
15
16// The largest step that can be taken when remapping the mmap.
17const maxMmapStep = 1 << 30 // 1GB
18
19// The data file format version.
20const version = 2
21
22// Represents a marker value to indicate that a file is a Bolt DB.
23const magic uint32 = 0xED0CDAED
24
25const pgidNoFreelist pgid = 0xffffffffffffffff
26
27// IgnoreNoSync specifies whether the NoSync field of a DB is ignored when
28// syncing changes to a file. This is required as some operating systems,
29// such as OpenBSD, do not have a unified buffer cache (UBC) and writes
30// must be synchronized using the msync(2) syscall.
31const IgnoreNoSync = runtime.GOOS == "openbsd"
32
33// Default values if not set in a DB instance.
34const (
35 DefaultMaxBatchSize int = 1000
36 DefaultMaxBatchDelay = 10 * time.Millisecond
37 DefaultAllocSize = 16 * 1024 * 1024
38)
39
40// default page size for db is set to the OS page size.
41var defaultPageSize = os.Getpagesize()
42
43// The time elapsed between consecutive file locking attempts.
44const flockRetryTimeout = 50 * time.Millisecond
45
// FreelistType names the backend implementation used for the freelist.
type FreelistType string

// Supported freelist backends.
const (
	// FreelistArrayType selects the array-backed freelist.
	FreelistArrayType FreelistType = "array"
	// FreelistMapType selects the hashmap-backed freelist.
	FreelistMapType FreelistType = "hashmap"
)
55
// DB represents a collection of buckets persisted to a file on disk.
// All data access is performed through transactions which can be obtained through the DB.
// All the functions on DB will return an ErrDatabaseNotOpen if accessed before Open() is called.
type DB struct {
	// When enabled, the database will perform a Check() after every commit.
	// A panic is issued if the database is in an inconsistent state. This
	// flag has a large performance impact so it should only be used for
	// debugging purposes.
	StrictMode bool

	// Setting the NoSync flag will cause the database to skip fsync()
	// calls after each commit. This can be useful when bulk loading data
	// into a database and you can restart the bulk load in the event of
	// a system failure or database corruption. Do not set this flag for
	// normal use.
	//
	// If the package global IgnoreNoSync constant is true, this value is
	// ignored. See the comment on that constant for more details.
	//
	// THIS IS UNSAFE. PLEASE USE WITH CAUTION.
	NoSync bool

	// When true, skips syncing freelist to disk. This improves the database
	// write performance under normal operation, but requires a full database
	// re-sync during recovery.
	NoFreelistSync bool

	// FreelistType sets the backend freelist type. There are two options. Array which is simple but endures
	// dramatic performance degradation if database is large and fragmentation in freelist is common.
	// The alternative one is using hashmap, it is faster in almost all circumstances
	// but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe.
	// The default type is array
	FreelistType FreelistType

	// When true, skips the truncate call when growing the database.
	// Setting this to true is only safe on non-ext3/ext4 systems.
	// Skipping truncation avoids preallocation of hard drive space and
	// bypasses a truncate() and fsync() syscall on remapping.
	//
	// https://github.com/boltdb/bolt/issues/284
	NoGrowSync bool

	// If you want to read the entire database fast, you can set MmapFlags to
	// syscall.MAP_POPULATE on Linux 2.6.23+ for sequential read-ahead.
	MmapFlags int

	// MaxBatchSize is the maximum size of a batch. Default value is
	// copied from DefaultMaxBatchSize in Open.
	//
	// If <=0, disables batching.
	//
	// Do not change concurrently with calls to Batch.
	MaxBatchSize int

	// MaxBatchDelay is the maximum delay before a batch starts.
	// Default value is copied from DefaultMaxBatchDelay in Open.
	//
	// If <=0, effectively disables batching.
	//
	// Do not change concurrently with calls to Batch.
	MaxBatchDelay time.Duration

	// AllocSize is the amount of space allocated when the database
	// needs to create new pages. This is done to amortize the cost
	// of truncate() and fsync() when growing the data file.
	AllocSize int

	// path is the file path passed to Open; it is cleared by close().
	path string
	// openFile opens the data file; Open defaults it to os.OpenFile when
	// Options.OpenFile is nil.
	openFile func(string, int, os.FileMode) (*os.File, error)
	// file is the open data file handle, nil once closed.
	file    *os.File
	dataref []byte // mmap'ed readonly, write throws SEGV
	data    *[maxMapSize]byte
	datasz  int
	filesz  int // current on disk file size
	// meta0 and meta1 reference the meta structures of pages 0 and 1; they
	// are refreshed every time mmap() runs.
	meta0    *meta
	meta1    *meta
	pageSize int
	// opened is set in Open and cleared by close(); transactions refuse to
	// start while it is false.
	opened bool
	// rwtx is the most recently started read-write transaction.
	rwtx *Tx
	// txs holds the currently open read-only transactions.
	txs   []*Tx
	stats Stats

	freelist *freelist
	// freelistLoad ensures loadFreelist populates the freelist only once.
	freelistLoad sync.Once

	// pagePool hands out byte slices of pageSize bytes for single-page
	// allocations.
	pagePool sync.Pool

	// batchMu guards batch, the batch currently accepting calls from Batch.
	batchMu sync.Mutex
	batch   *batch

	rwlock   sync.Mutex   // Allows only one writer at a time.
	metalock sync.Mutex   // Protects meta page access.
	mmaplock sync.RWMutex // Protects mmap access during remapping.
	statlock sync.RWMutex // Protects stats access.

	// ops holds I/O hooks that tests can replace; Open points writeAt at
	// file.WriteAt.
	ops struct {
		writeAt func(b []byte, off int64) (n int, err error)
	}

	// Read only mode.
	// When true, Update() and Begin(true) return ErrDatabaseReadOnly immediately.
	readOnly bool
}
159
// Path returns the path to currently open database file.
// The stored path is reset to the empty string once the database is closed.
func (db *DB) Path() string {
	return db.path
}
164
// GoString returns the Go string representation of the database, which is
// the quoted file path wrapped in a bolt.DB literal.
func (db *DB) GoString() string {
	return fmt.Sprintf("bolt.DB{path:%q}", db.path)
}
169
// String returns the string representation of the database, in the form
// DB<"path"> with the file path quoted.
func (db *DB) String() string {
	return fmt.Sprintf("DB<%q>", db.path)
}
174
175// Open creates and opens a database at the given path.
176// If the file does not exist then it will be created automatically.
177// Passing in nil options will cause Bolt to open the database with the default options.
178func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
179 db := &DB{
180 opened: true,
181 }
182 // Set default options if no options are provided.
183 if options == nil {
184 options = DefaultOptions
185 }
186 db.NoSync = options.NoSync
187 db.NoGrowSync = options.NoGrowSync
188 db.MmapFlags = options.MmapFlags
189 db.NoFreelistSync = options.NoFreelistSync
190 db.FreelistType = options.FreelistType
191
192 // Set default values for later DB operations.
193 db.MaxBatchSize = DefaultMaxBatchSize
194 db.MaxBatchDelay = DefaultMaxBatchDelay
195 db.AllocSize = DefaultAllocSize
196
197 flag := os.O_RDWR
198 if options.ReadOnly {
199 flag = os.O_RDONLY
200 db.readOnly = true
201 }
202
203 db.openFile = options.OpenFile
204 if db.openFile == nil {
205 db.openFile = os.OpenFile
206 }
207
208 // Open data file and separate sync handler for metadata writes.
209 db.path = path
210 var err error
211 if db.file, err = db.openFile(db.path, flag|os.O_CREATE, mode); err != nil {
212 _ = db.close()
213 return nil, err
214 }
215
216 // Lock file so that other processes using Bolt in read-write mode cannot
217 // use the database at the same time. This would cause corruption since
218 // the two processes would write meta pages and free pages separately.
219 // The database file is locked exclusively (only one process can grab the lock)
220 // if !options.ReadOnly.
221 // The database file is locked using the shared lock (more than one process may
222 // hold a lock at the same time) otherwise (options.ReadOnly is set).
223 if err := flock(db, !db.readOnly, options.Timeout); err != nil {
224 _ = db.close()
225 return nil, err
226 }
227
228 // Default values for test hooks
229 db.ops.writeAt = db.file.WriteAt
230
231 if db.pageSize = options.PageSize; db.pageSize == 0 {
232 // Set the default page size to the OS page size.
233 db.pageSize = defaultPageSize
234 }
235
236 // Initialize the database if it doesn't exist.
237 if info, err := db.file.Stat(); err != nil {
238 _ = db.close()
239 return nil, err
240 } else if info.Size() == 0 {
241 // Initialize new files with meta pages.
242 if err := db.init(); err != nil {
243 // clean up file descriptor on initialization fail
244 _ = db.close()
245 return nil, err
246 }
247 } else {
248 // Read the first meta page to determine the page size.
249 var buf [0x1000]byte
250 // If we can't read the page size, but can read a page, assume
251 // it's the same as the OS or one given -- since that's how the
252 // page size was chosen in the first place.
253 //
254 // If the first page is invalid and this OS uses a different
255 // page size than what the database was created with then we
256 // are out of luck and cannot access the database.
257 //
258 // TODO: scan for next page
259 if bw, err := db.file.ReadAt(buf[:], 0); err == nil && bw == len(buf) {
260 if m := db.pageInBuffer(buf[:], 0).meta(); m.validate() == nil {
261 db.pageSize = int(m.pageSize)
262 }
263 } else {
264 _ = db.close()
265 return nil, ErrInvalid
266 }
267 }
268
269 // Initialize page pool.
270 db.pagePool = sync.Pool{
271 New: func() interface{} {
272 return make([]byte, db.pageSize)
273 },
274 }
275
276 // Memory map the data file.
277 if err := db.mmap(options.InitialMmapSize); err != nil {
278 _ = db.close()
279 return nil, err
280 }
281
282 if db.readOnly {
283 return db, nil
284 }
285
286 db.loadFreelist()
287
288 // Flush freelist when transitioning from no sync to sync so
289 // NoFreelistSync unaware boltdb can open the db later.
290 if !db.NoFreelistSync && !db.hasSyncedFreelist() {
291 tx, err := db.Begin(true)
292 if tx != nil {
293 err = tx.Commit()
294 }
295 if err != nil {
296 _ = db.close()
297 return nil, err
298 }
299 }
300
301 // Mark the database as opened and return.
302 return db, nil
303}
304
305// loadFreelist reads the freelist if it is synced, or reconstructs it
306// by scanning the DB if it is not synced. It assumes there are no
307// concurrent accesses being made to the freelist.
308func (db *DB) loadFreelist() {
309 db.freelistLoad.Do(func() {
310 db.freelist = newFreelist(db.FreelistType)
311 if !db.hasSyncedFreelist() {
312 // Reconstruct free list by scanning the DB.
313 db.freelist.readIDs(db.freepages())
314 } else {
315 // Read free list from freelist page.
316 db.freelist.read(db.page(db.meta().freelist))
317 }
318 db.stats.FreePageN = db.freelist.free_count()
319 })
320}
321
322func (db *DB) hasSyncedFreelist() bool {
323 return db.meta().freelist != pgidNoFreelist
324}
325
326// mmap opens the underlying memory-mapped file and initializes the meta references.
327// minsz is the minimum size that the new mmap can be.
328func (db *DB) mmap(minsz int) error {
329 db.mmaplock.Lock()
330 defer db.mmaplock.Unlock()
331
332 info, err := db.file.Stat()
333 if err != nil {
334 return fmt.Errorf("mmap stat error: %s", err)
335 } else if int(info.Size()) < db.pageSize*2 {
336 return fmt.Errorf("file size too small")
337 }
338
339 // Ensure the size is at least the minimum size.
340 var size = int(info.Size())
341 if size < minsz {
342 size = minsz
343 }
344 size, err = db.mmapSize(size)
345 if err != nil {
346 return err
347 }
348
349 // Dereference all mmap references before unmapping.
350 if db.rwtx != nil {
351 db.rwtx.root.dereference()
352 }
353
354 // Unmap existing data before continuing.
355 if err := db.munmap(); err != nil {
356 return err
357 }
358
359 // Memory-map the data file as a byte slice.
360 if err := mmap(db, size); err != nil {
361 return err
362 }
363
364 // Save references to the meta pages.
365 db.meta0 = db.page(0).meta()
366 db.meta1 = db.page(1).meta()
367
368 // Validate the meta pages. We only return an error if both meta pages fail
369 // validation, since meta0 failing validation means that it wasn't saved
370 // properly -- but we can recover using meta1. And vice-versa.
371 err0 := db.meta0.validate()
372 err1 := db.meta1.validate()
373 if err0 != nil && err1 != nil {
374 return err0
375 }
376
377 return nil
378}
379
380// munmap unmaps the data file from memory.
381func (db *DB) munmap() error {
382 if err := munmap(db); err != nil {
383 return fmt.Errorf("unmap error: " + err.Error())
384 }
385 return nil
386}
387
388// mmapSize determines the appropriate size for the mmap given the current size
389// of the database. The minimum size is 32KB and doubles until it reaches 1GB.
390// Returns an error if the new mmap size is greater than the max allowed.
391func (db *DB) mmapSize(size int) (int, error) {
392 // Double the size from 32KB until 1GB.
393 for i := uint(15); i <= 30; i++ {
394 if size <= 1<<i {
395 return 1 << i, nil
396 }
397 }
398
399 // Verify the requested size is not above the maximum allowed.
400 if size > maxMapSize {
401 return 0, fmt.Errorf("mmap too large")
402 }
403
404 // If larger than 1GB then grow by 1GB at a time.
405 sz := int64(size)
406 if remainder := sz % int64(maxMmapStep); remainder > 0 {
407 sz += int64(maxMmapStep) - remainder
408 }
409
410 // Ensure that the mmap size is a multiple of the page size.
411 // This should always be true since we're incrementing in MBs.
412 pageSize := int64(db.pageSize)
413 if (sz % pageSize) != 0 {
414 sz = ((sz / pageSize) + 1) * pageSize
415 }
416
417 // If we've exceeded the max size then only grow up to the max size.
418 if sz > maxMapSize {
419 sz = maxMapSize
420 }
421
422 return int(sz), nil
423}
424
425// init creates a new database file and initializes its meta pages.
426func (db *DB) init() error {
427 // Create two meta pages on a buffer.
428 buf := make([]byte, db.pageSize*4)
429 for i := 0; i < 2; i++ {
430 p := db.pageInBuffer(buf[:], pgid(i))
431 p.id = pgid(i)
432 p.flags = metaPageFlag
433
434 // Initialize the meta page.
435 m := p.meta()
436 m.magic = magic
437 m.version = version
438 m.pageSize = uint32(db.pageSize)
439 m.freelist = 2
440 m.root = bucket{root: 3}
441 m.pgid = 4
442 m.txid = txid(i)
443 m.checksum = m.sum64()
444 }
445
446 // Write an empty freelist at page 3.
447 p := db.pageInBuffer(buf[:], pgid(2))
448 p.id = pgid(2)
449 p.flags = freelistPageFlag
450 p.count = 0
451
452 // Write an empty leaf page at page 4.
453 p = db.pageInBuffer(buf[:], pgid(3))
454 p.id = pgid(3)
455 p.flags = leafPageFlag
456 p.count = 0
457
458 // Write the buffer to our data file.
459 if _, err := db.ops.writeAt(buf, 0); err != nil {
460 return err
461 }
462 if err := fdatasync(db); err != nil {
463 return err
464 }
465
466 return nil
467}
468
// Close releases all database resources.
// It will block waiting for any open transactions to finish
// before closing the database and returning.
//
// The writer lock, the meta lock and the mmap lock are taken, in that
// order, and held until the internal close() has returned.
func (db *DB) Close() error {
	db.rwlock.Lock()
	defer db.rwlock.Unlock()

	db.metalock.Lock()
	defer db.metalock.Unlock()

	db.mmaplock.Lock()
	defer db.mmaplock.Unlock()

	return db.close()
}
484
485func (db *DB) close() error {
486 if !db.opened {
487 return nil
488 }
489
490 db.opened = false
491
492 db.freelist = nil
493
494 // Clear ops.
495 db.ops.writeAt = nil
496
497 // Close the mmap.
498 if err := db.munmap(); err != nil {
499 return err
500 }
501
502 // Close file handles.
503 if db.file != nil {
504 // No need to unlock read-only file.
505 if !db.readOnly {
506 // Unlock the file.
507 if err := funlock(db); err != nil {
508 log.Printf("bolt.Close(): funlock error: %s", err)
509 }
510 }
511
512 // Close the file descriptor.
513 if err := db.file.Close(); err != nil {
514 return fmt.Errorf("db file close: %s", err)
515 }
516 db.file = nil
517 }
518
519 db.path = ""
520 return nil
521}
522
523// Begin starts a new transaction.
524// Multiple read-only transactions can be used concurrently but only one
525// write transaction can be used at a time. Starting multiple write transactions
526// will cause the calls to block and be serialized until the current write
527// transaction finishes.
528//
529// Transactions should not be dependent on one another. Opening a read
530// transaction and a write transaction in the same goroutine can cause the
531// writer to deadlock because the database periodically needs to re-mmap itself
532// as it grows and it cannot do that while a read transaction is open.
533//
534// If a long running read transaction (for example, a snapshot transaction) is
535// needed, you might want to set DB.InitialMmapSize to a large enough value
536// to avoid potential blocking of write transaction.
537//
538// IMPORTANT: You must close read-only transactions after you are finished or
539// else the database will not reclaim old pages.
540func (db *DB) Begin(writable bool) (*Tx, error) {
541 if writable {
542 return db.beginRWTx()
543 }
544 return db.beginTx()
545}
546
547func (db *DB) beginTx() (*Tx, error) {
548 // Lock the meta pages while we initialize the transaction. We obtain
549 // the meta lock before the mmap lock because that's the order that the
550 // write transaction will obtain them.
551 db.metalock.Lock()
552
553 // Obtain a read-only lock on the mmap. When the mmap is remapped it will
554 // obtain a write lock so all transactions must finish before it can be
555 // remapped.
556 db.mmaplock.RLock()
557
558 // Exit if the database is not open yet.
559 if !db.opened {
560 db.mmaplock.RUnlock()
561 db.metalock.Unlock()
562 return nil, ErrDatabaseNotOpen
563 }
564
565 // Create a transaction associated with the database.
566 t := &Tx{}
567 t.init(db)
568
569 // Keep track of transaction until it closes.
570 db.txs = append(db.txs, t)
571 n := len(db.txs)
572
573 // Unlock the meta pages.
574 db.metalock.Unlock()
575
576 // Update the transaction stats.
577 db.statlock.Lock()
578 db.stats.TxN++
579 db.stats.OpenTxN = n
580 db.statlock.Unlock()
581
582 return t, nil
583}
584
585func (db *DB) beginRWTx() (*Tx, error) {
586 // If the database was opened with Options.ReadOnly, return an error.
587 if db.readOnly {
588 return nil, ErrDatabaseReadOnly
589 }
590
591 // Obtain writer lock. This is released by the transaction when it closes.
592 // This enforces only one writer transaction at a time.
593 db.rwlock.Lock()
594
595 // Once we have the writer lock then we can lock the meta pages so that
596 // we can set up the transaction.
597 db.metalock.Lock()
598 defer db.metalock.Unlock()
599
600 // Exit if the database is not open yet.
601 if !db.opened {
602 db.rwlock.Unlock()
603 return nil, ErrDatabaseNotOpen
604 }
605
606 // Create a transaction associated with the database.
607 t := &Tx{writable: true}
608 t.init(db)
609 db.rwtx = t
610 db.freePages()
611 return t, nil
612}
613
614// freePages releases any pages associated with closed read-only transactions.
615func (db *DB) freePages() {
616 // Free all pending pages prior to earliest open transaction.
617 sort.Sort(txsById(db.txs))
618 minid := txid(0xFFFFFFFFFFFFFFFF)
619 if len(db.txs) > 0 {
620 minid = db.txs[0].meta.txid
621 }
622 if minid > 0 {
623 db.freelist.release(minid - 1)
624 }
625 // Release unused txid extents.
626 for _, t := range db.txs {
627 db.freelist.releaseRange(minid, t.meta.txid-1)
628 minid = t.meta.txid + 1
629 }
630 db.freelist.releaseRange(minid, txid(0xFFFFFFFFFFFFFFFF))
631 // Any page both allocated and freed in an extent is safe to release.
632}
633
634type txsById []*Tx
635
636func (t txsById) Len() int { return len(t) }
637func (t txsById) Swap(i, j int) { t[i], t[j] = t[j], t[i] }
638func (t txsById) Less(i, j int) bool { return t[i].meta.txid < t[j].meta.txid }
639
640// removeTx removes a transaction from the database.
641func (db *DB) removeTx(tx *Tx) {
642 // Release the read lock on the mmap.
643 db.mmaplock.RUnlock()
644
645 // Use the meta lock to restrict access to the DB object.
646 db.metalock.Lock()
647
648 // Remove the transaction.
649 for i, t := range db.txs {
650 if t == tx {
651 last := len(db.txs) - 1
652 db.txs[i] = db.txs[last]
653 db.txs[last] = nil
654 db.txs = db.txs[:last]
655 break
656 }
657 }
658 n := len(db.txs)
659
660 // Unlock the meta pages.
661 db.metalock.Unlock()
662
663 // Merge statistics.
664 db.statlock.Lock()
665 db.stats.OpenTxN = n
666 db.stats.TxStats.add(&tx.stats)
667 db.statlock.Unlock()
668}
669
670// Update executes a function within the context of a read-write managed transaction.
671// If no error is returned from the function then the transaction is committed.
672// If an error is returned then the entire transaction is rolled back.
673// Any error that is returned from the function or returned from the commit is
674// returned from the Update() method.
675//
676// Attempting to manually commit or rollback within the function will cause a panic.
677func (db *DB) Update(fn func(*Tx) error) error {
678 t, err := db.Begin(true)
679 if err != nil {
680 return err
681 }
682
683 // Make sure the transaction rolls back in the event of a panic.
684 defer func() {
685 if t.db != nil {
686 t.rollback()
687 }
688 }()
689
690 // Mark as a managed tx so that the inner function cannot manually commit.
691 t.managed = true
692
693 // If an error is returned from the function then rollback and return error.
694 err = fn(t)
695 t.managed = false
696 if err != nil {
697 _ = t.Rollback()
698 return err
699 }
700
701 return t.Commit()
702}
703
704// View executes a function within the context of a managed read-only transaction.
705// Any error that is returned from the function is returned from the View() method.
706//
707// Attempting to manually rollback within the function will cause a panic.
708func (db *DB) View(fn func(*Tx) error) error {
709 t, err := db.Begin(false)
710 if err != nil {
711 return err
712 }
713
714 // Make sure the transaction rolls back in the event of a panic.
715 defer func() {
716 if t.db != nil {
717 t.rollback()
718 }
719 }()
720
721 // Mark as a managed tx so that the inner function cannot manually rollback.
722 t.managed = true
723
724 // If an error is returned from the function then pass it through.
725 err = fn(t)
726 t.managed = false
727 if err != nil {
728 _ = t.Rollback()
729 return err
730 }
731
732 return t.Rollback()
733}
734
735// Batch calls fn as part of a batch. It behaves similar to Update,
736// except:
737//
738// 1. concurrent Batch calls can be combined into a single Bolt
739// transaction.
740//
741// 2. the function passed to Batch may be called multiple times,
742// regardless of whether it returns error or not.
743//
744// This means that Batch function side effects must be idempotent and
745// take permanent effect only after a successful return is seen in
746// caller.
747//
748// The maximum batch size and delay can be adjusted with DB.MaxBatchSize
749// and DB.MaxBatchDelay, respectively.
750//
751// Batch is only useful when there are multiple goroutines calling it.
752func (db *DB) Batch(fn func(*Tx) error) error {
753 errCh := make(chan error, 1)
754
755 db.batchMu.Lock()
756 if (db.batch == nil) || (db.batch != nil && len(db.batch.calls) >= db.MaxBatchSize) {
757 // There is no existing batch, or the existing batch is full; start a new one.
758 db.batch = &batch{
759 db: db,
760 }
761 db.batch.timer = time.AfterFunc(db.MaxBatchDelay, db.batch.trigger)
762 }
763 db.batch.calls = append(db.batch.calls, call{fn: fn, err: errCh})
764 if len(db.batch.calls) >= db.MaxBatchSize {
765 // wake up batch, it's ready to run
766 go db.batch.trigger()
767 }
768 db.batchMu.Unlock()
769
770 err := <-errCh
771 if err == trySolo {
772 err = db.Update(fn)
773 }
774 return err
775}
776
// call is a single function queued by DB.Batch together with the channel
// on which the batch reports that function's result.
type call struct {
	fn  func(*Tx) error
	err chan<- error
}
781
// batch collects calls submitted through DB.Batch so that they can be run
// together in one Update transaction.
type batch struct {
	db *DB
	// timer fires trigger after DB.MaxBatchDelay; it is stopped when the
	// batch runs.
	timer *time.Timer
	// start guarantees that run executes at most once.
	start sync.Once
	calls []call
}
788
// trigger runs the batch if it hasn't already been run.
// It may be called from both the delay timer and DB.Batch; the sync.Once
// makes every call after the first a no-op.
func (b *batch) trigger() {
	b.start.Do(b.run)
}
793
794// run performs the transactions in the batch and communicates results
795// back to DB.Batch.
796func (b *batch) run() {
797 b.db.batchMu.Lock()
798 b.timer.Stop()
799 // Make sure no new work is added to this batch, but don't break
800 // other batches.
801 if b.db.batch == b {
802 b.db.batch = nil
803 }
804 b.db.batchMu.Unlock()
805
806retry:
807 for len(b.calls) > 0 {
808 var failIdx = -1
809 err := b.db.Update(func(tx *Tx) error {
810 for i, c := range b.calls {
811 if err := safelyCall(c.fn, tx); err != nil {
812 failIdx = i
813 return err
814 }
815 }
816 return nil
817 })
818
819 if failIdx >= 0 {
820 // take the failing transaction out of the batch. it's
821 // safe to shorten b.calls here because db.batch no longer
822 // points to us, and we hold the mutex anyway.
823 c := b.calls[failIdx]
824 b.calls[failIdx], b.calls = b.calls[len(b.calls)-1], b.calls[:len(b.calls)-1]
825 // tell the submitter re-run it solo, continue with the rest of the batch
826 c.err <- trySolo
827 continue retry
828 }
829
830 // pass success, or bolt internal errors, to all callers
831 for _, c := range b.calls {
832 c.err <- err
833 }
834 break retry
835 }
836}
837
// trySolo is a special sentinel error value used for signaling that a
// transaction function should be re-run. It should never be seen by
// callers: DB.Batch intercepts it and re-runs the function through Update.
var trySolo = errors.New("batch function returned an error and should be re-run solo")
842
// panicked is the error produced when a batched function panics; reason
// holds the value that was recovered.
type panicked struct {
	reason interface{}
}

// Error returns the recovered error's own message when the panic value is
// an error, and "panic: <value>" for any other value.
func (p panicked) Error() string {
	switch r := p.reason.(type) {
	case error:
		return r.Error()
	default:
		return fmt.Sprintf("panic: %v", p.reason)
	}
}
853
// safelyCall invokes fn with tx and returns its error. If fn panics, the
// panic is recovered and reported as a panicked error instead of
// propagating to the caller.
func safelyCall(fn func(*Tx) error, tx *Tx) (err error) {
	// The named result lets the deferred function replace the return value
	// after a panic has been recovered.
	defer func() {
		if p := recover(); p != nil {
			err = panicked{p}
		}
	}()
	return fn(tx)
}
862
// Sync executes fdatasync() against the database file handle.
//
// This is not necessary under normal operation, however, if you use NoSync
// then it allows you to force the database file to sync against the disk.
// The error from fdatasync is returned unchanged.
func (db *DB) Sync() error { return fdatasync(db) }
868
869// Stats retrieves ongoing performance stats for the database.
870// This is only updated when a transaction closes.
871func (db *DB) Stats() Stats {
872 db.statlock.RLock()
873 defer db.statlock.RUnlock()
874 return db.stats
875}
876
877// This is for internal access to the raw data bytes from the C cursor, use
878// carefully, or not at all.
879func (db *DB) Info() *Info {
880 return &Info{uintptr(unsafe.Pointer(&db.data[0])), db.pageSize}
881}
882
883// page retrieves a page reference from the mmap based on the current page size.
884func (db *DB) page(id pgid) *page {
885 pos := id * pgid(db.pageSize)
886 return (*page)(unsafe.Pointer(&db.data[pos]))
887}
888
889// pageInBuffer retrieves a page reference from a given byte array based on the current page size.
890func (db *DB) pageInBuffer(b []byte, id pgid) *page {
891 return (*page)(unsafe.Pointer(&b[id*pgid(db.pageSize)]))
892}
893
894// meta retrieves the current meta page reference.
895func (db *DB) meta() *meta {
896 // We have to return the meta with the highest txid which doesn't fail
897 // validation. Otherwise, we can cause errors when in fact the database is
898 // in a consistent state. metaA is the one with the higher txid.
899 metaA := db.meta0
900 metaB := db.meta1
901 if db.meta1.txid > db.meta0.txid {
902 metaA = db.meta1
903 metaB = db.meta0
904 }
905
906 // Use higher meta page if valid. Otherwise fallback to previous, if valid.
907 if err := metaA.validate(); err == nil {
908 return metaA
909 } else if err := metaB.validate(); err == nil {
910 return metaB
911 }
912
913 // This should never be reached, because both meta1 and meta0 were validated
914 // on mmap() and we do fsync() on every write.
915 panic("bolt.DB.meta(): invalid meta pages")
916}
917
918// allocate returns a contiguous block of memory starting at a given page.
919func (db *DB) allocate(txid txid, count int) (*page, error) {
920 // Allocate a temporary buffer for the page.
921 var buf []byte
922 if count == 1 {
923 buf = db.pagePool.Get().([]byte)
924 } else {
925 buf = make([]byte, count*db.pageSize)
926 }
927 p := (*page)(unsafe.Pointer(&buf[0]))
928 p.overflow = uint32(count - 1)
929
930 // Use pages from the freelist if they are available.
931 if p.id = db.freelist.allocate(txid, count); p.id != 0 {
932 return p, nil
933 }
934
935 // Resize mmap() if we're at the end.
936 p.id = db.rwtx.meta.pgid
937 var minsz = int((p.id+pgid(count))+1) * db.pageSize
938 if minsz >= db.datasz {
939 if err := db.mmap(minsz); err != nil {
940 return nil, fmt.Errorf("mmap allocate error: %s", err)
941 }
942 }
943
944 // Move the page id high water mark.
945 db.rwtx.meta.pgid += pgid(count)
946
947 return p, nil
948}
949
950// grow grows the size of the database to the given sz.
951func (db *DB) grow(sz int) error {
952 // Ignore if the new size is less than available file size.
953 if sz <= db.filesz {
954 return nil
955 }
956
957 // If the data is smaller than the alloc size then only allocate what's needed.
958 // Once it goes over the allocation size then allocate in chunks.
959 if db.datasz < db.AllocSize {
960 sz = db.datasz
961 } else {
962 sz += db.AllocSize
963 }
964
965 // Truncate and fsync to ensure file size metadata is flushed.
966 // https://github.com/boltdb/bolt/issues/284
967 if !db.NoGrowSync && !db.readOnly {
968 if runtime.GOOS != "windows" {
969 if err := db.file.Truncate(int64(sz)); err != nil {
970 return fmt.Errorf("file resize error: %s", err)
971 }
972 }
973 if err := db.file.Sync(); err != nil {
974 return fmt.Errorf("file sync error: %s", err)
975 }
976 }
977
978 db.filesz = sz
979 return nil
980}
981
// IsReadOnly reports whether the database was opened in read-only mode
// (Options.ReadOnly).
func (db *DB) IsReadOnly() bool {
	return db.readOnly
}
985
986func (db *DB) freepages() []pgid {
987 tx, err := db.beginTx()
988 defer func() {
989 err = tx.Rollback()
990 if err != nil {
991 panic("freepages: failed to rollback tx")
992 }
993 }()
994 if err != nil {
995 panic("freepages: failed to open read only tx")
996 }
997
998 reachable := make(map[pgid]*page)
999 nofreed := make(map[pgid]bool)
1000 ech := make(chan error)
1001 go func() {
1002 for e := range ech {
1003 panic(fmt.Sprintf("freepages: failed to get all reachable pages (%v)", e))
1004 }
1005 }()
1006 tx.checkBucket(&tx.root, reachable, nofreed, ech)
1007 close(ech)
1008
1009 var fids []pgid
1010 for i := pgid(2); i < db.meta().pgid; i++ {
1011 if _, ok := reachable[i]; !ok {
1012 fids = append(fids, i)
1013 }
1014 }
1015 return fids
1016}
1017
// Options represents the options that can be set when opening a database.
type Options struct {
	// Timeout is the amount of time to wait to obtain a file lock.
	// When set to zero it will wait indefinitely. This option is only
	// available on Darwin and Linux.
	Timeout time.Duration

	// Sets the DB.NoGrowSync flag before memory mapping the file.
	NoGrowSync bool

	// Do not sync freelist to disk. This improves the database write performance
	// under normal operation, but requires a full database re-sync during recovery.
	NoFreelistSync bool

	// FreelistType sets the backend freelist type. There are two options. Array which is simple but endures
	// dramatic performance degradation if database is large and fragmentation in freelist is common.
	// The alternative one is using hashmap, it is faster in almost all circumstances
	// but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe.
	// The default type is array
	FreelistType FreelistType

	// Open database in read-only mode. Uses flock(..., LOCK_SH |LOCK_NB) to
	// grab a shared lock (UNIX).
	ReadOnly bool

	// Sets the DB.MmapFlags flag before memory mapping the file.
	MmapFlags int

	// InitialMmapSize is the initial mmap size of the database
	// in bytes. Read transactions won't block write transaction
	// if the InitialMmapSize is large enough to hold database mmap
	// size. (See DB.Begin for more information)
	//
	// If <=0, the initial map size is 0.
	// If InitialMmapSize is smaller than the previous database size,
	// it takes no effect.
	InitialMmapSize int

	// PageSize overrides the default OS page size.
	// A value of zero selects the OS page size.
	PageSize int

	// NoSync sets the initial value of DB.NoSync. Normally this can just be
	// set directly on the DB itself when returned from Open(), but this option
	// is useful in APIs which expose Options but not the underlying DB.
	NoSync bool

	// OpenFile is used to open files. It defaults to os.OpenFile. This option
	// is useful for writing hermetic tests.
	OpenFile func(string, int, os.FileMode) (*os.File, error)
}
1068
1069// DefaultOptions represent the options used if nil options are passed into Open().
1070// No timeout is used which will cause Bolt to wait indefinitely for a lock.
1071var DefaultOptions = &Options{
1072 Timeout: 0,
1073 NoGrowSync: false,
1074 FreelistType: FreelistArrayType,
1075}
1076
// Stats represents statistics about the database.
//
// Two snapshots can be compared with Sub, which reports TxN and TxStats as
// differences and takes the freelist figures from the later snapshot.
type Stats struct {
	// Freelist stats
	FreePageN     int // total number of free pages on the freelist
	PendingPageN  int // total number of pending pages on the freelist
	FreeAlloc     int // total bytes allocated in free pages
	FreelistInuse int // total bytes used by the freelist

	// Transaction stats
	TxN     int // total number of started read transactions
	OpenTxN int // number of currently open read transactions

	TxStats TxStats // global, ongoing stats.
}
1091
1092// Sub calculates and returns the difference between two sets of database stats.
1093// This is useful when obtaining stats at two different points and time and
1094// you need the performance counters that occurred within that time span.
1095func (s *Stats) Sub(other *Stats) Stats {
1096 if other == nil {
1097 return *s
1098 }
1099 var diff Stats
1100 diff.FreePageN = s.FreePageN
1101 diff.PendingPageN = s.PendingPageN
1102 diff.FreeAlloc = s.FreeAlloc
1103 diff.FreelistInuse = s.FreelistInuse
1104 diff.TxN = s.TxN - other.TxN
1105 diff.TxStats = s.TxStats.Sub(&other.TxStats)
1106 return diff
1107}
1108
// Info pairs a raw address with a page size.
//
// NOTE(review): presumably this is what DB.Info returns, with Data being the
// address of the memory-mapped database contents. The code that fills it in
// is outside this section, so confirm there before relying on it.
type Info struct {
	Data     uintptr // raw address; presumably the start of the mapped data (confirm)
	PageSize int     // presumably the database page size in bytes (confirm)
}
1113
// meta holds the contents of a meta page.
//
// The field order is significant: sum64 hashes every byte of the struct that
// precedes checksum, so checksum must remain the last field and no field may
// be moved across it.
type meta struct {
	magic    uint32 // must equal the magic constant, otherwise validate returns ErrInvalid
	version  uint32 // must equal the version constant, otherwise validate returns ErrVersionMismatch
	pageSize uint32 // presumably the page size the file was created with (confirm)
	flags    uint32 // NOTE(review): not used by any method in this section; purpose to be confirmed
	root     bucket // root bucket; write panics unless root.root is below pgid
	freelist pgid   // freelist page id, or pgidNoFreelist; write panics if any other value is >= pgid
	pgid     pgid   // high water mark that root.root and freelist are checked against in write
	txid     txid   // transaction id; txid % 2 selects meta page 0 or 1 in write
	checksum uint64 // sum64 of the preceding fields; validate skips the comparison when it is zero
}
1125
1126// validate checks the marker bytes and version of the meta page to ensure it matches this binary.
1127func (m *meta) validate() error {
1128 if m.magic != magic {
1129 return ErrInvalid
1130 } else if m.version != version {
1131 return ErrVersionMismatch
1132 } else if m.checksum != 0 && m.checksum != m.sum64() {
1133 return ErrChecksum
1134 }
1135 return nil
1136}
1137
// copy copies one meta object to another.
// Every field of dest, checksum included, is overwritten with the receiver's
// value by a plain struct assignment; the receiver itself is not modified.
func (m *meta) copy(dest *meta) {
	*dest = *m
}
1142
1143// write writes the meta onto a page.
1144func (m *meta) write(p *page) {
1145 if m.root.root >= m.pgid {
1146 panic(fmt.Sprintf("root bucket pgid (%d) above high water mark (%d)", m.root.root, m.pgid))
1147 } else if m.freelist >= m.pgid && m.freelist != pgidNoFreelist {
1148 // TODO: reject pgidNoFreeList if !NoFreelistSync
1149 panic(fmt.Sprintf("freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid))
1150 }
1151
1152 // Page id is either going to be 0 or 1 which we can determine by the transaction ID.
1153 p.id = pgid(m.txid % 2)
1154 p.flags |= metaPageFlag
1155
1156 // Calculate the checksum.
1157 m.checksum = m.sum64()
1158
1159 m.copy(p.meta())
1160}
1161
1162// generates the checksum for the meta.
1163func (m *meta) sum64() uint64 {
1164 var h = fnv.New64a()
1165 _, _ = h.Write((*[unsafe.Offsetof(meta{}.checksum)]byte)(unsafe.Pointer(m))[:])
1166 return h.Sum64()
1167}
1168
// _assert panics when condition is false. The panic value is the string
// "assertion failed: " followed by msg formatted with the arguments in v.
// When condition is true it does nothing.
func _assert(condition bool, msg string, v ...interface{}) {
	if condition {
		return
	}
	// The fixed prefix contains no formatting verbs, so prepending it after
	// formatting yields the same text as formatting prefix+msg in one go.
	panic("assertion failed: " + fmt.Sprintf(msg, v...))
}