blob: 622947d9cb64fe83d4fb12bec593282556f9806c [file] [log] [blame]
Abhay Kumar40252eb2025-10-13 13:25:53 +00001package bbolt
2
3import (
4 "errors"
5 "fmt"
6 "io"
7 "os"
8 "runtime"
9 "sync"
10 "time"
11 "unsafe"
12
13 berrors "go.etcd.io/bbolt/errors"
14 "go.etcd.io/bbolt/internal/common"
15 fl "go.etcd.io/bbolt/internal/freelist"
16)
17
// flockRetryTimeout is the pause between two consecutive attempts to
// lock the database file.
const flockRetryTimeout time.Duration = 50 * time.Millisecond
20
// FreelistType names the backend implementation used for the freelist.
type FreelistType string

// TODO(ahrtr): eventually we should (step by step)
//  1. default to `FreelistMapType`;
//  2. remove the `FreelistArrayType`, do not export `FreelistMapType`
//     and remove field `FreelistType` from both `DB` and `Options`;
const (
	// FreelistArrayType selects the array-backed freelist.
	FreelistArrayType FreelistType = "array"
	// FreelistMapType selects the hashmap-backed freelist.
	FreelistMapType FreelistType = "hashmap"
)
34
// DB represents a collection of buckets persisted to a file on disk.
// All data access is performed through transactions which can be obtained through the DB.
// All the functions on DB will return a ErrDatabaseNotOpen if accessed before Open() is called.
type DB struct {
	// Put `stats` at the first field to ensure it's 64-bit aligned. Note that
	// the first word in an allocated struct can be relied upon to be 64-bit
	// aligned. Refer to https://pkg.go.dev/sync/atomic#pkg-note-BUG. Also
	// refer to discussion in https://github.com/etcd-io/bbolt/issues/577.
	stats Stats

	// When enabled, the database will perform a Check() after every commit.
	// A panic is issued if the database is in an inconsistent state. This
	// flag has a large performance impact so it should only be used for
	// debugging purposes.
	StrictMode bool

	// Setting the NoSync flag will cause the database to skip fsync()
	// calls after each commit. This can be useful when bulk loading data
	// into a database and you can restart the bulk load in the event of
	// a system failure or database corruption. Do not set this flag for
	// normal use.
	//
	// If the package global IgnoreNoSync constant is true, this value is
	// ignored. See the comment on that constant for more details.
	//
	// THIS IS UNSAFE. PLEASE USE WITH CAUTION.
	NoSync bool

	// When true, skips syncing freelist to disk. This improves the database
	// write performance under normal operation, but requires a full database
	// re-sync during recovery.
	NoFreelistSync bool

	// FreelistType sets the backend freelist type. There are two options. Array which is simple but endures
	// dramatic performance degradation if database is large and fragmentation in freelist is common.
	// The alternative one is using hashmap, it is faster in almost all circumstances
	// but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe.
	// The default type is array
	FreelistType FreelistType

	// When true, skips the truncate call when growing the database.
	// Setting this to true is only safe on non-ext3/ext4 systems.
	// Skipping truncation avoids preallocation of hard drive space and
	// bypasses a truncate() and fsync() syscall on remapping.
	//
	// https://github.com/boltdb/bolt/issues/284
	NoGrowSync bool

	// When `true`, bbolt will always load the free pages when opening the DB.
	// When opening db in write mode, this flag will always automatically
	// set to `true`.
	PreLoadFreelist bool

	// If you want to read the entire database fast, you can set MmapFlags to
	// syscall.MAP_POPULATE on Linux 2.6.23+ for sequential read-ahead.
	MmapFlags int

	// MaxBatchSize is the maximum size of a batch. Default value is
	// copied from DefaultMaxBatchSize in Open.
	//
	// If <=0, disables batching.
	//
	// Do not change concurrently with calls to Batch.
	MaxBatchSize int

	// MaxBatchDelay is the maximum delay before a batch starts.
	// Default value is copied from DefaultMaxBatchDelay in Open.
	//
	// If <=0, effectively disables batching.
	//
	// Do not change concurrently with calls to Batch.
	MaxBatchDelay time.Duration

	// AllocSize is the amount of space allocated when the database
	// needs to create new pages. This is done to amortize the cost
	// of truncate() and fsync() when growing the data file.
	AllocSize int

	// Mlock locks database file in memory when set to true.
	// It prevents major page faults, however used memory can't be reclaimed.
	//
	// Supported only on Unix via mlock/munlock syscalls.
	Mlock bool

	// logger receives the log output of this DB. Open sets it to
	// Options.Logger, or to the discard logger when none is given.
	logger Logger

	// path is the name of the opened data file; cleared by close.
	path string
	// openFile opens the data file. Open takes it from Options.OpenFile and
	// falls back to os.OpenFile.
	openFile func(string, int, os.FileMode) (*os.File, error)
	// file is the handle of the opened data file; nil after close.
	file *os.File
	// `dataref` isn't used at all on Windows, and the golangci-lint
	// always fails on Windows platform.
	//nolint
	dataref []byte // mmap'ed readonly, write throws SEGV
	// data, datasz, meta0 and meta1 describe the current mapping and are
	// reset together by invalidate. A nil data makes Begin fail with
	// ErrInvalidMapping.
	data   *[common.MaxMapSize]byte
	datasz int
	meta0  *common.Meta
	meta1  *common.Meta
	// pageSize is the page size in use: the configured or default value for
	// a new file, otherwise the value read from the meta pages.
	pageSize int
	// opened is set by Open and cleared by close.
	opened bool
	// rwtx is the most recently started read-write transaction.
	rwtx *Tx
	// txs tracks the read-only transactions that have not been removed yet.
	txs []*Tx

	// freelist is created by loadFreelist, which freelistLoad restricts to a
	// single execution.
	freelist     fl.Interface
	freelistLoad sync.Once

	// pagePool hands out byte slices of pageSize bytes.
	pagePool sync.Pool

	// batchMu guards batch, the batch that currently accepts Batch calls.
	batchMu sync.Mutex
	batch   *batch

	rwlock   sync.Mutex   // Allows only one writer at a time.
	metalock sync.Mutex   // Protects meta page access.
	mmaplock sync.RWMutex // Protects mmap access during remapping.
	statlock sync.RWMutex // Protects stats access.

	// ops holds replaceable I/O hooks; writeAt defaults to file.WriteAt.
	ops struct {
		writeAt func(b []byte, off int64) (n int, err error)
	}

	// Read only mode.
	// When true, Update() and Begin(true) return ErrDatabaseReadOnly immediately.
	readOnly bool
}
158
// Path returns the path to currently open database file.
// The stored path is cleared when the database is closed.
func (db *DB) Path() string {
	return db.path
}
163
164// GoString returns the Go string representation of the database.
165func (db *DB) GoString() string {
166 return fmt.Sprintf("bolt.DB{path:%q}", db.path)
167}
168
169// String returns the string representation of the database.
170func (db *DB) String() string {
171 return fmt.Sprintf("DB<%q>", db.path)
172}
173
174// Open creates and opens a database at the given path with a given file mode.
175// If the file does not exist then it will be created automatically with a given file mode.
176// Passing in nil options will cause Bolt to open the database with the default options.
177// Note: For read/write transactions, ensure the owner has write permission on the created/opened database file, e.g. 0600
178func Open(path string, mode os.FileMode, options *Options) (db *DB, err error) {
179 db = &DB{
180 opened: true,
181 }
182
183 // Set default options if no options are provided.
184 if options == nil {
185 options = DefaultOptions
186 }
187 db.NoSync = options.NoSync
188 db.NoGrowSync = options.NoGrowSync
189 db.MmapFlags = options.MmapFlags
190 db.NoFreelistSync = options.NoFreelistSync
191 db.PreLoadFreelist = options.PreLoadFreelist
192 db.FreelistType = options.FreelistType
193 db.Mlock = options.Mlock
194
195 // Set default values for later DB operations.
196 db.MaxBatchSize = common.DefaultMaxBatchSize
197 db.MaxBatchDelay = common.DefaultMaxBatchDelay
198 db.AllocSize = common.DefaultAllocSize
199
200 if options.Logger == nil {
201 db.logger = getDiscardLogger()
202 } else {
203 db.logger = options.Logger
204 }
205
206 lg := db.Logger()
207 if lg != discardLogger {
208 lg.Infof("Opening db file (%s) with mode %s and with options: %s", path, mode, options)
209 defer func() {
210 if err != nil {
211 lg.Errorf("Opening bbolt db (%s) failed: %v", path, err)
212 } else {
213 lg.Infof("Opening bbolt db (%s) successfully", path)
214 }
215 }()
216 }
217
218 flag := os.O_RDWR
219 if options.ReadOnly {
220 flag = os.O_RDONLY
221 db.readOnly = true
222 } else {
223 // always load free pages in write mode
224 db.PreLoadFreelist = true
225 flag |= os.O_CREATE
226 }
227
228 db.openFile = options.OpenFile
229 if db.openFile == nil {
230 db.openFile = os.OpenFile
231 }
232
233 // Open data file and separate sync handler for metadata writes.
234 if db.file, err = db.openFile(path, flag, mode); err != nil {
235 _ = db.close()
236 lg.Errorf("failed to open db file (%s): %v", path, err)
237 return nil, err
238 }
239 db.path = db.file.Name()
240
241 // Lock file so that other processes using Bolt in read-write mode cannot
242 // use the database at the same time. This would cause corruption since
243 // the two processes would write meta pages and free pages separately.
244 // The database file is locked exclusively (only one process can grab the lock)
245 // if !options.ReadOnly.
246 // The database file is locked using the shared lock (more than one process may
247 // hold a lock at the same time) otherwise (options.ReadOnly is set).
248 if err = flock(db, !db.readOnly, options.Timeout); err != nil {
249 _ = db.close()
250 lg.Errorf("failed to lock db file (%s), readonly: %t, error: %v", path, db.readOnly, err)
251 return nil, err
252 }
253
254 // Default values for test hooks
255 db.ops.writeAt = db.file.WriteAt
256
257 if db.pageSize = options.PageSize; db.pageSize == 0 {
258 // Set the default page size to the OS page size.
259 db.pageSize = common.DefaultPageSize
260 }
261
262 // Initialize the database if it doesn't exist.
263 if info, statErr := db.file.Stat(); statErr != nil {
264 _ = db.close()
265 lg.Errorf("failed to get db file's stats (%s): %v", path, err)
266 return nil, statErr
267 } else if info.Size() == 0 {
268 // Initialize new files with meta pages.
269 if err = db.init(); err != nil {
270 // clean up file descriptor on initialization fail
271 _ = db.close()
272 lg.Errorf("failed to initialize db file (%s): %v", path, err)
273 return nil, err
274 }
275 } else {
276 // try to get the page size from the metadata pages
277 if db.pageSize, err = db.getPageSize(); err != nil {
278 _ = db.close()
279 lg.Errorf("failed to get page size from db file (%s): %v", path, err)
280 return nil, err
281 }
282 }
283
284 // Initialize page pool.
285 db.pagePool = sync.Pool{
286 New: func() interface{} {
287 return make([]byte, db.pageSize)
288 },
289 }
290
291 // Memory map the data file.
292 if err = db.mmap(options.InitialMmapSize); err != nil {
293 _ = db.close()
294 lg.Errorf("failed to map db file (%s): %v", path, err)
295 return nil, err
296 }
297
298 if db.PreLoadFreelist {
299 db.loadFreelist()
300 }
301
302 if db.readOnly {
303 return db, nil
304 }
305
306 // Flush freelist when transitioning from no sync to sync so
307 // NoFreelistSync unaware boltdb can open the db later.
308 if !db.NoFreelistSync && !db.hasSyncedFreelist() {
309 tx, txErr := db.Begin(true)
310 if tx != nil {
311 txErr = tx.Commit()
312 }
313 if txErr != nil {
314 lg.Errorf("starting readwrite transaction failed: %v", txErr)
315 _ = db.close()
316 return nil, txErr
317 }
318 }
319
320 // Mark the database as opened and return.
321 return db, nil
322}
323
324// getPageSize reads the pageSize from the meta pages. It tries
325// to read the first meta page firstly. If the first page is invalid,
326// then it tries to read the second page using the default page size.
327func (db *DB) getPageSize() (int, error) {
328 var (
329 meta0CanRead, meta1CanRead bool
330 )
331
332 // Read the first meta page to determine the page size.
333 if pgSize, canRead, err := db.getPageSizeFromFirstMeta(); err != nil {
334 // We cannot read the page size from page 0, but can read page 0.
335 meta0CanRead = canRead
336 } else {
337 return pgSize, nil
338 }
339
340 // Read the second meta page to determine the page size.
341 if pgSize, canRead, err := db.getPageSizeFromSecondMeta(); err != nil {
342 // We cannot read the page size from page 1, but can read page 1.
343 meta1CanRead = canRead
344 } else {
345 return pgSize, nil
346 }
347
348 // If we can't read the page size from both pages, but can read
349 // either page, then we assume it's the same as the OS or the one
350 // given, since that's how the page size was chosen in the first place.
351 //
352 // If both pages are invalid, and (this OS uses a different page size
353 // from what the database was created with or the given page size is
354 // different from what the database was created with), then we are out
355 // of luck and cannot access the database.
356 if meta0CanRead || meta1CanRead {
357 return db.pageSize, nil
358 }
359
360 return 0, berrors.ErrInvalid
361}
362
363// getPageSizeFromFirstMeta reads the pageSize from the first meta page
364func (db *DB) getPageSizeFromFirstMeta() (int, bool, error) {
365 var buf [0x1000]byte
366 var metaCanRead bool
367 if bw, err := db.file.ReadAt(buf[:], 0); err == nil && bw == len(buf) {
368 metaCanRead = true
369 if m := db.pageInBuffer(buf[:], 0).Meta(); m.Validate() == nil {
370 return int(m.PageSize()), metaCanRead, nil
371 }
372 }
373 return 0, metaCanRead, berrors.ErrInvalid
374}
375
376// getPageSizeFromSecondMeta reads the pageSize from the second meta page
377func (db *DB) getPageSizeFromSecondMeta() (int, bool, error) {
378 var (
379 fileSize int64
380 metaCanRead bool
381 )
382
383 // get the db file size
384 if info, err := db.file.Stat(); err != nil {
385 return 0, metaCanRead, err
386 } else {
387 fileSize = info.Size()
388 }
389
390 // We need to read the second meta page, so we should skip the first page;
391 // but we don't know the exact page size yet, it's chicken & egg problem.
392 // The solution is to try all the possible page sizes, which starts from 1KB
393 // and until 16MB (1024<<14) or the end of the db file
394 //
395 // TODO: should we support larger page size?
396 for i := 0; i <= 14; i++ {
397 var buf [0x1000]byte
398 var pos int64 = 1024 << uint(i)
399 if pos >= fileSize-1024 {
400 break
401 }
402 bw, err := db.file.ReadAt(buf[:], pos)
403 if (err == nil && bw == len(buf)) || (err == io.EOF && int64(bw) == (fileSize-pos)) {
404 metaCanRead = true
405 if m := db.pageInBuffer(buf[:], 0).Meta(); m.Validate() == nil {
406 return int(m.PageSize()), metaCanRead, nil
407 }
408 }
409 }
410
411 return 0, metaCanRead, berrors.ErrInvalid
412}
413
414// loadFreelist reads the freelist if it is synced, or reconstructs it
415// by scanning the DB if it is not synced. It assumes there are no
416// concurrent accesses being made to the freelist.
417func (db *DB) loadFreelist() {
418 db.freelistLoad.Do(func() {
419 db.freelist = newFreelist(db.FreelistType)
420 if !db.hasSyncedFreelist() {
421 // Reconstruct free list by scanning the DB.
422 db.freelist.Init(db.freepages())
423 } else {
424 // Read free list from freelist page.
425 db.freelist.Read(db.page(db.meta().Freelist()))
426 }
427 db.stats.FreePageN = db.freelist.FreeCount()
428 })
429}
430
431func (db *DB) hasSyncedFreelist() bool {
432 return db.meta().Freelist() != common.PgidNoFreelist
433}
434
435func (db *DB) fileSize() (int, error) {
436 info, err := db.file.Stat()
437 if err != nil {
438 return 0, fmt.Errorf("file stat error: %w", err)
439 }
440 sz := int(info.Size())
441 if sz < db.pageSize*2 {
442 return 0, fmt.Errorf("file size too small %d", sz)
443 }
444 return sz, nil
445}
446
447// mmap opens the underlying memory-mapped file and initializes the meta references.
448// minsz is the minimum size that the new mmap can be.
449func (db *DB) mmap(minsz int) (err error) {
450 db.mmaplock.Lock()
451 defer db.mmaplock.Unlock()
452
453 lg := db.Logger()
454
455 // Ensure the size is at least the minimum size.
456 var fileSize int
457 fileSize, err = db.fileSize()
458 if err != nil {
459 lg.Errorf("getting file size failed: %w", err)
460 return err
461 }
462 var size = fileSize
463 if size < minsz {
464 size = minsz
465 }
466 size, err = db.mmapSize(size)
467 if err != nil {
468 lg.Errorf("getting map size failed: %w", err)
469 return err
470 }
471
472 if db.Mlock {
473 // Unlock db memory
474 if err := db.munlock(fileSize); err != nil {
475 return err
476 }
477 }
478
479 // Dereference all mmap references before unmapping.
480 if db.rwtx != nil {
481 db.rwtx.root.dereference()
482 }
483
484 // Unmap existing data before continuing.
485 if err = db.munmap(); err != nil {
486 return err
487 }
488
489 // Memory-map the data file as a byte slice.
490 // gofail: var mapError string
491 // return errors.New(mapError)
492 if err = mmap(db, size); err != nil {
493 lg.Errorf("[GOOS: %s, GOARCH: %s] mmap failed, size: %d, error: %v", runtime.GOOS, runtime.GOARCH, size, err)
494 return err
495 }
496
497 // Perform unmmap on any error to reset all data fields:
498 // dataref, data, datasz, meta0 and meta1.
499 defer func() {
500 if err != nil {
501 if unmapErr := db.munmap(); unmapErr != nil {
502 err = fmt.Errorf("%w; rollback unmap also failed: %v", err, unmapErr)
503 }
504 }
505 }()
506
507 if db.Mlock {
508 // Don't allow swapping of data file
509 if err := db.mlock(fileSize); err != nil {
510 return err
511 }
512 }
513
514 // Save references to the meta pages.
515 db.meta0 = db.page(0).Meta()
516 db.meta1 = db.page(1).Meta()
517
518 // Validate the meta pages. We only return an error if both meta pages fail
519 // validation, since meta0 failing validation means that it wasn't saved
520 // properly -- but we can recover using meta1. And vice-versa.
521 err0 := db.meta0.Validate()
522 err1 := db.meta1.Validate()
523 if err0 != nil && err1 != nil {
524 lg.Errorf("both meta pages are invalid, meta0: %v, meta1: %v", err0, err1)
525 return err0
526 }
527
528 return nil
529}
530
531func (db *DB) invalidate() {
532 db.dataref = nil
533 db.data = nil
534 db.datasz = 0
535
536 db.meta0 = nil
537 db.meta1 = nil
538}
539
540// munmap unmaps the data file from memory.
541func (db *DB) munmap() error {
542 defer db.invalidate()
543
544 // gofail: var unmapError string
545 // return errors.New(unmapError)
546 if err := munmap(db); err != nil {
547 db.Logger().Errorf("[GOOS: %s, GOARCH: %s] munmap failed, db.datasz: %d, error: %v", runtime.GOOS, runtime.GOARCH, db.datasz, err)
548 return fmt.Errorf("unmap error: %v", err.Error())
549 }
550
551 return nil
552}
553
554// mmapSize determines the appropriate size for the mmap given the current size
555// of the database. The minimum size is 32KB and doubles until it reaches 1GB.
556// Returns an error if the new mmap size is greater than the max allowed.
557func (db *DB) mmapSize(size int) (int, error) {
558 // Double the size from 32KB until 1GB.
559 for i := uint(15); i <= 30; i++ {
560 if size <= 1<<i {
561 return 1 << i, nil
562 }
563 }
564
565 // Verify the requested size is not above the maximum allowed.
566 if size > common.MaxMapSize {
567 return 0, errors.New("mmap too large")
568 }
569
570 // If larger than 1GB then grow by 1GB at a time.
571 sz := int64(size)
572 if remainder := sz % int64(common.MaxMmapStep); remainder > 0 {
573 sz += int64(common.MaxMmapStep) - remainder
574 }
575
576 // Ensure that the mmap size is a multiple of the page size.
577 // This should always be true since we're incrementing in MBs.
578 pageSize := int64(db.pageSize)
579 if (sz % pageSize) != 0 {
580 sz = ((sz / pageSize) + 1) * pageSize
581 }
582
583 // If we've exceeded the max size then only grow up to the max size.
584 if sz > common.MaxMapSize {
585 sz = common.MaxMapSize
586 }
587
588 return int(sz), nil
589}
590
591func (db *DB) munlock(fileSize int) error {
592 // gofail: var munlockError string
593 // return errors.New(munlockError)
594 if err := munlock(db, fileSize); err != nil {
595 db.Logger().Errorf("[GOOS: %s, GOARCH: %s] munlock failed, fileSize: %d, db.datasz: %d, error: %v", runtime.GOOS, runtime.GOARCH, fileSize, db.datasz, err)
596 return fmt.Errorf("munlock error: %v", err.Error())
597 }
598 return nil
599}
600
601func (db *DB) mlock(fileSize int) error {
602 // gofail: var mlockError string
603 // return errors.New(mlockError)
604 if err := mlock(db, fileSize); err != nil {
605 db.Logger().Errorf("[GOOS: %s, GOARCH: %s] mlock failed, fileSize: %d, db.datasz: %d, error: %v", runtime.GOOS, runtime.GOARCH, fileSize, db.datasz, err)
606 return fmt.Errorf("mlock error: %v", err.Error())
607 }
608 return nil
609}
610
611func (db *DB) mrelock(fileSizeFrom, fileSizeTo int) error {
612 if err := db.munlock(fileSizeFrom); err != nil {
613 return err
614 }
615 if err := db.mlock(fileSizeTo); err != nil {
616 return err
617 }
618 return nil
619}
620
621// init creates a new database file and initializes its meta pages.
622func (db *DB) init() error {
623 // Create two meta pages on a buffer.
624 buf := make([]byte, db.pageSize*4)
625 for i := 0; i < 2; i++ {
626 p := db.pageInBuffer(buf, common.Pgid(i))
627 p.SetId(common.Pgid(i))
628 p.SetFlags(common.MetaPageFlag)
629
630 // Initialize the meta page.
631 m := p.Meta()
632 m.SetMagic(common.Magic)
633 m.SetVersion(common.Version)
634 m.SetPageSize(uint32(db.pageSize))
635 m.SetFreelist(2)
636 m.SetRootBucket(common.NewInBucket(3, 0))
637 m.SetPgid(4)
638 m.SetTxid(common.Txid(i))
639 m.SetChecksum(m.Sum64())
640 }
641
642 // Write an empty freelist at page 3.
643 p := db.pageInBuffer(buf, common.Pgid(2))
644 p.SetId(2)
645 p.SetFlags(common.FreelistPageFlag)
646 p.SetCount(0)
647
648 // Write an empty leaf page at page 4.
649 p = db.pageInBuffer(buf, common.Pgid(3))
650 p.SetId(3)
651 p.SetFlags(common.LeafPageFlag)
652 p.SetCount(0)
653
654 // Write the buffer to our data file.
655 if _, err := db.ops.writeAt(buf, 0); err != nil {
656 db.Logger().Errorf("writeAt failed: %w", err)
657 return err
658 }
659 if err := fdatasync(db); err != nil {
660 db.Logger().Errorf("[GOOS: %s, GOARCH: %s] fdatasync failed: %w", runtime.GOOS, runtime.GOARCH, err)
661 return err
662 }
663
664 return nil
665}
666
// Close releases all database resources.
// It will block waiting for any open transactions to finish
// before closing the database and returning.
//
// The writer lock, the meta lock and the mmap write lock are acquired, in
// that order, before the actual cleanup in close runs; they are released
// again when Close returns.
func (db *DB) Close() error {
	// Taken first: the same lock is held by a read-write transaction.
	db.rwlock.Lock()
	defer db.rwlock.Unlock()

	db.metalock.Lock()
	defer db.metalock.Unlock()

	// Read-only transactions hold the read side of this lock.
	db.mmaplock.Lock()
	defer db.mmaplock.Unlock()

	return db.close()
}
682
683func (db *DB) close() error {
684 if !db.opened {
685 return nil
686 }
687
688 db.opened = false
689
690 db.freelist = nil
691
692 // Clear ops.
693 db.ops.writeAt = nil
694
695 var errs []error
696 // Close the mmap.
697 if err := db.munmap(); err != nil {
698 errs = append(errs, err)
699 }
700
701 // Close file handles.
702 if db.file != nil {
703 // No need to unlock read-only file.
704 if !db.readOnly {
705 // Unlock the file.
706 if err := funlock(db); err != nil {
707 errs = append(errs, fmt.Errorf("bolt.Close(): funlock error: %w", err))
708 }
709 }
710
711 // Close the file descriptor.
712 if err := db.file.Close(); err != nil {
713 errs = append(errs, fmt.Errorf("db file close: %w", err))
714 }
715 db.file = nil
716 }
717
718 db.path = ""
719
720 if len(errs) > 0 {
721 return errs[0]
722 }
723 return nil
724}
725
726// Begin starts a new transaction.
727// Multiple read-only transactions can be used concurrently but only one
728// write transaction can be used at a time. Starting multiple write transactions
729// will cause the calls to block and be serialized until the current write
730// transaction finishes.
731//
732// Transactions should not be dependent on one another. Opening a read
733// transaction and a write transaction in the same goroutine can cause the
734// writer to deadlock because the database periodically needs to re-mmap itself
735// as it grows and it cannot do that while a read transaction is open.
736//
737// If a long running read transaction (for example, a snapshot transaction) is
738// needed, you might want to set DB.InitialMmapSize to a large enough value
739// to avoid potential blocking of write transaction.
740//
741// IMPORTANT: You must close read-only transactions after you are finished or
742// else the database will not reclaim old pages.
743func (db *DB) Begin(writable bool) (t *Tx, err error) {
744 if lg := db.Logger(); lg != discardLogger {
745 lg.Debugf("Starting a new transaction [writable: %t]", writable)
746 defer func() {
747 if err != nil {
748 lg.Errorf("Starting a new transaction [writable: %t] failed: %v", writable, err)
749 } else {
750 lg.Debugf("Starting a new transaction [writable: %t] successfully", writable)
751 }
752 }()
753 }
754
755 if writable {
756 return db.beginRWTx()
757 }
758 return db.beginTx()
759}
760
761func (db *DB) Logger() Logger {
762 if db == nil || db.logger == nil {
763 return getDiscardLogger()
764 }
765 return db.logger
766}
767
768func (db *DB) beginTx() (*Tx, error) {
769 // Lock the meta pages while we initialize the transaction. We obtain
770 // the meta lock before the mmap lock because that's the order that the
771 // write transaction will obtain them.
772 db.metalock.Lock()
773
774 // Obtain a read-only lock on the mmap. When the mmap is remapped it will
775 // obtain a write lock so all transactions must finish before it can be
776 // remapped.
777 db.mmaplock.RLock()
778
779 // Exit if the database is not open yet.
780 if !db.opened {
781 db.mmaplock.RUnlock()
782 db.metalock.Unlock()
783 return nil, berrors.ErrDatabaseNotOpen
784 }
785
786 // Exit if the database is not correctly mapped.
787 if db.data == nil {
788 db.mmaplock.RUnlock()
789 db.metalock.Unlock()
790 return nil, berrors.ErrInvalidMapping
791 }
792
793 // Create a transaction associated with the database.
794 t := &Tx{}
795 t.init(db)
796
797 // Keep track of transaction until it closes.
798 db.txs = append(db.txs, t)
799 n := len(db.txs)
800 if db.freelist != nil {
801 db.freelist.AddReadonlyTXID(t.meta.Txid())
802 }
803
804 // Unlock the meta pages.
805 db.metalock.Unlock()
806
807 // Update the transaction stats.
808 db.statlock.Lock()
809 db.stats.TxN++
810 db.stats.OpenTxN = n
811 db.statlock.Unlock()
812
813 return t, nil
814}
815
816func (db *DB) beginRWTx() (*Tx, error) {
817 // If the database was opened with Options.ReadOnly, return an error.
818 if db.readOnly {
819 return nil, berrors.ErrDatabaseReadOnly
820 }
821
822 // Obtain writer lock. This is released by the transaction when it closes.
823 // This enforces only one writer transaction at a time.
824 db.rwlock.Lock()
825
826 // Once we have the writer lock then we can lock the meta pages so that
827 // we can set up the transaction.
828 db.metalock.Lock()
829 defer db.metalock.Unlock()
830
831 // Exit if the database is not open yet.
832 if !db.opened {
833 db.rwlock.Unlock()
834 return nil, berrors.ErrDatabaseNotOpen
835 }
836
837 // Exit if the database is not correctly mapped.
838 if db.data == nil {
839 db.rwlock.Unlock()
840 return nil, berrors.ErrInvalidMapping
841 }
842
843 // Create a transaction associated with the database.
844 t := &Tx{writable: true}
845 t.init(db)
846 db.rwtx = t
847 db.freelist.ReleasePendingPages()
848 return t, nil
849}
850
851// removeTx removes a transaction from the database.
852func (db *DB) removeTx(tx *Tx) {
853 // Release the read lock on the mmap.
854 db.mmaplock.RUnlock()
855
856 // Use the meta lock to restrict access to the DB object.
857 db.metalock.Lock()
858
859 // Remove the transaction.
860 for i, t := range db.txs {
861 if t == tx {
862 last := len(db.txs) - 1
863 db.txs[i] = db.txs[last]
864 db.txs[last] = nil
865 db.txs = db.txs[:last]
866 break
867 }
868 }
869 n := len(db.txs)
870 if db.freelist != nil {
871 db.freelist.RemoveReadonlyTXID(tx.meta.Txid())
872 }
873
874 // Unlock the meta pages.
875 db.metalock.Unlock()
876
877 // Merge statistics.
878 db.statlock.Lock()
879 db.stats.OpenTxN = n
880 db.stats.TxStats.add(&tx.stats)
881 db.statlock.Unlock()
882}
883
884// Update executes a function within the context of a read-write managed transaction.
885// If no error is returned from the function then the transaction is committed.
886// If an error is returned then the entire transaction is rolled back.
887// Any error that is returned from the function or returned from the commit is
888// returned from the Update() method.
889//
890// Attempting to manually commit or rollback within the function will cause a panic.
891func (db *DB) Update(fn func(*Tx) error) error {
892 t, err := db.Begin(true)
893 if err != nil {
894 return err
895 }
896
897 // Make sure the transaction rolls back in the event of a panic.
898 defer func() {
899 if t.db != nil {
900 t.rollback()
901 }
902 }()
903
904 // Mark as a managed tx so that the inner function cannot manually commit.
905 t.managed = true
906
907 // If an error is returned from the function then rollback and return error.
908 err = fn(t)
909 t.managed = false
910 if err != nil {
911 _ = t.Rollback()
912 return err
913 }
914
915 return t.Commit()
916}
917
918// View executes a function within the context of a managed read-only transaction.
919// Any error that is returned from the function is returned from the View() method.
920//
921// Attempting to manually rollback within the function will cause a panic.
922func (db *DB) View(fn func(*Tx) error) error {
923 t, err := db.Begin(false)
924 if err != nil {
925 return err
926 }
927
928 // Make sure the transaction rolls back in the event of a panic.
929 defer func() {
930 if t.db != nil {
931 t.rollback()
932 }
933 }()
934
935 // Mark as a managed tx so that the inner function cannot manually rollback.
936 t.managed = true
937
938 // If an error is returned from the function then pass it through.
939 err = fn(t)
940 t.managed = false
941 if err != nil {
942 _ = t.Rollback()
943 return err
944 }
945
946 return t.Rollback()
947}
948
949// Batch calls fn as part of a batch. It behaves similar to Update,
950// except:
951//
952// 1. concurrent Batch calls can be combined into a single Bolt
953// transaction.
954//
955// 2. the function passed to Batch may be called multiple times,
956// regardless of whether it returns error or not.
957//
958// This means that Batch function side effects must be idempotent and
959// take permanent effect only after a successful return is seen in
960// caller.
961//
962// The maximum batch size and delay can be adjusted with DB.MaxBatchSize
963// and DB.MaxBatchDelay, respectively.
964//
965// Batch is only useful when there are multiple goroutines calling it.
966func (db *DB) Batch(fn func(*Tx) error) error {
967 errCh := make(chan error, 1)
968
969 db.batchMu.Lock()
970 if (db.batch == nil) || (db.batch != nil && len(db.batch.calls) >= db.MaxBatchSize) {
971 // There is no existing batch, or the existing batch is full; start a new one.
972 db.batch = &batch{
973 db: db,
974 }
975 db.batch.timer = time.AfterFunc(db.MaxBatchDelay, db.batch.trigger)
976 }
977 db.batch.calls = append(db.batch.calls, call{fn: fn, err: errCh})
978 if len(db.batch.calls) >= db.MaxBatchSize {
979 // wake up batch, it's ready to run
980 go db.batch.trigger()
981 }
982 db.batchMu.Unlock()
983
984 err := <-errCh
985 if err == trySolo {
986 err = db.Update(fn)
987 }
988 return err
989}
990
// call is one function queued by DB.Batch together with the channel on
// which its result is delivered.
type call struct {
	// fn is the caller's transaction function.
	fn func(*Tx) error
	// err receives the outcome for this call: the batch result, or trySolo
	// when fn failed and should be re-run on its own.
	err chan<- error
}
995
// batch collects the calls queued by DB.Batch that are executed together
// in a single Update transaction.
type batch struct {
	// db is the database the batch runs against.
	db *DB
	// timer fires trigger after DB.MaxBatchDelay; run stops it.
	timer *time.Timer
	// start makes sure run is executed at most once.
	start sync.Once
	// calls are the queued functions, appended under db.batchMu.
	calls []call
}
1002
// trigger runs the batch if it hasn't already been run.
// It may be called from the batch timer and from DB.Batch; the sync.Once
// in start lets only the first call execute run.
func (b *batch) trigger() {
	b.start.Do(b.run)
}
1007
1008// run performs the transactions in the batch and communicates results
1009// back to DB.Batch.
1010func (b *batch) run() {
1011 b.db.batchMu.Lock()
1012 b.timer.Stop()
1013 // Make sure no new work is added to this batch, but don't break
1014 // other batches.
1015 if b.db.batch == b {
1016 b.db.batch = nil
1017 }
1018 b.db.batchMu.Unlock()
1019
1020retry:
1021 for len(b.calls) > 0 {
1022 var failIdx = -1
1023 err := b.db.Update(func(tx *Tx) error {
1024 for i, c := range b.calls {
1025 if err := safelyCall(c.fn, tx); err != nil {
1026 failIdx = i
1027 return err
1028 }
1029 }
1030 return nil
1031 })
1032
1033 if failIdx >= 0 {
1034 // take the failing transaction out of the batch. it's
1035 // safe to shorten b.calls here because db.batch no longer
1036 // points to us, and we hold the mutex anyway.
1037 c := b.calls[failIdx]
1038 b.calls[failIdx], b.calls = b.calls[len(b.calls)-1], b.calls[:len(b.calls)-1]
1039 // tell the submitter re-run it solo, continue with the rest of the batch
1040 c.err <- trySolo
1041 continue retry
1042 }
1043
1044 // pass success, or bolt internal errors, to all callers
1045 for _, c := range b.calls {
1046 c.err <- err
1047 }
1048 break retry
1049 }
1050}
1051
// trySolo is a special sentinel error value used for signaling that a
// transaction function should be re-run on its own. It is sent to a
// DB.Batch caller whose function failed inside a batch and should never
// be seen by users of the package.
var trySolo = errors.New("batch function returned an error and should be re-run solo")
1056
// panicked carries a value recovered from a panic so that it can be
// returned as an ordinary error.
type panicked struct {
	reason interface{}
}

// Error implements the error interface. If the recovered value is itself an
// error, its message is returned unchanged; any other value is formatted
// with a "panic: " prefix.
func (p panicked) Error() string {
	switch cause := p.reason.(type) {
	case error:
		return cause.Error()
	default:
		return fmt.Sprintf("panic: %v", p.reason)
	}
}
1067
1068func safelyCall(fn func(*Tx) error, tx *Tx) (err error) {
1069 defer func() {
1070 if p := recover(); p != nil {
1071 err = panicked{p}
1072 }
1073 }()
1074 return fn(tx)
1075}
1076
1077// Sync executes fdatasync() against the database file handle.
1078//
1079// This is not necessary under normal operation, however, if you use NoSync
1080// then it allows you to force the database file to sync against the disk.
1081func (db *DB) Sync() (err error) {
1082 if lg := db.Logger(); lg != discardLogger {
1083 lg.Debugf("Syncing bbolt db (%s)", db.path)
1084 defer func() {
1085 if err != nil {
1086 lg.Errorf("[GOOS: %s, GOARCH: %s] syncing bbolt db (%s) failed: %v", runtime.GOOS, runtime.GOARCH, db.path, err)
1087 } else {
1088 lg.Debugf("Syncing bbolt db (%s) successfully", db.path)
1089 }
1090 }()
1091 }
1092
1093 return fdatasync(db)
1094}
1095
1096// Stats retrieves ongoing performance stats for the database.
1097// This is only updated when a transaction closes.
1098func (db *DB) Stats() Stats {
1099 db.statlock.RLock()
1100 defer db.statlock.RUnlock()
1101 return db.stats
1102}
1103
1104// This is for internal access to the raw data bytes from the C cursor, use
1105// carefully, or not at all.
1106func (db *DB) Info() *Info {
1107 common.Assert(db.data != nil, "database file isn't correctly mapped")
1108 return &Info{uintptr(unsafe.Pointer(&db.data[0])), db.pageSize}
1109}
1110
1111// page retrieves a page reference from the mmap based on the current page size.
1112func (db *DB) page(id common.Pgid) *common.Page {
1113 pos := id * common.Pgid(db.pageSize)
1114 return (*common.Page)(unsafe.Pointer(&db.data[pos]))
1115}
1116
1117// pageInBuffer retrieves a page reference from a given byte array based on the current page size.
1118func (db *DB) pageInBuffer(b []byte, id common.Pgid) *common.Page {
1119 return (*common.Page)(unsafe.Pointer(&b[id*common.Pgid(db.pageSize)]))
1120}
1121
1122// meta retrieves the current meta page reference.
1123func (db *DB) meta() *common.Meta {
1124 // We have to return the meta with the highest txid which doesn't fail
1125 // validation. Otherwise, we can cause errors when in fact the database is
1126 // in a consistent state. metaA is the one with the higher txid.
1127 metaA := db.meta0
1128 metaB := db.meta1
1129 if db.meta1.Txid() > db.meta0.Txid() {
1130 metaA = db.meta1
1131 metaB = db.meta0
1132 }
1133
1134 // Use higher meta page if valid. Otherwise, fallback to previous, if valid.
1135 if err := metaA.Validate(); err == nil {
1136 return metaA
1137 } else if err := metaB.Validate(); err == nil {
1138 return metaB
1139 }
1140
1141 // This should never be reached, because both meta1 and meta0 were validated
1142 // on mmap() and we do fsync() on every write.
1143 panic("bolt.DB.meta(): invalid meta pages")
1144}
1145
1146// allocate returns a contiguous block of memory starting at a given page.
1147func (db *DB) allocate(txid common.Txid, count int) (*common.Page, error) {
1148 // Allocate a temporary buffer for the page.
1149 var buf []byte
1150 if count == 1 {
1151 buf = db.pagePool.Get().([]byte)
1152 } else {
1153 buf = make([]byte, count*db.pageSize)
1154 }
1155 p := (*common.Page)(unsafe.Pointer(&buf[0]))
1156 p.SetOverflow(uint32(count - 1))
1157
1158 // Use pages from the freelist if they are available.
1159 p.SetId(db.freelist.Allocate(txid, count))
1160 if p.Id() != 0 {
1161 return p, nil
1162 }
1163
1164 // Resize mmap() if we're at the end.
1165 p.SetId(db.rwtx.meta.Pgid())
1166 var minsz = int((p.Id()+common.Pgid(count))+1) * db.pageSize
1167 if minsz >= db.datasz {
1168 if err := db.mmap(minsz); err != nil {
1169 return nil, fmt.Errorf("mmap allocate error: %s", err)
1170 }
1171 }
1172
1173 // Move the page id high water mark.
1174 curPgid := db.rwtx.meta.Pgid()
1175 db.rwtx.meta.SetPgid(curPgid + common.Pgid(count))
1176
1177 return p, nil
1178}
1179
1180// grow grows the size of the database to the given sz.
1181func (db *DB) grow(sz int) error {
1182 // Ignore if the new size is less than available file size.
1183 lg := db.Logger()
1184 fileSize, err := db.fileSize()
1185 if err != nil {
1186 lg.Errorf("getting file size failed: %w", err)
1187 return err
1188 }
1189 if sz <= fileSize {
1190 return nil
1191 }
1192
1193 // If the data is smaller than the alloc size then only allocate what's needed.
1194 // Once it goes over the allocation size then allocate in chunks.
1195 if db.datasz <= db.AllocSize {
1196 sz = db.datasz
1197 } else {
1198 sz += db.AllocSize
1199 }
1200
1201 // Truncate and fsync to ensure file size metadata is flushed.
1202 // https://github.com/boltdb/bolt/issues/284
1203 if !db.NoGrowSync && !db.readOnly {
1204 if runtime.GOOS != "windows" {
1205 // gofail: var resizeFileError string
1206 // return errors.New(resizeFileError)
1207 if err := db.file.Truncate(int64(sz)); err != nil {
1208 lg.Errorf("[GOOS: %s, GOARCH: %s] truncating file failed, size: %d, db.datasz: %d, error: %v", runtime.GOOS, runtime.GOARCH, sz, db.datasz, err)
1209 return fmt.Errorf("file resize error: %s", err)
1210 }
1211 }
1212 if err := db.file.Sync(); err != nil {
1213 lg.Errorf("[GOOS: %s, GOARCH: %s] syncing file failed, db.datasz: %d, error: %v", runtime.GOOS, runtime.GOARCH, db.datasz, err)
1214 return fmt.Errorf("file sync error: %s", err)
1215 }
1216 if db.Mlock {
1217 // unlock old file and lock new one
1218 if err := db.mrelock(fileSize, sz); err != nil {
1219 return fmt.Errorf("mlock/munlock error: %s", err)
1220 }
1221 }
1222 }
1223
1224 return nil
1225}
1226
// IsReadOnly reports whether the database's readOnly flag is set.
func (db *DB) IsReadOnly() bool {
	return db.readOnly
}
1230
1231func (db *DB) freepages() []common.Pgid {
1232 tx, err := db.beginTx()
1233 defer func() {
1234 err = tx.Rollback()
1235 if err != nil {
1236 panic("freepages: failed to rollback tx")
1237 }
1238 }()
1239 if err != nil {
1240 panic("freepages: failed to open read only tx")
1241 }
1242
1243 reachable := make(map[common.Pgid]*common.Page)
1244 nofreed := make(map[common.Pgid]bool)
1245 ech := make(chan error)
1246 go func() {
1247 for e := range ech {
1248 panic(fmt.Sprintf("freepages: failed to get all reachable pages (%v)", e))
1249 }
1250 }()
1251 tx.recursivelyCheckBucket(&tx.root, reachable, nofreed, HexKVStringer(), ech)
1252 close(ech)
1253
1254 // TODO: If check bucket reported any corruptions (ech) we shouldn't proceed to freeing the pages.
1255
1256 var fids []common.Pgid
1257 for i := common.Pgid(2); i < db.meta().Pgid(); i++ {
1258 if _, ok := reachable[i]; !ok {
1259 fids = append(fids, i)
1260 }
1261 }
1262 return fids
1263}
1264
1265func newFreelist(freelistType FreelistType) fl.Interface {
1266 if freelistType == FreelistMapType {
1267 return fl.NewHashMapFreelist()
1268 }
1269 return fl.NewArrayFreelist()
1270}
1271
// Options represents the options that can be set when opening a database.
type Options struct {
	// Timeout is the amount of time to wait to obtain a file lock.
	// When set to zero it will wait indefinitely.
	Timeout time.Duration

	// NoGrowSync sets the DB.NoGrowSync flag before memory mapping the file.
	NoGrowSync bool

	// NoFreelistSync disables syncing the freelist to disk. This improves
	// the database write performance under normal operation, but requires
	// a full database re-sync during recovery.
	NoFreelistSync bool

	// PreLoadFreelist sets whether to load the free pages when opening
	// the db file. Note that when opening the db in write mode, bbolt will
	// always load the free pages.
	PreLoadFreelist bool

	// FreelistType sets the backend freelist type. There are two options.
	// Array is simple but suffers dramatic performance degradation if the
	// database is large and fragmentation in the freelist is common.
	// The alternative is hashmap, which is faster in almost all circumstances
	// but doesn't guarantee that it offers the smallest page id available.
	// In the normal case it is safe.
	// The default type is array.
	FreelistType FreelistType

	// ReadOnly opens the database in read-only mode. Uses
	// flock(..., LOCK_SH |LOCK_NB) to grab a shared lock (UNIX).
	ReadOnly bool

	// MmapFlags sets the DB.MmapFlags flag before memory mapping the file.
	MmapFlags int

	// InitialMmapSize is the initial mmap size of the database
	// in bytes. Read transactions won't block write transactions
	// if the InitialMmapSize is large enough to hold the database mmap
	// size. (See DB.Begin for more information)
	//
	// If <=0, the initial map size is 0.
	// If InitialMmapSize is smaller than the previous database size,
	// it has no effect.
	//
	// Note: On Windows, due to platform limitations, the database file size
	// will be immediately resized to match `InitialMmapSize` (aligned to page size)
	// when the DB is opened. On non-Windows platforms, the file size will grow
	// dynamically based on the actual amount of written data, regardless of `InitialMmapSize`.
	// Refer to https://github.com/etcd-io/bbolt/issues/378#issuecomment-1378121966.
	InitialMmapSize int

	// PageSize overrides the default OS page size.
	PageSize int

	// NoSync sets the initial value of DB.NoSync. Normally this can just be
	// set directly on the DB itself when returned from Open(), but this option
	// is useful in APIs which expose Options but not the underlying DB.
	NoSync bool

	// OpenFile is used to open files. It defaults to os.OpenFile. This option
	// is useful for writing hermetic tests.
	OpenFile func(string, int, os.FileMode) (*os.File, error)

	// Mlock locks the database file in memory when set to true.
	// It prevents potential page faults; however, the
	// used memory can't be reclaimed. (UNIX only)
	Mlock bool

	// Logger is the logger used for bbolt.
	Logger Logger
}
1340
1341func (o *Options) String() string {
1342 if o == nil {
1343 return "{}"
1344 }
1345
1346 return fmt.Sprintf("{Timeout: %s, NoGrowSync: %t, NoFreelistSync: %t, PreLoadFreelist: %t, FreelistType: %s, ReadOnly: %t, MmapFlags: %x, InitialMmapSize: %d, PageSize: %d, NoSync: %t, OpenFile: %p, Mlock: %t, Logger: %p}",
1347 o.Timeout, o.NoGrowSync, o.NoFreelistSync, o.PreLoadFreelist, o.FreelistType, o.ReadOnly, o.MmapFlags, o.InitialMmapSize, o.PageSize, o.NoSync, o.OpenFile, o.Mlock, o.Logger)
1348
1349}
1350
// DefaultOptions represents the options used if nil options are passed into Open().
// No timeout is used, which will cause Bolt to wait indefinitely for a lock.
// The array freelist backend is selected.
var DefaultOptions = &Options{
	Timeout:      0, // zero means wait indefinitely for the file lock
	NoGrowSync:   false,
	FreelistType: FreelistArrayType,
}
1358
// Stats represents statistics about the database.
type Stats struct {
	// Put `TxStats` at the first field to ensure it's 64-bit aligned. Note
	// that the first word in an allocated struct can be relied upon to be
	// 64-bit aligned. Refer to https://pkg.go.dev/sync/atomic#pkg-note-BUG.
	// Also refer to discussion in https://github.com/etcd-io/bbolt/issues/577.
	// Do not move this field.
	TxStats TxStats // global, ongoing stats.

	// Freelist stats
	FreePageN     int // total number of free pages on the freelist
	PendingPageN  int // total number of pending pages on the freelist
	FreeAlloc     int // total bytes allocated in free pages
	FreelistInuse int // total bytes used by the freelist

	// Transaction stats
	TxN     int // total number of started read transactions
	OpenTxN int // number of currently open read transactions
}
1377
1378// Sub calculates and returns the difference between two sets of database stats.
1379// This is useful when obtaining stats at two different points and time and
1380// you need the performance counters that occurred within that time span.
1381func (s *Stats) Sub(other *Stats) Stats {
1382 if other == nil {
1383 return *s
1384 }
1385 var diff Stats
1386 diff.FreePageN = s.FreePageN
1387 diff.PendingPageN = s.PendingPageN
1388 diff.FreeAlloc = s.FreeAlloc
1389 diff.FreelistInuse = s.FreelistInuse
1390 diff.TxN = s.TxN - other.TxN
1391 diff.TxStats = s.TxStats.Sub(&other.TxStats)
1392 return diff
1393}
1394
// Info holds the values returned by DB.Info: the address of the first byte
// of the database's mapped data and the database page size.
type Info struct {
	Data     uintptr // address of the first byte of the mapped data
	PageSize int     // page size of the database
}