| Abhay Kumar | 40252eb | 2025-10-13 13:25:53 +0000 | [diff] [blame^] | 1 | package bbolt |
| 2 | |
| 3 | import ( |
| 4 | "errors" |
| 5 | "fmt" |
| 6 | "io" |
| 7 | "os" |
| 8 | "runtime" |
| 9 | "sync" |
| 10 | "time" |
| 11 | "unsafe" |
| 12 | |
| 13 | berrors "go.etcd.io/bbolt/errors" |
| 14 | "go.etcd.io/bbolt/internal/common" |
| 15 | fl "go.etcd.io/bbolt/internal/freelist" |
| 16 | ) |
| 17 | |
// flockRetryTimeout is the time elapsed between consecutive file locking
// attempts.
const flockRetryTimeout = 50 * time.Millisecond
| 20 | |
// FreelistType is the type of the freelist backend.
type FreelistType string

// TODO(ahrtr): eventually we should (step by step)
//  1. default to `FreelistMapType`;
//  2. remove the `FreelistArrayType`, do not export `FreelistMapType`
//     and remove field `FreelistType` from both `DB` and `Options`;
const (
	// FreelistArrayType indicates that the backend freelist type is array.
	FreelistArrayType = FreelistType("array")
	// FreelistMapType indicates that the backend freelist type is hashmap.
	FreelistMapType = FreelistType("hashmap")
)
| 34 | |
// DB represents a collection of buckets persisted to a file on disk.
// All data access is performed through transactions which can be obtained through the DB.
// All the functions on DB will return an ErrDatabaseNotOpen if accessed before Open() is called.
type DB struct {
	// Put `stats` at the first field to ensure it's 64-bit aligned. Note that
	// the first word in an allocated struct can be relied upon to be 64-bit
	// aligned. Refer to https://pkg.go.dev/sync/atomic#pkg-note-BUG. Also
	// refer to discussion in https://github.com/etcd-io/bbolt/issues/577.
	stats Stats

	// When enabled, the database will perform a Check() after every commit.
	// A panic is issued if the database is in an inconsistent state. This
	// flag has a large performance impact so it should only be used for
	// debugging purposes.
	StrictMode bool

	// Setting the NoSync flag will cause the database to skip fsync()
	// calls after each commit. This can be useful when bulk loading data
	// into a database and you can restart the bulk load in the event of
	// a system failure or database corruption. Do not set this flag for
	// normal use.
	//
	// If the package global IgnoreNoSync constant is true, this value is
	// ignored. See the comment on that constant for more details.
	//
	// THIS IS UNSAFE. PLEASE USE WITH CAUTION.
	NoSync bool

	// When true, skips syncing freelist to disk. This improves the database
	// write performance under normal operation, but requires a full database
	// re-sync during recovery.
	NoFreelistSync bool

	// FreelistType sets the backend freelist type. There are two options. Array which is simple but endures
	// dramatic performance degradation if database is large and fragmentation in freelist is common.
	// The alternative one is using hashmap, it is faster in almost all circumstances
	// but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe.
	// The default type is array.
	FreelistType FreelistType

	// When true, skips the truncate call when growing the database.
	// Setting this to true is only safe on non-ext3/ext4 systems.
	// Skipping truncation avoids preallocation of hard drive space and
	// bypasses a truncate() and fsync() syscall on remapping.
	//
	// https://github.com/boltdb/bolt/issues/284
	NoGrowSync bool

	// When `true`, bbolt will always load the free pages when opening the DB.
	// When opening db in write mode, this flag will always automatically
	// set to `true`.
	PreLoadFreelist bool

	// If you want to read the entire database fast, you can set MmapFlags to
	// syscall.MAP_POPULATE on Linux 2.6.23+ for sequential read-ahead.
	MmapFlags int

	// MaxBatchSize is the maximum size of a batch. Default value is
	// copied from DefaultMaxBatchSize in Open.
	//
	// If <=0, disables batching.
	//
	// Do not change concurrently with calls to Batch.
	MaxBatchSize int

	// MaxBatchDelay is the maximum delay before a batch starts.
	// Default value is copied from DefaultMaxBatchDelay in Open.
	//
	// If <=0, effectively disables batching.
	//
	// Do not change concurrently with calls to Batch.
	MaxBatchDelay time.Duration

	// AllocSize is the amount of space allocated when the database
	// needs to create new pages. This is done to amortize the cost
	// of truncate() and fsync() when growing the data file.
	AllocSize int

	// Mlock locks database file in memory when set to true.
	// It prevents major page faults, however used memory can't be reclaimed.
	//
	// Supported only on Unix via mlock/munlock syscalls.
	Mlock bool

	// logger is set in Open from Options.Logger; when no logger is supplied
	// the discard logger is used instead.
	logger Logger

	// path is the name reported by the opened data file; it is cleared again
	// by close().
	path string
	// openFile opens the data file. Open copies it from Options.OpenFile and
	// falls back to os.OpenFile when that option is nil.
	openFile func(string, int, os.FileMode) (*os.File, error)
	// file is the handle of the opened data file; close() closes it and sets
	// it back to nil.
	file *os.File
	// `dataref` isn't used at all on Windows, and the golangci-lint
	// always fails on Windows platform.
	//nolint
	dataref []byte // mmap'ed readonly, write throws SEGV
	// data, datasz, meta0 and meta1 describe the current memory mapping and
	// are all reset by invalidate(). meta0 and meta1 are saved by mmap() from
	// pages 0 and 1.
	data   *[common.MaxMapSize]byte
	datasz int
	meta0  *common.Meta
	meta1  *common.Meta
	// pageSize is taken from Options.PageSize (or the default page size) and,
	// for an existing non-empty file, replaced by the value read from the
	// meta pages in Open.
	pageSize int
	// opened is set to true by Open and to false by close().
	opened bool
	// rwtx is the read-write transaction most recently created by beginRWTx.
	rwtx *Tx
	// txs tracks the transactions created by beginTx until removeTx drops
	// them.
	txs []*Tx

	freelist fl.Interface
	// freelistLoad makes sure the body of loadFreelist runs only once.
	freelistLoad sync.Once

	// pagePool hands out byte slices of pageSize length (see Open).
	pagePool sync.Pool

	batchMu sync.Mutex
	batch   *batch

	rwlock   sync.Mutex   // Allows only one writer at a time.
	metalock sync.Mutex   // Protects meta page access.
	mmaplock sync.RWMutex // Protects mmap access during remapping.
	statlock sync.RWMutex // Protects stats access.

	// ops holds hooks that can be replaced in tests; Open points writeAt at
	// the data file's WriteAt and close() clears it.
	ops struct {
		writeAt func(b []byte, off int64) (n int, err error)
	}

	// Read only mode.
	// When true, Update() and Begin(true) return ErrDatabaseReadOnly immediately.
	readOnly bool
}
| 158 | |
| 159 | // Path returns the path to currently open database file. |
| 160 | func (db *DB) Path() string { |
| 161 | return db.path |
| 162 | } |
| 163 | |
| 164 | // GoString returns the Go string representation of the database. |
| 165 | func (db *DB) GoString() string { |
| 166 | return fmt.Sprintf("bolt.DB{path:%q}", db.path) |
| 167 | } |
| 168 | |
| 169 | // String returns the string representation of the database. |
| 170 | func (db *DB) String() string { |
| 171 | return fmt.Sprintf("DB<%q>", db.path) |
| 172 | } |
| 173 | |
| 174 | // Open creates and opens a database at the given path with a given file mode. |
| 175 | // If the file does not exist then it will be created automatically with a given file mode. |
| 176 | // Passing in nil options will cause Bolt to open the database with the default options. |
| 177 | // Note: For read/write transactions, ensure the owner has write permission on the created/opened database file, e.g. 0600 |
| 178 | func Open(path string, mode os.FileMode, options *Options) (db *DB, err error) { |
| 179 | db = &DB{ |
| 180 | opened: true, |
| 181 | } |
| 182 | |
| 183 | // Set default options if no options are provided. |
| 184 | if options == nil { |
| 185 | options = DefaultOptions |
| 186 | } |
| 187 | db.NoSync = options.NoSync |
| 188 | db.NoGrowSync = options.NoGrowSync |
| 189 | db.MmapFlags = options.MmapFlags |
| 190 | db.NoFreelistSync = options.NoFreelistSync |
| 191 | db.PreLoadFreelist = options.PreLoadFreelist |
| 192 | db.FreelistType = options.FreelistType |
| 193 | db.Mlock = options.Mlock |
| 194 | |
| 195 | // Set default values for later DB operations. |
| 196 | db.MaxBatchSize = common.DefaultMaxBatchSize |
| 197 | db.MaxBatchDelay = common.DefaultMaxBatchDelay |
| 198 | db.AllocSize = common.DefaultAllocSize |
| 199 | |
| 200 | if options.Logger == nil { |
| 201 | db.logger = getDiscardLogger() |
| 202 | } else { |
| 203 | db.logger = options.Logger |
| 204 | } |
| 205 | |
| 206 | lg := db.Logger() |
| 207 | if lg != discardLogger { |
| 208 | lg.Infof("Opening db file (%s) with mode %s and with options: %s", path, mode, options) |
| 209 | defer func() { |
| 210 | if err != nil { |
| 211 | lg.Errorf("Opening bbolt db (%s) failed: %v", path, err) |
| 212 | } else { |
| 213 | lg.Infof("Opening bbolt db (%s) successfully", path) |
| 214 | } |
| 215 | }() |
| 216 | } |
| 217 | |
| 218 | flag := os.O_RDWR |
| 219 | if options.ReadOnly { |
| 220 | flag = os.O_RDONLY |
| 221 | db.readOnly = true |
| 222 | } else { |
| 223 | // always load free pages in write mode |
| 224 | db.PreLoadFreelist = true |
| 225 | flag |= os.O_CREATE |
| 226 | } |
| 227 | |
| 228 | db.openFile = options.OpenFile |
| 229 | if db.openFile == nil { |
| 230 | db.openFile = os.OpenFile |
| 231 | } |
| 232 | |
| 233 | // Open data file and separate sync handler for metadata writes. |
| 234 | if db.file, err = db.openFile(path, flag, mode); err != nil { |
| 235 | _ = db.close() |
| 236 | lg.Errorf("failed to open db file (%s): %v", path, err) |
| 237 | return nil, err |
| 238 | } |
| 239 | db.path = db.file.Name() |
| 240 | |
| 241 | // Lock file so that other processes using Bolt in read-write mode cannot |
| 242 | // use the database at the same time. This would cause corruption since |
| 243 | // the two processes would write meta pages and free pages separately. |
| 244 | // The database file is locked exclusively (only one process can grab the lock) |
| 245 | // if !options.ReadOnly. |
| 246 | // The database file is locked using the shared lock (more than one process may |
| 247 | // hold a lock at the same time) otherwise (options.ReadOnly is set). |
| 248 | if err = flock(db, !db.readOnly, options.Timeout); err != nil { |
| 249 | _ = db.close() |
| 250 | lg.Errorf("failed to lock db file (%s), readonly: %t, error: %v", path, db.readOnly, err) |
| 251 | return nil, err |
| 252 | } |
| 253 | |
| 254 | // Default values for test hooks |
| 255 | db.ops.writeAt = db.file.WriteAt |
| 256 | |
| 257 | if db.pageSize = options.PageSize; db.pageSize == 0 { |
| 258 | // Set the default page size to the OS page size. |
| 259 | db.pageSize = common.DefaultPageSize |
| 260 | } |
| 261 | |
| 262 | // Initialize the database if it doesn't exist. |
| 263 | if info, statErr := db.file.Stat(); statErr != nil { |
| 264 | _ = db.close() |
| 265 | lg.Errorf("failed to get db file's stats (%s): %v", path, err) |
| 266 | return nil, statErr |
| 267 | } else if info.Size() == 0 { |
| 268 | // Initialize new files with meta pages. |
| 269 | if err = db.init(); err != nil { |
| 270 | // clean up file descriptor on initialization fail |
| 271 | _ = db.close() |
| 272 | lg.Errorf("failed to initialize db file (%s): %v", path, err) |
| 273 | return nil, err |
| 274 | } |
| 275 | } else { |
| 276 | // try to get the page size from the metadata pages |
| 277 | if db.pageSize, err = db.getPageSize(); err != nil { |
| 278 | _ = db.close() |
| 279 | lg.Errorf("failed to get page size from db file (%s): %v", path, err) |
| 280 | return nil, err |
| 281 | } |
| 282 | } |
| 283 | |
| 284 | // Initialize page pool. |
| 285 | db.pagePool = sync.Pool{ |
| 286 | New: func() interface{} { |
| 287 | return make([]byte, db.pageSize) |
| 288 | }, |
| 289 | } |
| 290 | |
| 291 | // Memory map the data file. |
| 292 | if err = db.mmap(options.InitialMmapSize); err != nil { |
| 293 | _ = db.close() |
| 294 | lg.Errorf("failed to map db file (%s): %v", path, err) |
| 295 | return nil, err |
| 296 | } |
| 297 | |
| 298 | if db.PreLoadFreelist { |
| 299 | db.loadFreelist() |
| 300 | } |
| 301 | |
| 302 | if db.readOnly { |
| 303 | return db, nil |
| 304 | } |
| 305 | |
| 306 | // Flush freelist when transitioning from no sync to sync so |
| 307 | // NoFreelistSync unaware boltdb can open the db later. |
| 308 | if !db.NoFreelistSync && !db.hasSyncedFreelist() { |
| 309 | tx, txErr := db.Begin(true) |
| 310 | if tx != nil { |
| 311 | txErr = tx.Commit() |
| 312 | } |
| 313 | if txErr != nil { |
| 314 | lg.Errorf("starting readwrite transaction failed: %v", txErr) |
| 315 | _ = db.close() |
| 316 | return nil, txErr |
| 317 | } |
| 318 | } |
| 319 | |
| 320 | // Mark the database as opened and return. |
| 321 | return db, nil |
| 322 | } |
| 323 | |
| 324 | // getPageSize reads the pageSize from the meta pages. It tries |
| 325 | // to read the first meta page firstly. If the first page is invalid, |
| 326 | // then it tries to read the second page using the default page size. |
| 327 | func (db *DB) getPageSize() (int, error) { |
| 328 | var ( |
| 329 | meta0CanRead, meta1CanRead bool |
| 330 | ) |
| 331 | |
| 332 | // Read the first meta page to determine the page size. |
| 333 | if pgSize, canRead, err := db.getPageSizeFromFirstMeta(); err != nil { |
| 334 | // We cannot read the page size from page 0, but can read page 0. |
| 335 | meta0CanRead = canRead |
| 336 | } else { |
| 337 | return pgSize, nil |
| 338 | } |
| 339 | |
| 340 | // Read the second meta page to determine the page size. |
| 341 | if pgSize, canRead, err := db.getPageSizeFromSecondMeta(); err != nil { |
| 342 | // We cannot read the page size from page 1, but can read page 1. |
| 343 | meta1CanRead = canRead |
| 344 | } else { |
| 345 | return pgSize, nil |
| 346 | } |
| 347 | |
| 348 | // If we can't read the page size from both pages, but can read |
| 349 | // either page, then we assume it's the same as the OS or the one |
| 350 | // given, since that's how the page size was chosen in the first place. |
| 351 | // |
| 352 | // If both pages are invalid, and (this OS uses a different page size |
| 353 | // from what the database was created with or the given page size is |
| 354 | // different from what the database was created with), then we are out |
| 355 | // of luck and cannot access the database. |
| 356 | if meta0CanRead || meta1CanRead { |
| 357 | return db.pageSize, nil |
| 358 | } |
| 359 | |
| 360 | return 0, berrors.ErrInvalid |
| 361 | } |
| 362 | |
| 363 | // getPageSizeFromFirstMeta reads the pageSize from the first meta page |
| 364 | func (db *DB) getPageSizeFromFirstMeta() (int, bool, error) { |
| 365 | var buf [0x1000]byte |
| 366 | var metaCanRead bool |
| 367 | if bw, err := db.file.ReadAt(buf[:], 0); err == nil && bw == len(buf) { |
| 368 | metaCanRead = true |
| 369 | if m := db.pageInBuffer(buf[:], 0).Meta(); m.Validate() == nil { |
| 370 | return int(m.PageSize()), metaCanRead, nil |
| 371 | } |
| 372 | } |
| 373 | return 0, metaCanRead, berrors.ErrInvalid |
| 374 | } |
| 375 | |
| 376 | // getPageSizeFromSecondMeta reads the pageSize from the second meta page |
| 377 | func (db *DB) getPageSizeFromSecondMeta() (int, bool, error) { |
| 378 | var ( |
| 379 | fileSize int64 |
| 380 | metaCanRead bool |
| 381 | ) |
| 382 | |
| 383 | // get the db file size |
| 384 | if info, err := db.file.Stat(); err != nil { |
| 385 | return 0, metaCanRead, err |
| 386 | } else { |
| 387 | fileSize = info.Size() |
| 388 | } |
| 389 | |
| 390 | // We need to read the second meta page, so we should skip the first page; |
| 391 | // but we don't know the exact page size yet, it's chicken & egg problem. |
| 392 | // The solution is to try all the possible page sizes, which starts from 1KB |
| 393 | // and until 16MB (1024<<14) or the end of the db file |
| 394 | // |
| 395 | // TODO: should we support larger page size? |
| 396 | for i := 0; i <= 14; i++ { |
| 397 | var buf [0x1000]byte |
| 398 | var pos int64 = 1024 << uint(i) |
| 399 | if pos >= fileSize-1024 { |
| 400 | break |
| 401 | } |
| 402 | bw, err := db.file.ReadAt(buf[:], pos) |
| 403 | if (err == nil && bw == len(buf)) || (err == io.EOF && int64(bw) == (fileSize-pos)) { |
| 404 | metaCanRead = true |
| 405 | if m := db.pageInBuffer(buf[:], 0).Meta(); m.Validate() == nil { |
| 406 | return int(m.PageSize()), metaCanRead, nil |
| 407 | } |
| 408 | } |
| 409 | } |
| 410 | |
| 411 | return 0, metaCanRead, berrors.ErrInvalid |
| 412 | } |
| 413 | |
| 414 | // loadFreelist reads the freelist if it is synced, or reconstructs it |
| 415 | // by scanning the DB if it is not synced. It assumes there are no |
| 416 | // concurrent accesses being made to the freelist. |
| 417 | func (db *DB) loadFreelist() { |
| 418 | db.freelistLoad.Do(func() { |
| 419 | db.freelist = newFreelist(db.FreelistType) |
| 420 | if !db.hasSyncedFreelist() { |
| 421 | // Reconstruct free list by scanning the DB. |
| 422 | db.freelist.Init(db.freepages()) |
| 423 | } else { |
| 424 | // Read free list from freelist page. |
| 425 | db.freelist.Read(db.page(db.meta().Freelist())) |
| 426 | } |
| 427 | db.stats.FreePageN = db.freelist.FreeCount() |
| 428 | }) |
| 429 | } |
| 430 | |
| 431 | func (db *DB) hasSyncedFreelist() bool { |
| 432 | return db.meta().Freelist() != common.PgidNoFreelist |
| 433 | } |
| 434 | |
| 435 | func (db *DB) fileSize() (int, error) { |
| 436 | info, err := db.file.Stat() |
| 437 | if err != nil { |
| 438 | return 0, fmt.Errorf("file stat error: %w", err) |
| 439 | } |
| 440 | sz := int(info.Size()) |
| 441 | if sz < db.pageSize*2 { |
| 442 | return 0, fmt.Errorf("file size too small %d", sz) |
| 443 | } |
| 444 | return sz, nil |
| 445 | } |
| 446 | |
| 447 | // mmap opens the underlying memory-mapped file and initializes the meta references. |
| 448 | // minsz is the minimum size that the new mmap can be. |
| 449 | func (db *DB) mmap(minsz int) (err error) { |
| 450 | db.mmaplock.Lock() |
| 451 | defer db.mmaplock.Unlock() |
| 452 | |
| 453 | lg := db.Logger() |
| 454 | |
| 455 | // Ensure the size is at least the minimum size. |
| 456 | var fileSize int |
| 457 | fileSize, err = db.fileSize() |
| 458 | if err != nil { |
| 459 | lg.Errorf("getting file size failed: %w", err) |
| 460 | return err |
| 461 | } |
| 462 | var size = fileSize |
| 463 | if size < minsz { |
| 464 | size = minsz |
| 465 | } |
| 466 | size, err = db.mmapSize(size) |
| 467 | if err != nil { |
| 468 | lg.Errorf("getting map size failed: %w", err) |
| 469 | return err |
| 470 | } |
| 471 | |
| 472 | if db.Mlock { |
| 473 | // Unlock db memory |
| 474 | if err := db.munlock(fileSize); err != nil { |
| 475 | return err |
| 476 | } |
| 477 | } |
| 478 | |
| 479 | // Dereference all mmap references before unmapping. |
| 480 | if db.rwtx != nil { |
| 481 | db.rwtx.root.dereference() |
| 482 | } |
| 483 | |
| 484 | // Unmap existing data before continuing. |
| 485 | if err = db.munmap(); err != nil { |
| 486 | return err |
| 487 | } |
| 488 | |
| 489 | // Memory-map the data file as a byte slice. |
| 490 | // gofail: var mapError string |
| 491 | // return errors.New(mapError) |
| 492 | if err = mmap(db, size); err != nil { |
| 493 | lg.Errorf("[GOOS: %s, GOARCH: %s] mmap failed, size: %d, error: %v", runtime.GOOS, runtime.GOARCH, size, err) |
| 494 | return err |
| 495 | } |
| 496 | |
| 497 | // Perform unmmap on any error to reset all data fields: |
| 498 | // dataref, data, datasz, meta0 and meta1. |
| 499 | defer func() { |
| 500 | if err != nil { |
| 501 | if unmapErr := db.munmap(); unmapErr != nil { |
| 502 | err = fmt.Errorf("%w; rollback unmap also failed: %v", err, unmapErr) |
| 503 | } |
| 504 | } |
| 505 | }() |
| 506 | |
| 507 | if db.Mlock { |
| 508 | // Don't allow swapping of data file |
| 509 | if err := db.mlock(fileSize); err != nil { |
| 510 | return err |
| 511 | } |
| 512 | } |
| 513 | |
| 514 | // Save references to the meta pages. |
| 515 | db.meta0 = db.page(0).Meta() |
| 516 | db.meta1 = db.page(1).Meta() |
| 517 | |
| 518 | // Validate the meta pages. We only return an error if both meta pages fail |
| 519 | // validation, since meta0 failing validation means that it wasn't saved |
| 520 | // properly -- but we can recover using meta1. And vice-versa. |
| 521 | err0 := db.meta0.Validate() |
| 522 | err1 := db.meta1.Validate() |
| 523 | if err0 != nil && err1 != nil { |
| 524 | lg.Errorf("both meta pages are invalid, meta0: %v, meta1: %v", err0, err1) |
| 525 | return err0 |
| 526 | } |
| 527 | |
| 528 | return nil |
| 529 | } |
| 530 | |
| 531 | func (db *DB) invalidate() { |
| 532 | db.dataref = nil |
| 533 | db.data = nil |
| 534 | db.datasz = 0 |
| 535 | |
| 536 | db.meta0 = nil |
| 537 | db.meta1 = nil |
| 538 | } |
| 539 | |
| 540 | // munmap unmaps the data file from memory. |
| 541 | func (db *DB) munmap() error { |
| 542 | defer db.invalidate() |
| 543 | |
| 544 | // gofail: var unmapError string |
| 545 | // return errors.New(unmapError) |
| 546 | if err := munmap(db); err != nil { |
| 547 | db.Logger().Errorf("[GOOS: %s, GOARCH: %s] munmap failed, db.datasz: %d, error: %v", runtime.GOOS, runtime.GOARCH, db.datasz, err) |
| 548 | return fmt.Errorf("unmap error: %v", err.Error()) |
| 549 | } |
| 550 | |
| 551 | return nil |
| 552 | } |
| 553 | |
| 554 | // mmapSize determines the appropriate size for the mmap given the current size |
| 555 | // of the database. The minimum size is 32KB and doubles until it reaches 1GB. |
| 556 | // Returns an error if the new mmap size is greater than the max allowed. |
| 557 | func (db *DB) mmapSize(size int) (int, error) { |
| 558 | // Double the size from 32KB until 1GB. |
| 559 | for i := uint(15); i <= 30; i++ { |
| 560 | if size <= 1<<i { |
| 561 | return 1 << i, nil |
| 562 | } |
| 563 | } |
| 564 | |
| 565 | // Verify the requested size is not above the maximum allowed. |
| 566 | if size > common.MaxMapSize { |
| 567 | return 0, errors.New("mmap too large") |
| 568 | } |
| 569 | |
| 570 | // If larger than 1GB then grow by 1GB at a time. |
| 571 | sz := int64(size) |
| 572 | if remainder := sz % int64(common.MaxMmapStep); remainder > 0 { |
| 573 | sz += int64(common.MaxMmapStep) - remainder |
| 574 | } |
| 575 | |
| 576 | // Ensure that the mmap size is a multiple of the page size. |
| 577 | // This should always be true since we're incrementing in MBs. |
| 578 | pageSize := int64(db.pageSize) |
| 579 | if (sz % pageSize) != 0 { |
| 580 | sz = ((sz / pageSize) + 1) * pageSize |
| 581 | } |
| 582 | |
| 583 | // If we've exceeded the max size then only grow up to the max size. |
| 584 | if sz > common.MaxMapSize { |
| 585 | sz = common.MaxMapSize |
| 586 | } |
| 587 | |
| 588 | return int(sz), nil |
| 589 | } |
| 590 | |
| 591 | func (db *DB) munlock(fileSize int) error { |
| 592 | // gofail: var munlockError string |
| 593 | // return errors.New(munlockError) |
| 594 | if err := munlock(db, fileSize); err != nil { |
| 595 | db.Logger().Errorf("[GOOS: %s, GOARCH: %s] munlock failed, fileSize: %d, db.datasz: %d, error: %v", runtime.GOOS, runtime.GOARCH, fileSize, db.datasz, err) |
| 596 | return fmt.Errorf("munlock error: %v", err.Error()) |
| 597 | } |
| 598 | return nil |
| 599 | } |
| 600 | |
| 601 | func (db *DB) mlock(fileSize int) error { |
| 602 | // gofail: var mlockError string |
| 603 | // return errors.New(mlockError) |
| 604 | if err := mlock(db, fileSize); err != nil { |
| 605 | db.Logger().Errorf("[GOOS: %s, GOARCH: %s] mlock failed, fileSize: %d, db.datasz: %d, error: %v", runtime.GOOS, runtime.GOARCH, fileSize, db.datasz, err) |
| 606 | return fmt.Errorf("mlock error: %v", err.Error()) |
| 607 | } |
| 608 | return nil |
| 609 | } |
| 610 | |
| 611 | func (db *DB) mrelock(fileSizeFrom, fileSizeTo int) error { |
| 612 | if err := db.munlock(fileSizeFrom); err != nil { |
| 613 | return err |
| 614 | } |
| 615 | if err := db.mlock(fileSizeTo); err != nil { |
| 616 | return err |
| 617 | } |
| 618 | return nil |
| 619 | } |
| 620 | |
| 621 | // init creates a new database file and initializes its meta pages. |
| 622 | func (db *DB) init() error { |
| 623 | // Create two meta pages on a buffer. |
| 624 | buf := make([]byte, db.pageSize*4) |
| 625 | for i := 0; i < 2; i++ { |
| 626 | p := db.pageInBuffer(buf, common.Pgid(i)) |
| 627 | p.SetId(common.Pgid(i)) |
| 628 | p.SetFlags(common.MetaPageFlag) |
| 629 | |
| 630 | // Initialize the meta page. |
| 631 | m := p.Meta() |
| 632 | m.SetMagic(common.Magic) |
| 633 | m.SetVersion(common.Version) |
| 634 | m.SetPageSize(uint32(db.pageSize)) |
| 635 | m.SetFreelist(2) |
| 636 | m.SetRootBucket(common.NewInBucket(3, 0)) |
| 637 | m.SetPgid(4) |
| 638 | m.SetTxid(common.Txid(i)) |
| 639 | m.SetChecksum(m.Sum64()) |
| 640 | } |
| 641 | |
| 642 | // Write an empty freelist at page 3. |
| 643 | p := db.pageInBuffer(buf, common.Pgid(2)) |
| 644 | p.SetId(2) |
| 645 | p.SetFlags(common.FreelistPageFlag) |
| 646 | p.SetCount(0) |
| 647 | |
| 648 | // Write an empty leaf page at page 4. |
| 649 | p = db.pageInBuffer(buf, common.Pgid(3)) |
| 650 | p.SetId(3) |
| 651 | p.SetFlags(common.LeafPageFlag) |
| 652 | p.SetCount(0) |
| 653 | |
| 654 | // Write the buffer to our data file. |
| 655 | if _, err := db.ops.writeAt(buf, 0); err != nil { |
| 656 | db.Logger().Errorf("writeAt failed: %w", err) |
| 657 | return err |
| 658 | } |
| 659 | if err := fdatasync(db); err != nil { |
| 660 | db.Logger().Errorf("[GOOS: %s, GOARCH: %s] fdatasync failed: %w", runtime.GOOS, runtime.GOARCH, err) |
| 661 | return err |
| 662 | } |
| 663 | |
| 664 | return nil |
| 665 | } |
| 666 | |
| 667 | // Close releases all database resources. |
| 668 | // It will block waiting for any open transactions to finish |
| 669 | // before closing the database and returning. |
| 670 | func (db *DB) Close() error { |
| 671 | db.rwlock.Lock() |
| 672 | defer db.rwlock.Unlock() |
| 673 | |
| 674 | db.metalock.Lock() |
| 675 | defer db.metalock.Unlock() |
| 676 | |
| 677 | db.mmaplock.Lock() |
| 678 | defer db.mmaplock.Unlock() |
| 679 | |
| 680 | return db.close() |
| 681 | } |
| 682 | |
| 683 | func (db *DB) close() error { |
| 684 | if !db.opened { |
| 685 | return nil |
| 686 | } |
| 687 | |
| 688 | db.opened = false |
| 689 | |
| 690 | db.freelist = nil |
| 691 | |
| 692 | // Clear ops. |
| 693 | db.ops.writeAt = nil |
| 694 | |
| 695 | var errs []error |
| 696 | // Close the mmap. |
| 697 | if err := db.munmap(); err != nil { |
| 698 | errs = append(errs, err) |
| 699 | } |
| 700 | |
| 701 | // Close file handles. |
| 702 | if db.file != nil { |
| 703 | // No need to unlock read-only file. |
| 704 | if !db.readOnly { |
| 705 | // Unlock the file. |
| 706 | if err := funlock(db); err != nil { |
| 707 | errs = append(errs, fmt.Errorf("bolt.Close(): funlock error: %w", err)) |
| 708 | } |
| 709 | } |
| 710 | |
| 711 | // Close the file descriptor. |
| 712 | if err := db.file.Close(); err != nil { |
| 713 | errs = append(errs, fmt.Errorf("db file close: %w", err)) |
| 714 | } |
| 715 | db.file = nil |
| 716 | } |
| 717 | |
| 718 | db.path = "" |
| 719 | |
| 720 | if len(errs) > 0 { |
| 721 | return errs[0] |
| 722 | } |
| 723 | return nil |
| 724 | } |
| 725 | |
| 726 | // Begin starts a new transaction. |
| 727 | // Multiple read-only transactions can be used concurrently but only one |
| 728 | // write transaction can be used at a time. Starting multiple write transactions |
| 729 | // will cause the calls to block and be serialized until the current write |
| 730 | // transaction finishes. |
| 731 | // |
| 732 | // Transactions should not be dependent on one another. Opening a read |
| 733 | // transaction and a write transaction in the same goroutine can cause the |
| 734 | // writer to deadlock because the database periodically needs to re-mmap itself |
| 735 | // as it grows and it cannot do that while a read transaction is open. |
| 736 | // |
| 737 | // If a long running read transaction (for example, a snapshot transaction) is |
| 738 | // needed, you might want to set DB.InitialMmapSize to a large enough value |
| 739 | // to avoid potential blocking of write transaction. |
| 740 | // |
| 741 | // IMPORTANT: You must close read-only transactions after you are finished or |
| 742 | // else the database will not reclaim old pages. |
| 743 | func (db *DB) Begin(writable bool) (t *Tx, err error) { |
| 744 | if lg := db.Logger(); lg != discardLogger { |
| 745 | lg.Debugf("Starting a new transaction [writable: %t]", writable) |
| 746 | defer func() { |
| 747 | if err != nil { |
| 748 | lg.Errorf("Starting a new transaction [writable: %t] failed: %v", writable, err) |
| 749 | } else { |
| 750 | lg.Debugf("Starting a new transaction [writable: %t] successfully", writable) |
| 751 | } |
| 752 | }() |
| 753 | } |
| 754 | |
| 755 | if writable { |
| 756 | return db.beginRWTx() |
| 757 | } |
| 758 | return db.beginTx() |
| 759 | } |
| 760 | |
| 761 | func (db *DB) Logger() Logger { |
| 762 | if db == nil || db.logger == nil { |
| 763 | return getDiscardLogger() |
| 764 | } |
| 765 | return db.logger |
| 766 | } |
| 767 | |
| 768 | func (db *DB) beginTx() (*Tx, error) { |
| 769 | // Lock the meta pages while we initialize the transaction. We obtain |
| 770 | // the meta lock before the mmap lock because that's the order that the |
| 771 | // write transaction will obtain them. |
| 772 | db.metalock.Lock() |
| 773 | |
| 774 | // Obtain a read-only lock on the mmap. When the mmap is remapped it will |
| 775 | // obtain a write lock so all transactions must finish before it can be |
| 776 | // remapped. |
| 777 | db.mmaplock.RLock() |
| 778 | |
| 779 | // Exit if the database is not open yet. |
| 780 | if !db.opened { |
| 781 | db.mmaplock.RUnlock() |
| 782 | db.metalock.Unlock() |
| 783 | return nil, berrors.ErrDatabaseNotOpen |
| 784 | } |
| 785 | |
| 786 | // Exit if the database is not correctly mapped. |
| 787 | if db.data == nil { |
| 788 | db.mmaplock.RUnlock() |
| 789 | db.metalock.Unlock() |
| 790 | return nil, berrors.ErrInvalidMapping |
| 791 | } |
| 792 | |
| 793 | // Create a transaction associated with the database. |
| 794 | t := &Tx{} |
| 795 | t.init(db) |
| 796 | |
| 797 | // Keep track of transaction until it closes. |
| 798 | db.txs = append(db.txs, t) |
| 799 | n := len(db.txs) |
| 800 | if db.freelist != nil { |
| 801 | db.freelist.AddReadonlyTXID(t.meta.Txid()) |
| 802 | } |
| 803 | |
| 804 | // Unlock the meta pages. |
| 805 | db.metalock.Unlock() |
| 806 | |
| 807 | // Update the transaction stats. |
| 808 | db.statlock.Lock() |
| 809 | db.stats.TxN++ |
| 810 | db.stats.OpenTxN = n |
| 811 | db.statlock.Unlock() |
| 812 | |
| 813 | return t, nil |
| 814 | } |
| 815 | |
| 816 | func (db *DB) beginRWTx() (*Tx, error) { |
| 817 | // If the database was opened with Options.ReadOnly, return an error. |
| 818 | if db.readOnly { |
| 819 | return nil, berrors.ErrDatabaseReadOnly |
| 820 | } |
| 821 | |
| 822 | // Obtain writer lock. This is released by the transaction when it closes. |
| 823 | // This enforces only one writer transaction at a time. |
| 824 | db.rwlock.Lock() |
| 825 | |
| 826 | // Once we have the writer lock then we can lock the meta pages so that |
| 827 | // we can set up the transaction. |
| 828 | db.metalock.Lock() |
| 829 | defer db.metalock.Unlock() |
| 830 | |
| 831 | // Exit if the database is not open yet. |
| 832 | if !db.opened { |
| 833 | db.rwlock.Unlock() |
| 834 | return nil, berrors.ErrDatabaseNotOpen |
| 835 | } |
| 836 | |
| 837 | // Exit if the database is not correctly mapped. |
| 838 | if db.data == nil { |
| 839 | db.rwlock.Unlock() |
| 840 | return nil, berrors.ErrInvalidMapping |
| 841 | } |
| 842 | |
| 843 | // Create a transaction associated with the database. |
| 844 | t := &Tx{writable: true} |
| 845 | t.init(db) |
| 846 | db.rwtx = t |
| 847 | db.freelist.ReleasePendingPages() |
| 848 | return t, nil |
| 849 | } |
| 850 | |
| 851 | // removeTx removes a transaction from the database. |
| 852 | func (db *DB) removeTx(tx *Tx) { |
| 853 | // Release the read lock on the mmap. |
| 854 | db.mmaplock.RUnlock() |
| 855 | |
| 856 | // Use the meta lock to restrict access to the DB object. |
| 857 | db.metalock.Lock() |
| 858 | |
| 859 | // Remove the transaction. |
| 860 | for i, t := range db.txs { |
| 861 | if t == tx { |
| 862 | last := len(db.txs) - 1 |
| 863 | db.txs[i] = db.txs[last] |
| 864 | db.txs[last] = nil |
| 865 | db.txs = db.txs[:last] |
| 866 | break |
| 867 | } |
| 868 | } |
| 869 | n := len(db.txs) |
| 870 | if db.freelist != nil { |
| 871 | db.freelist.RemoveReadonlyTXID(tx.meta.Txid()) |
| 872 | } |
| 873 | |
| 874 | // Unlock the meta pages. |
| 875 | db.metalock.Unlock() |
| 876 | |
| 877 | // Merge statistics. |
| 878 | db.statlock.Lock() |
| 879 | db.stats.OpenTxN = n |
| 880 | db.stats.TxStats.add(&tx.stats) |
| 881 | db.statlock.Unlock() |
| 882 | } |
| 883 | |
| 884 | // Update executes a function within the context of a read-write managed transaction. |
| 885 | // If no error is returned from the function then the transaction is committed. |
| 886 | // If an error is returned then the entire transaction is rolled back. |
| 887 | // Any error that is returned from the function or returned from the commit is |
| 888 | // returned from the Update() method. |
| 889 | // |
| 890 | // Attempting to manually commit or rollback within the function will cause a panic. |
| 891 | func (db *DB) Update(fn func(*Tx) error) error { |
| 892 | t, err := db.Begin(true) |
| 893 | if err != nil { |
| 894 | return err |
| 895 | } |
| 896 | |
| 897 | // Make sure the transaction rolls back in the event of a panic. |
| 898 | defer func() { |
| 899 | if t.db != nil { |
| 900 | t.rollback() |
| 901 | } |
| 902 | }() |
| 903 | |
| 904 | // Mark as a managed tx so that the inner function cannot manually commit. |
| 905 | t.managed = true |
| 906 | |
| 907 | // If an error is returned from the function then rollback and return error. |
| 908 | err = fn(t) |
| 909 | t.managed = false |
| 910 | if err != nil { |
| 911 | _ = t.Rollback() |
| 912 | return err |
| 913 | } |
| 914 | |
| 915 | return t.Commit() |
| 916 | } |
| 917 | |
| 918 | // View executes a function within the context of a managed read-only transaction. |
| 919 | // Any error that is returned from the function is returned from the View() method. |
| 920 | // |
| 921 | // Attempting to manually rollback within the function will cause a panic. |
| 922 | func (db *DB) View(fn func(*Tx) error) error { |
| 923 | t, err := db.Begin(false) |
| 924 | if err != nil { |
| 925 | return err |
| 926 | } |
| 927 | |
| 928 | // Make sure the transaction rolls back in the event of a panic. |
| 929 | defer func() { |
| 930 | if t.db != nil { |
| 931 | t.rollback() |
| 932 | } |
| 933 | }() |
| 934 | |
| 935 | // Mark as a managed tx so that the inner function cannot manually rollback. |
| 936 | t.managed = true |
| 937 | |
| 938 | // If an error is returned from the function then pass it through. |
| 939 | err = fn(t) |
| 940 | t.managed = false |
| 941 | if err != nil { |
| 942 | _ = t.Rollback() |
| 943 | return err |
| 944 | } |
| 945 | |
| 946 | return t.Rollback() |
| 947 | } |
| 948 | |
| 949 | // Batch calls fn as part of a batch. It behaves similar to Update, |
| 950 | // except: |
| 951 | // |
| 952 | // 1. concurrent Batch calls can be combined into a single Bolt |
| 953 | // transaction. |
| 954 | // |
| 955 | // 2. the function passed to Batch may be called multiple times, |
| 956 | // regardless of whether it returns error or not. |
| 957 | // |
| 958 | // This means that Batch function side effects must be idempotent and |
| 959 | // take permanent effect only after a successful return is seen in |
| 960 | // caller. |
| 961 | // |
| 962 | // The maximum batch size and delay can be adjusted with DB.MaxBatchSize |
| 963 | // and DB.MaxBatchDelay, respectively. |
| 964 | // |
| 965 | // Batch is only useful when there are multiple goroutines calling it. |
| 966 | func (db *DB) Batch(fn func(*Tx) error) error { |
| 967 | errCh := make(chan error, 1) |
| 968 | |
| 969 | db.batchMu.Lock() |
| 970 | if (db.batch == nil) || (db.batch != nil && len(db.batch.calls) >= db.MaxBatchSize) { |
| 971 | // There is no existing batch, or the existing batch is full; start a new one. |
| 972 | db.batch = &batch{ |
| 973 | db: db, |
| 974 | } |
| 975 | db.batch.timer = time.AfterFunc(db.MaxBatchDelay, db.batch.trigger) |
| 976 | } |
| 977 | db.batch.calls = append(db.batch.calls, call{fn: fn, err: errCh}) |
| 978 | if len(db.batch.calls) >= db.MaxBatchSize { |
| 979 | // wake up batch, it's ready to run |
| 980 | go db.batch.trigger() |
| 981 | } |
| 982 | db.batchMu.Unlock() |
| 983 | |
| 984 | err := <-errCh |
| 985 | if err == trySolo { |
| 986 | err = db.Update(fn) |
| 987 | } |
| 988 | return err |
| 989 | } |
| 990 | |
| 991 | type call struct { |
| 992 | fn func(*Tx) error |
| 993 | err chan<- error |
| 994 | } |
| 995 | |
| 996 | type batch struct { |
| 997 | db *DB |
| 998 | timer *time.Timer |
| 999 | start sync.Once |
| 1000 | calls []call |
| 1001 | } |
| 1002 | |
| 1003 | // trigger runs the batch if it hasn't already been run. |
| 1004 | func (b *batch) trigger() { |
| 1005 | b.start.Do(b.run) |
| 1006 | } |
| 1007 | |
| 1008 | // run performs the transactions in the batch and communicates results |
| 1009 | // back to DB.Batch. |
| 1010 | func (b *batch) run() { |
| 1011 | b.db.batchMu.Lock() |
| 1012 | b.timer.Stop() |
| 1013 | // Make sure no new work is added to this batch, but don't break |
| 1014 | // other batches. |
| 1015 | if b.db.batch == b { |
| 1016 | b.db.batch = nil |
| 1017 | } |
| 1018 | b.db.batchMu.Unlock() |
| 1019 | |
| 1020 | retry: |
| 1021 | for len(b.calls) > 0 { |
| 1022 | var failIdx = -1 |
| 1023 | err := b.db.Update(func(tx *Tx) error { |
| 1024 | for i, c := range b.calls { |
| 1025 | if err := safelyCall(c.fn, tx); err != nil { |
| 1026 | failIdx = i |
| 1027 | return err |
| 1028 | } |
| 1029 | } |
| 1030 | return nil |
| 1031 | }) |
| 1032 | |
| 1033 | if failIdx >= 0 { |
| 1034 | // take the failing transaction out of the batch. it's |
| 1035 | // safe to shorten b.calls here because db.batch no longer |
| 1036 | // points to us, and we hold the mutex anyway. |
| 1037 | c := b.calls[failIdx] |
| 1038 | b.calls[failIdx], b.calls = b.calls[len(b.calls)-1], b.calls[:len(b.calls)-1] |
| 1039 | // tell the submitter re-run it solo, continue with the rest of the batch |
| 1040 | c.err <- trySolo |
| 1041 | continue retry |
| 1042 | } |
| 1043 | |
| 1044 | // pass success, or bolt internal errors, to all callers |
| 1045 | for _, c := range b.calls { |
| 1046 | c.err <- err |
| 1047 | } |
| 1048 | break retry |
| 1049 | } |
| 1050 | } |
| 1051 | |
// trySolo is the sentinel a batch sends to a DB.Batch caller whose function
// failed inside the shared transaction, signaling that the function should be
// re-run in a transaction of its own. It is internal to the batch machinery
// and should never be seen by callers.
var trySolo = errors.New("batch function returned an error and should be re-run solo")
| 1056 | |
// panicked is an error that carries the value recovered from a panic.
type panicked struct {
	// reason is the value returned by recover().
	reason interface{}
}

// Error returns the message of the recovered value when that value is itself
// an error, and "panic: <value>" for any other value.
func (p panicked) Error() string {
	switch r := p.reason.(type) {
	case error:
		return r.Error()
	default:
		return fmt.Sprintf("panic: %v", r)
	}
}
| 1067 | |
| 1068 | func safelyCall(fn func(*Tx) error, tx *Tx) (err error) { |
| 1069 | defer func() { |
| 1070 | if p := recover(); p != nil { |
| 1071 | err = panicked{p} |
| 1072 | } |
| 1073 | }() |
| 1074 | return fn(tx) |
| 1075 | } |
| 1076 | |
| 1077 | // Sync executes fdatasync() against the database file handle. |
| 1078 | // |
| 1079 | // This is not necessary under normal operation, however, if you use NoSync |
| 1080 | // then it allows you to force the database file to sync against the disk. |
| 1081 | func (db *DB) Sync() (err error) { |
| 1082 | if lg := db.Logger(); lg != discardLogger { |
| 1083 | lg.Debugf("Syncing bbolt db (%s)", db.path) |
| 1084 | defer func() { |
| 1085 | if err != nil { |
| 1086 | lg.Errorf("[GOOS: %s, GOARCH: %s] syncing bbolt db (%s) failed: %v", runtime.GOOS, runtime.GOARCH, db.path, err) |
| 1087 | } else { |
| 1088 | lg.Debugf("Syncing bbolt db (%s) successfully", db.path) |
| 1089 | } |
| 1090 | }() |
| 1091 | } |
| 1092 | |
| 1093 | return fdatasync(db) |
| 1094 | } |
| 1095 | |
| 1096 | // Stats retrieves ongoing performance stats for the database. |
| 1097 | // This is only updated when a transaction closes. |
| 1098 | func (db *DB) Stats() Stats { |
| 1099 | db.statlock.RLock() |
| 1100 | defer db.statlock.RUnlock() |
| 1101 | return db.stats |
| 1102 | } |
| 1103 | |
| 1104 | // This is for internal access to the raw data bytes from the C cursor, use |
| 1105 | // carefully, or not at all. |
| 1106 | func (db *DB) Info() *Info { |
| 1107 | common.Assert(db.data != nil, "database file isn't correctly mapped") |
| 1108 | return &Info{uintptr(unsafe.Pointer(&db.data[0])), db.pageSize} |
| 1109 | } |
| 1110 | |
| 1111 | // page retrieves a page reference from the mmap based on the current page size. |
| 1112 | func (db *DB) page(id common.Pgid) *common.Page { |
| 1113 | pos := id * common.Pgid(db.pageSize) |
| 1114 | return (*common.Page)(unsafe.Pointer(&db.data[pos])) |
| 1115 | } |
| 1116 | |
| 1117 | // pageInBuffer retrieves a page reference from a given byte array based on the current page size. |
| 1118 | func (db *DB) pageInBuffer(b []byte, id common.Pgid) *common.Page { |
| 1119 | return (*common.Page)(unsafe.Pointer(&b[id*common.Pgid(db.pageSize)])) |
| 1120 | } |
| 1121 | |
| 1122 | // meta retrieves the current meta page reference. |
| 1123 | func (db *DB) meta() *common.Meta { |
| 1124 | // We have to return the meta with the highest txid which doesn't fail |
| 1125 | // validation. Otherwise, we can cause errors when in fact the database is |
| 1126 | // in a consistent state. metaA is the one with the higher txid. |
| 1127 | metaA := db.meta0 |
| 1128 | metaB := db.meta1 |
| 1129 | if db.meta1.Txid() > db.meta0.Txid() { |
| 1130 | metaA = db.meta1 |
| 1131 | metaB = db.meta0 |
| 1132 | } |
| 1133 | |
| 1134 | // Use higher meta page if valid. Otherwise, fallback to previous, if valid. |
| 1135 | if err := metaA.Validate(); err == nil { |
| 1136 | return metaA |
| 1137 | } else if err := metaB.Validate(); err == nil { |
| 1138 | return metaB |
| 1139 | } |
| 1140 | |
| 1141 | // This should never be reached, because both meta1 and meta0 were validated |
| 1142 | // on mmap() and we do fsync() on every write. |
| 1143 | panic("bolt.DB.meta(): invalid meta pages") |
| 1144 | } |
| 1145 | |
| 1146 | // allocate returns a contiguous block of memory starting at a given page. |
| 1147 | func (db *DB) allocate(txid common.Txid, count int) (*common.Page, error) { |
| 1148 | // Allocate a temporary buffer for the page. |
| 1149 | var buf []byte |
| 1150 | if count == 1 { |
| 1151 | buf = db.pagePool.Get().([]byte) |
| 1152 | } else { |
| 1153 | buf = make([]byte, count*db.pageSize) |
| 1154 | } |
| 1155 | p := (*common.Page)(unsafe.Pointer(&buf[0])) |
| 1156 | p.SetOverflow(uint32(count - 1)) |
| 1157 | |
| 1158 | // Use pages from the freelist if they are available. |
| 1159 | p.SetId(db.freelist.Allocate(txid, count)) |
| 1160 | if p.Id() != 0 { |
| 1161 | return p, nil |
| 1162 | } |
| 1163 | |
| 1164 | // Resize mmap() if we're at the end. |
| 1165 | p.SetId(db.rwtx.meta.Pgid()) |
| 1166 | var minsz = int((p.Id()+common.Pgid(count))+1) * db.pageSize |
| 1167 | if minsz >= db.datasz { |
| 1168 | if err := db.mmap(minsz); err != nil { |
| 1169 | return nil, fmt.Errorf("mmap allocate error: %s", err) |
| 1170 | } |
| 1171 | } |
| 1172 | |
| 1173 | // Move the page id high water mark. |
| 1174 | curPgid := db.rwtx.meta.Pgid() |
| 1175 | db.rwtx.meta.SetPgid(curPgid + common.Pgid(count)) |
| 1176 | |
| 1177 | return p, nil |
| 1178 | } |
| 1179 | |
| 1180 | // grow grows the size of the database to the given sz. |
| 1181 | func (db *DB) grow(sz int) error { |
| 1182 | // Ignore if the new size is less than available file size. |
| 1183 | lg := db.Logger() |
| 1184 | fileSize, err := db.fileSize() |
| 1185 | if err != nil { |
| 1186 | lg.Errorf("getting file size failed: %w", err) |
| 1187 | return err |
| 1188 | } |
| 1189 | if sz <= fileSize { |
| 1190 | return nil |
| 1191 | } |
| 1192 | |
| 1193 | // If the data is smaller than the alloc size then only allocate what's needed. |
| 1194 | // Once it goes over the allocation size then allocate in chunks. |
| 1195 | if db.datasz <= db.AllocSize { |
| 1196 | sz = db.datasz |
| 1197 | } else { |
| 1198 | sz += db.AllocSize |
| 1199 | } |
| 1200 | |
| 1201 | // Truncate and fsync to ensure file size metadata is flushed. |
| 1202 | // https://github.com/boltdb/bolt/issues/284 |
| 1203 | if !db.NoGrowSync && !db.readOnly { |
| 1204 | if runtime.GOOS != "windows" { |
| 1205 | // gofail: var resizeFileError string |
| 1206 | // return errors.New(resizeFileError) |
| 1207 | if err := db.file.Truncate(int64(sz)); err != nil { |
| 1208 | lg.Errorf("[GOOS: %s, GOARCH: %s] truncating file failed, size: %d, db.datasz: %d, error: %v", runtime.GOOS, runtime.GOARCH, sz, db.datasz, err) |
| 1209 | return fmt.Errorf("file resize error: %s", err) |
| 1210 | } |
| 1211 | } |
| 1212 | if err := db.file.Sync(); err != nil { |
| 1213 | lg.Errorf("[GOOS: %s, GOARCH: %s] syncing file failed, db.datasz: %d, error: %v", runtime.GOOS, runtime.GOARCH, db.datasz, err) |
| 1214 | return fmt.Errorf("file sync error: %s", err) |
| 1215 | } |
| 1216 | if db.Mlock { |
| 1217 | // unlock old file and lock new one |
| 1218 | if err := db.mrelock(fileSize, sz); err != nil { |
| 1219 | return fmt.Errorf("mlock/munlock error: %s", err) |
| 1220 | } |
| 1221 | } |
| 1222 | } |
| 1223 | |
| 1224 | return nil |
| 1225 | } |
| 1226 | |
| 1227 | func (db *DB) IsReadOnly() bool { |
| 1228 | return db.readOnly |
| 1229 | } |
| 1230 | |
| 1231 | func (db *DB) freepages() []common.Pgid { |
| 1232 | tx, err := db.beginTx() |
| 1233 | defer func() { |
| 1234 | err = tx.Rollback() |
| 1235 | if err != nil { |
| 1236 | panic("freepages: failed to rollback tx") |
| 1237 | } |
| 1238 | }() |
| 1239 | if err != nil { |
| 1240 | panic("freepages: failed to open read only tx") |
| 1241 | } |
| 1242 | |
| 1243 | reachable := make(map[common.Pgid]*common.Page) |
| 1244 | nofreed := make(map[common.Pgid]bool) |
| 1245 | ech := make(chan error) |
| 1246 | go func() { |
| 1247 | for e := range ech { |
| 1248 | panic(fmt.Sprintf("freepages: failed to get all reachable pages (%v)", e)) |
| 1249 | } |
| 1250 | }() |
| 1251 | tx.recursivelyCheckBucket(&tx.root, reachable, nofreed, HexKVStringer(), ech) |
| 1252 | close(ech) |
| 1253 | |
| 1254 | // TODO: If check bucket reported any corruptions (ech) we shouldn't proceed to freeing the pages. |
| 1255 | |
| 1256 | var fids []common.Pgid |
| 1257 | for i := common.Pgid(2); i < db.meta().Pgid(); i++ { |
| 1258 | if _, ok := reachable[i]; !ok { |
| 1259 | fids = append(fids, i) |
| 1260 | } |
| 1261 | } |
| 1262 | return fids |
| 1263 | } |
| 1264 | |
| 1265 | func newFreelist(freelistType FreelistType) fl.Interface { |
| 1266 | if freelistType == FreelistMapType { |
| 1267 | return fl.NewHashMapFreelist() |
| 1268 | } |
| 1269 | return fl.NewArrayFreelist() |
| 1270 | } |
| 1271 | |
| 1272 | // Options represents the options that can be set when opening a database. |
| 1273 | type Options struct { |
| 1274 | // Timeout is the amount of time to wait to obtain a file lock. |
| 1275 | // When set to zero it will wait indefinitely. |
| 1276 | Timeout time.Duration |
| 1277 | |
| 1278 | // Sets the DB.NoGrowSync flag before memory mapping the file. |
| 1279 | NoGrowSync bool |
| 1280 | |
| 1281 | // Do not sync freelist to disk. This improves the database write performance |
| 1282 | // under normal operation, but requires a full database re-sync during recovery. |
| 1283 | NoFreelistSync bool |
| 1284 | |
| 1285 | // PreLoadFreelist sets whether to load the free pages when opening |
| 1286 | // the db file. Note when opening db in write mode, bbolt will always |
| 1287 | // load the free pages. |
| 1288 | PreLoadFreelist bool |
| 1289 | |
| 1290 | // FreelistType sets the backend freelist type. There are two options. Array which is simple but endures |
| 1291 | // dramatic performance degradation if database is large and fragmentation in freelist is common. |
| 1292 | // The alternative one is using hashmap, it is faster in almost all circumstances |
| 1293 | // but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe. |
| 1294 | // The default type is array |
| 1295 | FreelistType FreelistType |
| 1296 | |
| 1297 | // Open database in read-only mode. Uses flock(..., LOCK_SH |LOCK_NB) to |
| 1298 | // grab a shared lock (UNIX). |
| 1299 | ReadOnly bool |
| 1300 | |
| 1301 | // Sets the DB.MmapFlags flag before memory mapping the file. |
| 1302 | MmapFlags int |
| 1303 | |
| 1304 | // InitialMmapSize is the initial mmap size of the database |
| 1305 | // in bytes. Read transactions won't block write transaction |
| 1306 | // if the InitialMmapSize is large enough to hold database mmap |
| 1307 | // size. (See DB.Begin for more information) |
| 1308 | // |
| 1309 | // If <=0, the initial map size is 0. |
| 1310 | // If initialMmapSize is smaller than the previous database size, |
| 1311 | // it takes no effect. |
| 1312 | // |
| 1313 | // Note: On Windows, due to platform limitations, the database file size |
| 1314 | // will be immediately resized to match `InitialMmapSize` (aligned to page size) |
| 1315 | // when the DB is opened. On non-Windows platforms, the file size will grow |
| 1316 | // dynamically based on the actual amount of written data, regardless of `InitialMmapSize`. |
| 1317 | // Refer to https://github.com/etcd-io/bbolt/issues/378#issuecomment-1378121966. |
| 1318 | InitialMmapSize int |
| 1319 | |
| 1320 | // PageSize overrides the default OS page size. |
| 1321 | PageSize int |
| 1322 | |
| 1323 | // NoSync sets the initial value of DB.NoSync. Normally this can just be |
| 1324 | // set directly on the DB itself when returned from Open(), but this option |
| 1325 | // is useful in APIs which expose Options but not the underlying DB. |
| 1326 | NoSync bool |
| 1327 | |
| 1328 | // OpenFile is used to open files. It defaults to os.OpenFile. This option |
| 1329 | // is useful for writing hermetic tests. |
| 1330 | OpenFile func(string, int, os.FileMode) (*os.File, error) |
| 1331 | |
| 1332 | // Mlock locks database file in memory when set to true. |
| 1333 | // It prevents potential page faults, however |
| 1334 | // used memory can't be reclaimed. (UNIX only) |
| 1335 | Mlock bool |
| 1336 | |
| 1337 | // Logger is the logger used for bbolt. |
| 1338 | Logger Logger |
| 1339 | } |
| 1340 | |
| 1341 | func (o *Options) String() string { |
| 1342 | if o == nil { |
| 1343 | return "{}" |
| 1344 | } |
| 1345 | |
| 1346 | return fmt.Sprintf("{Timeout: %s, NoGrowSync: %t, NoFreelistSync: %t, PreLoadFreelist: %t, FreelistType: %s, ReadOnly: %t, MmapFlags: %x, InitialMmapSize: %d, PageSize: %d, NoSync: %t, OpenFile: %p, Mlock: %t, Logger: %p}", |
| 1347 | o.Timeout, o.NoGrowSync, o.NoFreelistSync, o.PreLoadFreelist, o.FreelistType, o.ReadOnly, o.MmapFlags, o.InitialMmapSize, o.PageSize, o.NoSync, o.OpenFile, o.Mlock, o.Logger) |
| 1348 | |
| 1349 | } |
| 1350 | |
| 1351 | // DefaultOptions represent the options used if nil options are passed into Open(). |
| 1352 | // No timeout is used which will cause Bolt to wait indefinitely for a lock. |
| 1353 | var DefaultOptions = &Options{ |
| 1354 | Timeout: 0, |
| 1355 | NoGrowSync: false, |
| 1356 | FreelistType: FreelistArrayType, |
| 1357 | } |
| 1358 | |
| 1359 | // Stats represents statistics about the database. |
| 1360 | type Stats struct { |
| 1361 | // Put `TxStats` at the first field to ensure it's 64-bit aligned. Note |
| 1362 | // that the first word in an allocated struct can be relied upon to be |
| 1363 | // 64-bit aligned. Refer to https://pkg.go.dev/sync/atomic#pkg-note-BUG. |
| 1364 | // Also refer to discussion in https://github.com/etcd-io/bbolt/issues/577. |
| 1365 | TxStats TxStats // global, ongoing stats. |
| 1366 | |
| 1367 | // Freelist stats |
| 1368 | FreePageN int // total number of free pages on the freelist |
| 1369 | PendingPageN int // total number of pending pages on the freelist |
| 1370 | FreeAlloc int // total bytes allocated in free pages |
| 1371 | FreelistInuse int // total bytes used by the freelist |
| 1372 | |
| 1373 | // Transaction stats |
| 1374 | TxN int // total number of started read transactions |
| 1375 | OpenTxN int // number of currently open read transactions |
| 1376 | } |
| 1377 | |
| 1378 | // Sub calculates and returns the difference between two sets of database stats. |
| 1379 | // This is useful when obtaining stats at two different points and time and |
| 1380 | // you need the performance counters that occurred within that time span. |
| 1381 | func (s *Stats) Sub(other *Stats) Stats { |
| 1382 | if other == nil { |
| 1383 | return *s |
| 1384 | } |
| 1385 | var diff Stats |
| 1386 | diff.FreePageN = s.FreePageN |
| 1387 | diff.PendingPageN = s.PendingPageN |
| 1388 | diff.FreeAlloc = s.FreeAlloc |
| 1389 | diff.FreelistInuse = s.FreelistInuse |
| 1390 | diff.TxN = s.TxN - other.TxN |
| 1391 | diff.TxStats = s.TxStats.Sub(&other.TxStats) |
| 1392 | return diff |
| 1393 | } |
| 1394 | |
// Info is the value returned by DB.Info: the location of the mapped data and
// the page size in use.
type Info struct {
	// Data is the address of the first byte of the mapped data.
	Data uintptr
	// PageSize is the database page size.
	PageSize int
}