blob: 022b1001e279cb44fb91a1707c0fea09788263d5 [file] [log] [blame]
Abhay Kumar40252eb2025-10-13 13:25:53 +00001package bbolt
2
3import (
4 "bytes"
5 "fmt"
6 "sort"
7
8 "go.etcd.io/bbolt/internal/common"
9)
10
11// node represents an in-memory, deserialized page.
12type node struct {
13 bucket *Bucket
14 isLeaf bool
15 unbalanced bool
16 spilled bool
17 key []byte
18 pgid common.Pgid
19 parent *node
20 children nodes
21 inodes common.Inodes
22}
23
24// root returns the top-level node this node is attached to.
25func (n *node) root() *node {
26 if n.parent == nil {
27 return n
28 }
29 return n.parent.root()
30}
31
32// minKeys returns the minimum number of inodes this node should have.
33func (n *node) minKeys() int {
34 if n.isLeaf {
35 return 1
36 }
37 return 2
38}
39
40// size returns the size of the node after serialization.
41func (n *node) size() int {
42 sz, elsz := common.PageHeaderSize, n.pageElementSize()
43 for i := 0; i < len(n.inodes); i++ {
44 item := &n.inodes[i]
45 sz += elsz + uintptr(len(item.Key())) + uintptr(len(item.Value()))
46 }
47 return int(sz)
48}
49
50// sizeLessThan returns true if the node is less than a given size.
51// This is an optimization to avoid calculating a large node when we only need
52// to know if it fits inside a certain page size.
53func (n *node) sizeLessThan(v uintptr) bool {
54 sz, elsz := common.PageHeaderSize, n.pageElementSize()
55 for i := 0; i < len(n.inodes); i++ {
56 item := &n.inodes[i]
57 sz += elsz + uintptr(len(item.Key())) + uintptr(len(item.Value()))
58 if sz >= v {
59 return false
60 }
61 }
62 return true
63}
64
65// pageElementSize returns the size of each page element based on the type of node.
66func (n *node) pageElementSize() uintptr {
67 if n.isLeaf {
68 return common.LeafPageElementSize
69 }
70 return common.BranchPageElementSize
71}
72
73// childAt returns the child node at a given index.
74func (n *node) childAt(index int) *node {
75 if n.isLeaf {
76 panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
77 }
78 return n.bucket.node(n.inodes[index].Pgid(), n)
79}
80
81// childIndex returns the index of a given child node.
82func (n *node) childIndex(child *node) int {
83 index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].Key(), child.key) != -1 })
84 return index
85}
86
87// numChildren returns the number of children.
88func (n *node) numChildren() int {
89 return len(n.inodes)
90}
91
92// nextSibling returns the next node with the same parent.
93func (n *node) nextSibling() *node {
94 if n.parent == nil {
95 return nil
96 }
97 index := n.parent.childIndex(n)
98 if index >= n.parent.numChildren()-1 {
99 return nil
100 }
101 return n.parent.childAt(index + 1)
102}
103
104// prevSibling returns the previous node with the same parent.
105func (n *node) prevSibling() *node {
106 if n.parent == nil {
107 return nil
108 }
109 index := n.parent.childIndex(n)
110 if index == 0 {
111 return nil
112 }
113 return n.parent.childAt(index - 1)
114}
115
116// put inserts a key/value.
117func (n *node) put(oldKey, newKey, value []byte, pgId common.Pgid, flags uint32) {
118 if pgId >= n.bucket.tx.meta.Pgid() {
119 panic(fmt.Sprintf("pgId (%d) above high water mark (%d)", pgId, n.bucket.tx.meta.Pgid()))
120 } else if len(oldKey) <= 0 {
121 panic("put: zero-length old key")
122 } else if len(newKey) <= 0 {
123 panic("put: zero-length new key")
124 }
125
126 // Find insertion index.
127 index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].Key(), oldKey) != -1 })
128
129 // Add capacity and shift nodes if we don't have an exact match and need to insert.
130 exact := len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].Key(), oldKey)
131 if !exact {
132 n.inodes = append(n.inodes, common.Inode{})
133 copy(n.inodes[index+1:], n.inodes[index:])
134 }
135
136 inode := &n.inodes[index]
137 inode.SetFlags(flags)
138 inode.SetKey(newKey)
139 inode.SetValue(value)
140 inode.SetPgid(pgId)
141 common.Assert(len(inode.Key()) > 0, "put: zero-length inode key")
142}
143
144// del removes a key from the node.
145func (n *node) del(key []byte) {
146 // Find index of key.
147 index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].Key(), key) != -1 })
148
149 // Exit if the key isn't found.
150 if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].Key(), key) {
151 return
152 }
153
154 // Delete inode from the node.
155 n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)
156
157 // Mark the node as needing rebalancing.
158 n.unbalanced = true
159}
160
161// read initializes the node from a page.
162func (n *node) read(p *common.Page) {
163 n.pgid = p.Id()
164 n.isLeaf = p.IsLeafPage()
165 n.inodes = common.ReadInodeFromPage(p)
166
167 // Save first key, so we can find the node in the parent when we spill.
168 if len(n.inodes) > 0 {
169 n.key = n.inodes[0].Key()
170 common.Assert(len(n.key) > 0, "read: zero-length node key")
171 } else {
172 n.key = nil
173 }
174}
175
176// write writes the items onto one or more pages.
177// The page should have p.id (might be 0 for meta or bucket-inline page) and p.overflow set
178// and the rest should be zeroed.
179func (n *node) write(p *common.Page) {
180 common.Assert(p.Count() == 0 && p.Flags() == 0, "node cannot be written into a not empty page")
181
182 // Initialize page.
183 if n.isLeaf {
184 p.SetFlags(common.LeafPageFlag)
185 } else {
186 p.SetFlags(common.BranchPageFlag)
187 }
188
189 if len(n.inodes) >= 0xFFFF {
190 panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.Id()))
191 }
192 p.SetCount(uint16(len(n.inodes)))
193
194 // Stop here if there are no items to write.
195 if p.Count() == 0 {
196 return
197 }
198
199 common.WriteInodeToPage(n.inodes, p)
200
201 // DEBUG ONLY: n.dump()
202}
203
204// split breaks up a node into multiple smaller nodes, if appropriate.
205// This should only be called from the spill() function.
206func (n *node) split(pageSize uintptr) []*node {
207 var nodes []*node
208
209 node := n
210 for {
211 // Split node into two.
212 a, b := node.splitTwo(pageSize)
213 nodes = append(nodes, a)
214
215 // If we can't split then exit the loop.
216 if b == nil {
217 break
218 }
219
220 // Set node to b so it gets split on the next iteration.
221 node = b
222 }
223
224 return nodes
225}
226
227// splitTwo breaks up a node into two smaller nodes, if appropriate.
228// This should only be called from the split() function.
229func (n *node) splitTwo(pageSize uintptr) (*node, *node) {
230 // Ignore the split if the page doesn't have at least enough nodes for
231 // two pages or if the nodes can fit in a single page.
232 if len(n.inodes) <= (common.MinKeysPerPage*2) || n.sizeLessThan(pageSize) {
233 return n, nil
234 }
235
236 // Determine the threshold before starting a new node.
237 var fillPercent = n.bucket.FillPercent
238 if fillPercent < minFillPercent {
239 fillPercent = minFillPercent
240 } else if fillPercent > maxFillPercent {
241 fillPercent = maxFillPercent
242 }
243 threshold := int(float64(pageSize) * fillPercent)
244
245 // Determine split position and sizes of the two pages.
246 splitIndex, _ := n.splitIndex(threshold)
247
248 // Split node into two separate nodes.
249 // If there's no parent then we'll need to create one.
250 if n.parent == nil {
251 n.parent = &node{bucket: n.bucket, children: []*node{n}}
252 }
253
254 // Create a new node and add it to the parent.
255 next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
256 n.parent.children = append(n.parent.children, next)
257
258 // Split inodes across two nodes.
259 next.inodes = n.inodes[splitIndex:]
260 n.inodes = n.inodes[:splitIndex]
261
262 // Update the statistics.
263 n.bucket.tx.stats.IncSplit(1)
264
265 return n, next
266}
267
268// splitIndex finds the position where a page will fill a given threshold.
269// It returns the index as well as the size of the first page.
270// This is only be called from split().
271func (n *node) splitIndex(threshold int) (index, sz uintptr) {
272 sz = common.PageHeaderSize
273
274 // Loop until we only have the minimum number of keys required for the second page.
275 for i := 0; i < len(n.inodes)-common.MinKeysPerPage; i++ {
276 index = uintptr(i)
277 inode := n.inodes[i]
278 elsize := n.pageElementSize() + uintptr(len(inode.Key())) + uintptr(len(inode.Value()))
279
280 // If we have at least the minimum number of keys and adding another
281 // node would put us over the threshold then exit and return.
282 if index >= common.MinKeysPerPage && sz+elsize > uintptr(threshold) {
283 break
284 }
285
286 // Add the element size to the total size.
287 sz += elsize
288 }
289
290 return
291}
292
293// spill writes the nodes to dirty pages and splits nodes as it goes.
294// Returns an error if dirty pages cannot be allocated.
295func (n *node) spill() error {
296 var tx = n.bucket.tx
297 if n.spilled {
298 return nil
299 }
300
301 // Spill child nodes first. Child nodes can materialize sibling nodes in
302 // the case of split-merge so we cannot use a range loop. We have to check
303 // the children size on every loop iteration.
304 sort.Sort(n.children)
305 for i := 0; i < len(n.children); i++ {
306 if err := n.children[i].spill(); err != nil {
307 return err
308 }
309 }
310
311 // We no longer need the child list because it's only used for spill tracking.
312 n.children = nil
313
314 // Split nodes into appropriate sizes. The first node will always be n.
315 var nodes = n.split(uintptr(tx.db.pageSize))
316 for _, node := range nodes {
317 // Add node's page to the freelist if it's not new.
318 if node.pgid > 0 {
319 tx.db.freelist.Free(tx.meta.Txid(), tx.page(node.pgid))
320 node.pgid = 0
321 }
322
323 // Allocate contiguous space for the node.
324 p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize)
325 if err != nil {
326 return err
327 }
328
329 // Write the node.
330 if p.Id() >= tx.meta.Pgid() {
331 panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.Id(), tx.meta.Pgid()))
332 }
333 node.pgid = p.Id()
334 node.write(p)
335 node.spilled = true
336
337 // Insert into parent inodes.
338 if node.parent != nil {
339 var key = node.key
340 if key == nil {
341 key = node.inodes[0].Key()
342 }
343
344 node.parent.put(key, node.inodes[0].Key(), nil, node.pgid, 0)
345 node.key = node.inodes[0].Key()
346 common.Assert(len(node.key) > 0, "spill: zero-length node key")
347 }
348
349 // Update the statistics.
350 tx.stats.IncSpill(1)
351 }
352
353 // If the root node split and created a new root then we need to spill that
354 // as well. We'll clear out the children to make sure it doesn't try to respill.
355 if n.parent != nil && n.parent.pgid == 0 {
356 n.children = nil
357 return n.parent.spill()
358 }
359
360 return nil
361}
362
363// rebalance attempts to combine the node with sibling nodes if the node fill
364// size is below a threshold or if there are not enough keys.
365func (n *node) rebalance() {
366 if !n.unbalanced {
367 return
368 }
369 n.unbalanced = false
370
371 // Update statistics.
372 n.bucket.tx.stats.IncRebalance(1)
373
374 // Ignore if node is above threshold (25% when FillPercent is set to DefaultFillPercent) and has enough keys.
375 var threshold = int(float64(n.bucket.tx.db.pageSize)*n.bucket.FillPercent) / 2
376 if n.size() > threshold && len(n.inodes) > n.minKeys() {
377 return
378 }
379
380 // Root node has special handling.
381 if n.parent == nil {
382 // If root node is a branch and only has one node then collapse it.
383 if !n.isLeaf && len(n.inodes) == 1 {
384 // Move root's child up.
385 child := n.bucket.node(n.inodes[0].Pgid(), n)
386 n.isLeaf = child.isLeaf
387 n.inodes = child.inodes[:]
388 n.children = child.children
389
390 // Reparent all child nodes being moved.
391 for _, inode := range n.inodes {
392 if child, ok := n.bucket.nodes[inode.Pgid()]; ok {
393 child.parent = n
394 }
395 }
396
397 // Remove old child.
398 child.parent = nil
399 delete(n.bucket.nodes, child.pgid)
400 child.free()
401 }
402
403 return
404 }
405
406 // If node has no keys then just remove it.
407 if n.numChildren() == 0 {
408 n.parent.del(n.key)
409 n.parent.removeChild(n)
410 delete(n.bucket.nodes, n.pgid)
411 n.free()
412 n.parent.rebalance()
413 return
414 }
415
416 common.Assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
417
418 // Merge with right sibling if idx == 0, otherwise left sibling.
419 var leftNode, rightNode *node
420 var useNextSibling = n.parent.childIndex(n) == 0
421 if useNextSibling {
422 leftNode = n
423 rightNode = n.nextSibling()
424 } else {
425 leftNode = n.prevSibling()
426 rightNode = n
427 }
428
429 // If both nodes are too small then merge them.
430 // Reparent all child nodes being moved.
431 for _, inode := range rightNode.inodes {
432 if child, ok := n.bucket.nodes[inode.Pgid()]; ok {
433 child.parent.removeChild(child)
434 child.parent = leftNode
435 child.parent.children = append(child.parent.children, child)
436 }
437 }
438
439 // Copy over inodes from right node to left node and remove right node.
440 leftNode.inodes = append(leftNode.inodes, rightNode.inodes...)
441 n.parent.del(rightNode.key)
442 n.parent.removeChild(rightNode)
443 delete(n.bucket.nodes, rightNode.pgid)
444 rightNode.free()
445
446 // Either this node or the sibling node was deleted from the parent so rebalance it.
447 n.parent.rebalance()
448}
449
450// removes a node from the list of in-memory children.
451// This does not affect the inodes.
452func (n *node) removeChild(target *node) {
453 for i, child := range n.children {
454 if child == target {
455 n.children = append(n.children[:i], n.children[i+1:]...)
456 return
457 }
458 }
459}
460
461// dereference causes the node to copy all its inode key/value references to heap memory.
462// This is required when the mmap is reallocated so inodes are not pointing to stale data.
463func (n *node) dereference() {
464 if n.key != nil {
465 key := make([]byte, len(n.key))
466 copy(key, n.key)
467 n.key = key
468 common.Assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
469 }
470
471 for i := range n.inodes {
472 inode := &n.inodes[i]
473
474 key := make([]byte, len(inode.Key()))
475 copy(key, inode.Key())
476 inode.SetKey(key)
477 common.Assert(len(inode.Key()) > 0, "dereference: zero-length inode key")
478
479 value := make([]byte, len(inode.Value()))
480 copy(value, inode.Value())
481 inode.SetValue(value)
482 }
483
484 // Recursively dereference children.
485 for _, child := range n.children {
486 child.dereference()
487 }
488
489 // Update statistics.
490 n.bucket.tx.stats.IncNodeDeref(1)
491}
492
493// free adds the node's underlying page to the freelist.
494func (n *node) free() {
495 if n.pgid != 0 {
496 n.bucket.tx.db.freelist.Free(n.bucket.tx.meta.Txid(), n.bucket.tx.page(n.pgid))
497 n.pgid = 0
498 }
499}
500
501// dump writes the contents of the node to STDERR for debugging purposes.
502/*
503func (n *node) dump() {
504 // Write node header.
505 var typ = "branch"
506 if n.isLeaf {
507 typ = "leaf"
508 }
509 warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))
510
511 // Write out abbreviated version of each item.
512 for _, item := range n.inodes {
513 if n.isLeaf {
514 if item.flags&bucketLeafFlag != 0 {
515 bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
516 warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
517 } else {
518 warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
519 }
520 } else {
521 warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
522 }
523 }
524 warn("")
525}
526*/
527
528func compareKeys(left, right []byte) int {
529 return bytes.Compare(left, right)
530}
531
532type nodes []*node
533
534func (s nodes) Len() int { return len(s) }
535func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
536func (s nodes) Less(i, j int) bool {
537 return bytes.Compare(s[i].inodes[0].Key(), s[j].inodes[0].Key()) == -1
538}