Blame - vendor/go.etcd.io/bbolt/node.go - voltha-lib-go

blob: 022b1001e279cb44fb91a1707c0fea09788263d5 [file] [log] [blame]

Abhay Kumar	40252eb	2025-10-13 13:25:53 +0000	[diff] [blame^]	1	package bbolt
				2
				3	import (
				4	"bytes"
				5	"fmt"
				6	"sort"
				7
				8	"go.etcd.io/bbolt/internal/common"
				9	)
				10
				11	// node represents an in-memory, deserialized page.
				12	type node struct {
				13	bucket *Bucket
				14	isLeaf bool
				15	unbalanced bool
				16	spilled bool
				17	key []byte
				18	pgid common.Pgid
				19	parent *node
				20	children nodes
				21	inodes common.Inodes
				22	}
				23
				24	// root returns the top-level node this node is attached to.
				25	func (n node) root() node {
				26	if n.parent == nil {
				27	return n
				28	}
				29	return n.parent.root()
				30	}
				31
				32	// minKeys returns the minimum number of inodes this node should have.
				33	func (n *node) minKeys() int {
				34	if n.isLeaf {
				35	return 1
				36	}
				37	return 2
				38	}
				39
				40	// size returns the size of the node after serialization.
				41	func (n *node) size() int {
				42	sz, elsz := common.PageHeaderSize, n.pageElementSize()
				43	for i := 0; i < len(n.inodes); i++ {
				44	item := &n.inodes[i]
				45	sz += elsz + uintptr(len(item.Key())) + uintptr(len(item.Value()))
				46	}
				47	return int(sz)
				48	}
				49
				50	// sizeLessThan returns true if the node is less than a given size.
				51	// This is an optimization to avoid calculating a large node when we only need
				52	// to know if it fits inside a certain page size.
				53	func (n *node) sizeLessThan(v uintptr) bool {
				54	sz, elsz := common.PageHeaderSize, n.pageElementSize()
				55	for i := 0; i < len(n.inodes); i++ {
				56	item := &n.inodes[i]
				57	sz += elsz + uintptr(len(item.Key())) + uintptr(len(item.Value()))
				58	if sz >= v {
				59	return false
				60	}
				61	}
				62	return true
				63	}
				64
				65	// pageElementSize returns the size of each page element based on the type of node.
				66	func (n *node) pageElementSize() uintptr {
				67	if n.isLeaf {
				68	return common.LeafPageElementSize
				69	}
				70	return common.BranchPageElementSize
				71	}
				72
				73	// childAt returns the child node at a given index.
				74	func (n node) childAt(index int) node {
				75	if n.isLeaf {
				76	panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
				77	}
				78	return n.bucket.node(n.inodes[index].Pgid(), n)
				79	}
				80
				81	// childIndex returns the index of a given child node.
				82	func (n node) childIndex(child node) int {
				83	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].Key(), child.key) != -1 })
				84	return index
				85	}
				86
				87	// numChildren returns the number of children.
				88	func (n *node) numChildren() int {
				89	return len(n.inodes)
				90	}
				91
				92	// nextSibling returns the next node with the same parent.
				93	func (n node) nextSibling() node {
				94	if n.parent == nil {
				95	return nil
				96	}
				97	index := n.parent.childIndex(n)
				98	if index >= n.parent.numChildren()-1 {
				99	return nil
				100	}
				101	return n.parent.childAt(index + 1)
				102	}
				103
				104	// prevSibling returns the previous node with the same parent.
				105	func (n node) prevSibling() node {
				106	if n.parent == nil {
				107	return nil
				108	}
				109	index := n.parent.childIndex(n)
				110	if index == 0 {
				111	return nil
				112	}
				113	return n.parent.childAt(index - 1)
				114	}
				115
				116	// put inserts a key/value.
				117	func (n *node) put(oldKey, newKey, value []byte, pgId common.Pgid, flags uint32) {
				118	if pgId >= n.bucket.tx.meta.Pgid() {
				119	panic(fmt.Sprintf("pgId (%d) above high water mark (%d)", pgId, n.bucket.tx.meta.Pgid()))
				120	} else if len(oldKey) <= 0 {
				121	panic("put: zero-length old key")
				122	} else if len(newKey) <= 0 {
				123	panic("put: zero-length new key")
				124	}
				125
				126	// Find insertion index.
				127	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].Key(), oldKey) != -1 })
				128
				129	// Add capacity and shift nodes if we don't have an exact match and need to insert.
				130	exact := len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].Key(), oldKey)
				131	if !exact {
				132	n.inodes = append(n.inodes, common.Inode{})
				133	copy(n.inodes[index+1:], n.inodes[index:])
				134	}
				135
				136	inode := &n.inodes[index]
				137	inode.SetFlags(flags)
				138	inode.SetKey(newKey)
				139	inode.SetValue(value)
				140	inode.SetPgid(pgId)
				141	common.Assert(len(inode.Key()) > 0, "put: zero-length inode key")
				142	}
				143
				144	// del removes a key from the node.
				145	func (n *node) del(key []byte) {
				146	// Find index of key.
				147	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].Key(), key) != -1 })
				148
				149	// Exit if the key isn't found.
				150	if index >= len(n.inodes) \|\| !bytes.Equal(n.inodes[index].Key(), key) {
				151	return
				152	}
				153
				154	// Delete inode from the node.
				155	n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)
				156
				157	// Mark the node as needing rebalancing.
				158	n.unbalanced = true
				159	}
				160
				161	// read initializes the node from a page.
				162	func (n node) read(p common.Page) {
				163	n.pgid = p.Id()
				164	n.isLeaf = p.IsLeafPage()
				165	n.inodes = common.ReadInodeFromPage(p)
				166
				167	// Save first key, so we can find the node in the parent when we spill.
				168	if len(n.inodes) > 0 {
				169	n.key = n.inodes[0].Key()
				170	common.Assert(len(n.key) > 0, "read: zero-length node key")
				171	} else {
				172	n.key = nil
				173	}
				174	}
				175
				176	// write writes the items onto one or more pages.
				177	// The page should have p.id (might be 0 for meta or bucket-inline page) and p.overflow set
				178	// and the rest should be zeroed.
				179	func (n node) write(p common.Page) {
				180	common.Assert(p.Count() == 0 && p.Flags() == 0, "node cannot be written into a not empty page")
				181
				182	// Initialize page.
				183	if n.isLeaf {
				184	p.SetFlags(common.LeafPageFlag)
				185	} else {
				186	p.SetFlags(common.BranchPageFlag)
				187	}
				188
				189	if len(n.inodes) >= 0xFFFF {
				190	panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.Id()))
				191	}
				192	p.SetCount(uint16(len(n.inodes)))
				193
				194	// Stop here if there are no items to write.
				195	if p.Count() == 0 {
				196	return
				197	}
				198
				199	common.WriteInodeToPage(n.inodes, p)
				200
				201	// DEBUG ONLY: n.dump()
				202	}
				203
				204	// split breaks up a node into multiple smaller nodes, if appropriate.
				205	// This should only be called from the spill() function.
				206	func (n node) split(pageSize uintptr) []node {
				207	var nodes []*node
				208
				209	node := n
				210	for {
				211	// Split node into two.
				212	a, b := node.splitTwo(pageSize)
				213	nodes = append(nodes, a)
				214
				215	// If we can't split then exit the loop.
				216	if b == nil {
				217	break
				218	}
				219
				220	// Set node to b so it gets split on the next iteration.
				221	node = b
				222	}
				223
				224	return nodes
				225	}
				226
				227	// splitTwo breaks up a node into two smaller nodes, if appropriate.
				228	// This should only be called from the split() function.
				229	func (n node) splitTwo(pageSize uintptr) (node, *node) {
				230	// Ignore the split if the page doesn't have at least enough nodes for
				231	// two pages or if the nodes can fit in a single page.
				232	if len(n.inodes) <= (common.MinKeysPerPage*2) \|\| n.sizeLessThan(pageSize) {
				233	return n, nil
				234	}
				235
				236	// Determine the threshold before starting a new node.
				237	var fillPercent = n.bucket.FillPercent
				238	if fillPercent < minFillPercent {
				239	fillPercent = minFillPercent
				240	} else if fillPercent > maxFillPercent {
				241	fillPercent = maxFillPercent
				242	}
				243	threshold := int(float64(pageSize) * fillPercent)
				244
				245	// Determine split position and sizes of the two pages.
				246	splitIndex, _ := n.splitIndex(threshold)
				247
				248	// Split node into two separate nodes.
				249	// If there's no parent then we'll need to create one.
				250	if n.parent == nil {
				251	n.parent = &node{bucket: n.bucket, children: []*node{n}}
				252	}
				253
				254	// Create a new node and add it to the parent.
				255	next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
				256	n.parent.children = append(n.parent.children, next)
				257
				258	// Split inodes across two nodes.
				259	next.inodes = n.inodes[splitIndex:]
				260	n.inodes = n.inodes[:splitIndex]
				261
				262	// Update the statistics.
				263	n.bucket.tx.stats.IncSplit(1)
				264
				265	return n, next
				266	}
				267
				268	// splitIndex finds the position where a page will fill a given threshold.
				269	// It returns the index as well as the size of the first page.
				270	// This is only be called from split().
				271	func (n *node) splitIndex(threshold int) (index, sz uintptr) {
				272	sz = common.PageHeaderSize
				273
				274	// Loop until we only have the minimum number of keys required for the second page.
				275	for i := 0; i < len(n.inodes)-common.MinKeysPerPage; i++ {
				276	index = uintptr(i)
				277	inode := n.inodes[i]
				278	elsize := n.pageElementSize() + uintptr(len(inode.Key())) + uintptr(len(inode.Value()))
				279
				280	// If we have at least the minimum number of keys and adding another
				281	// node would put us over the threshold then exit and return.
				282	if index >= common.MinKeysPerPage && sz+elsize > uintptr(threshold) {
				283	break
				284	}
				285
				286	// Add the element size to the total size.
				287	sz += elsize
				288	}
				289
				290	return
				291	}
				292
				293	// spill writes the nodes to dirty pages and splits nodes as it goes.
				294	// Returns an error if dirty pages cannot be allocated.
				295	func (n *node) spill() error {
				296	var tx = n.bucket.tx
				297	if n.spilled {
				298	return nil
				299	}
				300
				301	// Spill child nodes first. Child nodes can materialize sibling nodes in
				302	// the case of split-merge so we cannot use a range loop. We have to check
				303	// the children size on every loop iteration.
				304	sort.Sort(n.children)
				305	for i := 0; i < len(n.children); i++ {
				306	if err := n.children[i].spill(); err != nil {
				307	return err
				308	}
				309	}
				310
				311	// We no longer need the child list because it's only used for spill tracking.
				312	n.children = nil
				313
				314	// Split nodes into appropriate sizes. The first node will always be n.
				315	var nodes = n.split(uintptr(tx.db.pageSize))
				316	for _, node := range nodes {
				317	// Add node's page to the freelist if it's not new.
				318	if node.pgid > 0 {
				319	tx.db.freelist.Free(tx.meta.Txid(), tx.page(node.pgid))
				320	node.pgid = 0
				321	}
				322
				323	// Allocate contiguous space for the node.
				324	p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize)
				325	if err != nil {
				326	return err
				327	}
				328
				329	// Write the node.
				330	if p.Id() >= tx.meta.Pgid() {
				331	panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.Id(), tx.meta.Pgid()))
				332	}
				333	node.pgid = p.Id()
				334	node.write(p)
				335	node.spilled = true
				336
				337	// Insert into parent inodes.
				338	if node.parent != nil {
				339	var key = node.key
				340	if key == nil {
				341	key = node.inodes[0].Key()
				342	}
				343
				344	node.parent.put(key, node.inodes[0].Key(), nil, node.pgid, 0)
				345	node.key = node.inodes[0].Key()
				346	common.Assert(len(node.key) > 0, "spill: zero-length node key")
				347	}
				348
				349	// Update the statistics.
				350	tx.stats.IncSpill(1)
				351	}
				352
				353	// If the root node split and created a new root then we need to spill that
				354	// as well. We'll clear out the children to make sure it doesn't try to respill.
				355	if n.parent != nil && n.parent.pgid == 0 {
				356	n.children = nil
				357	return n.parent.spill()
				358	}
				359
				360	return nil
				361	}
				362
				363	// rebalance attempts to combine the node with sibling nodes if the node fill
				364	// size is below a threshold or if there are not enough keys.
				365	func (n *node) rebalance() {
				366	if !n.unbalanced {
				367	return
				368	}
				369	n.unbalanced = false
				370
				371	// Update statistics.
				372	n.bucket.tx.stats.IncRebalance(1)
				373
				374	// Ignore if node is above threshold (25% when FillPercent is set to DefaultFillPercent) and has enough keys.
				375	var threshold = int(float64(n.bucket.tx.db.pageSize)*n.bucket.FillPercent) / 2
				376	if n.size() > threshold && len(n.inodes) > n.minKeys() {
				377	return
				378	}
				379
				380	// Root node has special handling.
				381	if n.parent == nil {
				382	// If root node is a branch and only has one node then collapse it.
				383	if !n.isLeaf && len(n.inodes) == 1 {
				384	// Move root's child up.
				385	child := n.bucket.node(n.inodes[0].Pgid(), n)
				386	n.isLeaf = child.isLeaf
				387	n.inodes = child.inodes[:]
				388	n.children = child.children
				389
				390	// Reparent all child nodes being moved.
				391	for _, inode := range n.inodes {
				392	if child, ok := n.bucket.nodes[inode.Pgid()]; ok {
				393	child.parent = n
				394	}
				395	}
				396
				397	// Remove old child.
				398	child.parent = nil
				399	delete(n.bucket.nodes, child.pgid)
				400	child.free()
				401	}
				402
				403	return
				404	}
				405
				406	// If node has no keys then just remove it.
				407	if n.numChildren() == 0 {
				408	n.parent.del(n.key)
				409	n.parent.removeChild(n)
				410	delete(n.bucket.nodes, n.pgid)
				411	n.free()
				412	n.parent.rebalance()
				413	return
				414	}
				415
				416	common.Assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
				417
				418	// Merge with right sibling if idx == 0, otherwise left sibling.
				419	var leftNode, rightNode *node
				420	var useNextSibling = n.parent.childIndex(n) == 0
				421	if useNextSibling {
				422	leftNode = n
				423	rightNode = n.nextSibling()
				424	} else {
				425	leftNode = n.prevSibling()
				426	rightNode = n
				427	}
				428
				429	// If both nodes are too small then merge them.
				430	// Reparent all child nodes being moved.
				431	for _, inode := range rightNode.inodes {
				432	if child, ok := n.bucket.nodes[inode.Pgid()]; ok {
				433	child.parent.removeChild(child)
				434	child.parent = leftNode
				435	child.parent.children = append(child.parent.children, child)
				436	}
				437	}
				438
				439	// Copy over inodes from right node to left node and remove right node.
				440	leftNode.inodes = append(leftNode.inodes, rightNode.inodes...)
				441	n.parent.del(rightNode.key)
				442	n.parent.removeChild(rightNode)
				443	delete(n.bucket.nodes, rightNode.pgid)
				444	rightNode.free()
				445
				446	// Either this node or the sibling node was deleted from the parent so rebalance it.
				447	n.parent.rebalance()
				448	}
				449
				450	// removes a node from the list of in-memory children.
				451	// This does not affect the inodes.
				452	func (n node) removeChild(target node) {
				453	for i, child := range n.children {
				454	if child == target {
				455	n.children = append(n.children[:i], n.children[i+1:]...)
				456	return
				457	}
				458	}
				459	}
				460
				461	// dereference causes the node to copy all its inode key/value references to heap memory.
				462	// This is required when the mmap is reallocated so inodes are not pointing to stale data.
				463	func (n *node) dereference() {
				464	if n.key != nil {
				465	key := make([]byte, len(n.key))
				466	copy(key, n.key)
				467	n.key = key
				468	common.Assert(n.pgid == 0 \|\| len(n.key) > 0, "dereference: zero-length node key on existing node")
				469	}
				470
				471	for i := range n.inodes {
				472	inode := &n.inodes[i]
				473
				474	key := make([]byte, len(inode.Key()))
				475	copy(key, inode.Key())
				476	inode.SetKey(key)
				477	common.Assert(len(inode.Key()) > 0, "dereference: zero-length inode key")
				478
				479	value := make([]byte, len(inode.Value()))
				480	copy(value, inode.Value())
				481	inode.SetValue(value)
				482	}
				483
				484	// Recursively dereference children.
				485	for _, child := range n.children {
				486	child.dereference()
				487	}
				488
				489	// Update statistics.
				490	n.bucket.tx.stats.IncNodeDeref(1)
				491	}
				492
				493	// free adds the node's underlying page to the freelist.
				494	func (n *node) free() {
				495	if n.pgid != 0 {
				496	n.bucket.tx.db.freelist.Free(n.bucket.tx.meta.Txid(), n.bucket.tx.page(n.pgid))
				497	n.pgid = 0
				498	}
				499	}
				500
				501	// dump writes the contents of the node to STDERR for debugging purposes.
				502	/*
				503	func (n *node) dump() {
				504	// Write node header.
				505	var typ = "branch"
				506	if n.isLeaf {
				507	typ = "leaf"
				508	}
				509	warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))
				510
				511	// Write out abbreviated version of each item.
				512	for _, item := range n.inodes {
				513	if n.isLeaf {
				514	if item.flags&bucketLeafFlag != 0 {
				515	bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
				516	warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
				517	} else {
				518	warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
				519	}
				520	} else {
				521	warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
				522	}
				523	}
				524	warn("")
				525	}
				526	*/
				527
				528	func compareKeys(left, right []byte) int {
				529	return bytes.Compare(left, right)
				530	}
				531
				532	type nodes []*node
				533
				534	func (s nodes) Len() int { return len(s) }
				535	func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
				536	func (s nodes) Less(i, j int) bool {
				537	return bytes.Compare(s[i].inodes[0].Key(), s[j].inodes[0].Key()) == -1
				538	}