package bbolt

import (
	"bytes"
	"fmt"
	"reflect"
	"sort"
	"unsafe"
)

// node represents an in-memory, deserialized page.
type node struct {
	bucket     *Bucket
	isLeaf     bool
	unbalanced bool
	spilled    bool
	key        []byte
	pgid       pgid
	parent     *node
	children   nodes
	inodes     inodes
}

// root returns the top-level node this node is attached to.
func (n *node) root() *node {
	if n.parent == nil {
		return n
	}
	return n.parent.root()
}

// minKeys returns the minimum number of inodes this node should have.
func (n *node) minKeys() int {
	if n.isLeaf {
		return 1
	}
	return 2
}

// size returns the size of the node after serialization.
func (n *node) size() int {
	sz, elsz := pageHeaderSize, n.pageElementSize()
	for i := 0; i < len(n.inodes); i++ {
		item := &n.inodes[i]
		sz += elsz + uintptr(len(item.key)) + uintptr(len(item.value))
	}
	return int(sz)
}
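
// A serialized node occupies the page header, one fixed-size element
// header per inode, and the raw key/value bytes. For example, a leaf
// holding two inodes with 8-byte keys and 16-byte values needs
// pageHeaderSize + 2*leafPageElementSize + 2*(8+16) bytes.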

// sizeLessThan returns true if the node is less than a given size.
// This is an optimization to avoid calculating a large node when we only need
// to know if it fits inside a certain page size.
func (n *node) sizeLessThan(v uintptr) bool {
	sz, elsz := pageHeaderSize, n.pageElementSize()
	for i := 0; i < len(n.inodes); i++ {
		item := &n.inodes[i]
		sz += elsz + uintptr(len(item.key)) + uintptr(len(item.value))
		if sz >= v {
			return false
		}
	}
	return true
}

// pageElementSize returns the size of each page element based on the type of node.
func (n *node) pageElementSize() uintptr {
	if n.isLeaf {
		return leafPageElementSize
	}
	return branchPageElementSize
}

// childAt returns the child node at a given index.
func (n *node) childAt(index int) *node {
	if n.isLeaf {
		panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
	}
	return n.bucket.node(n.inodes[index].pgid, n)
}

// childIndex returns the index of a given child node.
func (n *node) childIndex(child *node) int {
	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 })
	return index
}
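
// childIndex relies on sort.Search's binary-search contract: it returns
// the first index i at which n.inodes[i].key >= child.key, so an existing
// key yields its exact position and a missing key yields its would-be
// insertion point. put and del below use the same pattern to locate slots.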

// numChildren returns the number of children.
func (n *node) numChildren() int {
	return len(n.inodes)
}

// nextSibling returns the next node with the same parent.
func (n *node) nextSibling() *node {
	if n.parent == nil {
		return nil
	}
	index := n.parent.childIndex(n)
	if index >= n.parent.numChildren()-1 {
		return nil
	}
	return n.parent.childAt(index + 1)
}

// prevSibling returns the previous node with the same parent.
func (n *node) prevSibling() *node {
	if n.parent == nil {
		return nil
	}
	index := n.parent.childIndex(n)
	if index == 0 {
		return nil
	}
	return n.parent.childAt(index - 1)
}

// put inserts a key/value pair. oldKey locates the slot: if it already
// exists, its inode is overwritten with newKey/value; otherwise a new
// inode is inserted at oldKey's sorted position.
func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
	if pgid >= n.bucket.tx.meta.pgid {
		panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
	} else if len(oldKey) <= 0 {
		panic("put: zero-length old key")
	} else if len(newKey) <= 0 {
		panic("put: zero-length new key")
	}

	// Find insertion index.
	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })

	// Add capacity and shift nodes if we don't have an exact match and need to insert.
	exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
	if !exact {
		n.inodes = append(n.inodes, inode{})
		copy(n.inodes[index+1:], n.inodes[index:])
	}

	inode := &n.inodes[index]
	inode.flags = flags
	inode.key = newKey
	inode.value = value
	inode.pgid = pgid
	_assert(len(inode.key) > 0, "put: zero-length inode key")
}
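
// Passing distinct old and new keys renames an entry in place; spill
// below uses exactly this to repoint a parent at a rewritten child's
// first key:
//
//	node.parent.put(oldKey, node.inodes[0].key, nil, node.pgid, 0)
//
// where oldKey stands for whatever key the parent currently stores
// for that child.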

// del removes a key from the node.
func (n *node) del(key []byte) {
	// Find index of key.
	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })

	// Exit if the key isn't found.
	if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) {
		return
	}

	// Delete inode from the node.
	n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)

	// Mark the node as needing rebalancing.
	n.unbalanced = true
}

// read initializes the node from a page.
func (n *node) read(p *page) {
	n.pgid = p.id
	n.isLeaf = ((p.flags & leafPageFlag) != 0)
	n.inodes = make(inodes, int(p.count))

	for i := 0; i < int(p.count); i++ {
		inode := &n.inodes[i]
		if n.isLeaf {
			elem := p.leafPageElement(uint16(i))
			inode.flags = elem.flags
			inode.key = elem.key()
			inode.value = elem.value()
		} else {
			elem := p.branchPageElement(uint16(i))
			inode.pgid = elem.pgid
			inode.key = elem.key()
		}
		_assert(len(inode.key) > 0, "read: zero-length inode key")
	}

	// Save first key so we can find the node in the parent when we spill.
	if len(n.inodes) > 0 {
		n.key = n.inodes[0].key
		_assert(len(n.key) > 0, "read: zero-length node key")
	} else {
		n.key = nil
	}
}
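
// Note that elem.key() and elem.value() return slices that alias the
// underlying mmap'd page rather than copies, which is why dereference()
// below must copy them to the heap before the mmap is remapped.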

// write writes the items onto one or more pages.
func (n *node) write(p *page) {
	// Initialize page.
	if n.isLeaf {
		p.flags |= leafPageFlag
	} else {
		p.flags |= branchPageFlag
	}

	if len(n.inodes) >= 0xFFFF {
		panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
	}
	p.count = uint16(len(n.inodes))

	// Stop here if there are no items to write.
	if p.count == 0 {
		return
	}

	// Loop over each item and write it to the page.
	bp := uintptr(unsafe.Pointer(p)) + unsafe.Sizeof(*p) + n.pageElementSize()*uintptr(len(n.inodes))
	for i, item := range n.inodes {
		_assert(len(item.key) > 0, "write: zero-length inode key")

		// Write the page element.
		if n.isLeaf {
			elem := p.leafPageElement(uint16(i))
			elem.pos = uint32(bp - uintptr(unsafe.Pointer(elem)))
			elem.flags = item.flags
			elem.ksize = uint32(len(item.key))
			elem.vsize = uint32(len(item.value))
		} else {
			elem := p.branchPageElement(uint16(i))
			elem.pos = uint32(bp - uintptr(unsafe.Pointer(elem)))
			elem.ksize = uint32(len(item.key))
			elem.pgid = item.pgid
			_assert(elem.pgid != p.id, "write: circular dependency occurred")
		}

		// Create a slice to write into of needed size and advance
		// byte pointer for next iteration.
		klen, vlen := len(item.key), len(item.value)
		sz := klen + vlen
		b := *(*[]byte)(unsafe.Pointer(&reflect.SliceHeader{
			Data: bp,
			Len:  sz,
			Cap:  sz,
		}))
		bp += uintptr(sz)

		// Write data for the element to the end of the page.
		l := copy(b, item.key)
		copy(b[l:], item.value)
	}

	// DEBUG ONLY: n.dump()
}
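
// The on-page layout produced by write is:
//
//	| page header | element headers (fixed size) | key0 val0 key1 val1 ... |
//
// Each element header records the location of its key/value data as a
// pos offset relative to the header's own address, so readers can find
// the data without absolute pointers.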

// split breaks up a node into multiple smaller nodes, if appropriate.
// This should only be called from the spill() function.
func (n *node) split(pageSize uintptr) []*node {
	var nodes []*node

	node := n
	for {
		// Split node into two.
		a, b := node.splitTwo(pageSize)
		nodes = append(nodes, a)

		// If we can't split then exit the loop.
		if b == nil {
			break
		}

		// Set node to b so it gets split on the next iteration.
		node = b
	}

	return nodes
}

// splitTwo breaks up a node into two smaller nodes, if appropriate.
// This should only be called from the split() function.
func (n *node) splitTwo(pageSize uintptr) (*node, *node) {
	// Ignore the split if the page doesn't have at least enough nodes for
	// two pages or if the nodes can fit in a single page.
	if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
		return n, nil
	}

	// Determine the threshold before starting a new node.
	var fillPercent = n.bucket.FillPercent
	if fillPercent < minFillPercent {
		fillPercent = minFillPercent
	} else if fillPercent > maxFillPercent {
		fillPercent = maxFillPercent
	}
	threshold := int(float64(pageSize) * fillPercent)

	// Determine split position and sizes of the two pages.
	splitIndex, _ := n.splitIndex(threshold)

	// Split node into two separate nodes.
	// If there's no parent then we'll need to create one.
	if n.parent == nil {
		n.parent = &node{bucket: n.bucket, children: []*node{n}}
	}

	// Create a new node and add it to the parent.
	next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
	n.parent.children = append(n.parent.children, next)

	// Split inodes across two nodes.
	next.inodes = n.inodes[splitIndex:]
	n.inodes = n.inodes[:splitIndex]

	// Update the statistics.
	n.bucket.tx.stats.Split++

	return n, next
}
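
// For example, with a 4096-byte page and the default FillPercent of 0.5,
// the threshold is 2048 bytes: the left node keeps inodes until adding
// another would push its serialized size past 2048, and the remainder
// moves to the new right node.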

// splitIndex finds the position where a page will fill a given threshold.
// It returns the index as well as the size of the first page.
// This is only called from split().
func (n *node) splitIndex(threshold int) (index, sz uintptr) {
	sz = pageHeaderSize

	// Loop until we only have the minimum number of keys required for the second page.
	for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
		index = uintptr(i)
		inode := n.inodes[i]
		elsize := n.pageElementSize() + uintptr(len(inode.key)) + uintptr(len(inode.value))

		// If we have at least the minimum number of keys and adding another
		// node would put us over the threshold then exit and return.
		if index >= minKeysPerPage && sz+elsize > uintptr(threshold) {
			break
		}

		// Add the element size to the total size.
		sz += elsize
	}

	return
}

// spill writes the nodes to dirty pages and splits nodes as it goes.
// Returns an error if dirty pages cannot be allocated.
func (n *node) spill() error {
	var tx = n.bucket.tx
	if n.spilled {
		return nil
	}

	// Spill child nodes first. Child nodes can materialize sibling nodes in
	// the case of split-merge so we cannot use a range loop. We have to check
	// the children size on every loop iteration.
	sort.Sort(n.children)
	for i := 0; i < len(n.children); i++ {
		if err := n.children[i].spill(); err != nil {
			return err
		}
	}

	// We no longer need the child list because it's only used for spill tracking.
	n.children = nil

	// Split nodes into appropriate sizes. The first node will always be n.
	var nodes = n.split(uintptr(tx.db.pageSize))
	for _, node := range nodes {
		// Add node's page to the freelist if it's not new.
		if node.pgid > 0 {
			tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
			node.pgid = 0
		}

		// Allocate contiguous space for the node.
		p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize)
		if err != nil {
			return err
		}

		// Write the node.
		if p.id >= tx.meta.pgid {
			panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
		}
		node.pgid = p.id
		node.write(p)
		node.spilled = true

		// Insert into parent inodes.
		if node.parent != nil {
			var key = node.key
			if key == nil {
				key = node.inodes[0].key
			}

			node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
			node.key = node.inodes[0].key
			_assert(len(node.key) > 0, "spill: zero-length node key")
		}

		// Update the statistics.
		tx.stats.Spill++
	}

	// If the root node split and created a new root then we need to spill that
	// as well. We'll clear out the children to make sure it doesn't try to respill.
	if n.parent != nil && n.parent.pgid == 0 {
		n.children = nil
		return n.parent.spill()
	}

	return nil
}
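
// Spilling proceeds bottom-up: children are written before their parent
// so that each parent records final child pgids, and a brand-new root
// created by a split (recognizable by its zero pgid) is spilled last.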

// rebalance attempts to combine the node with sibling nodes if the node fill
// size is below a threshold or if there are not enough keys.
func (n *node) rebalance() {
	if !n.unbalanced {
		return
	}
	n.unbalanced = false

	// Update statistics.
	n.bucket.tx.stats.Rebalance++

	// Ignore if node is above threshold (25%) and has enough keys.
	var threshold = n.bucket.tx.db.pageSize / 4
	if n.size() > threshold && len(n.inodes) > n.minKeys() {
		return
	}

	// Root node has special handling.
	if n.parent == nil {
		// If root node is a branch and only has one node then collapse it.
		if !n.isLeaf && len(n.inodes) == 1 {
			// Move root's child up.
			child := n.bucket.node(n.inodes[0].pgid, n)
			n.isLeaf = child.isLeaf
			n.inodes = child.inodes[:]
			n.children = child.children

			// Reparent all child nodes being moved.
			for _, inode := range n.inodes {
				if child, ok := n.bucket.nodes[inode.pgid]; ok {
					child.parent = n
				}
			}

			// Remove old child.
			child.parent = nil
			delete(n.bucket.nodes, child.pgid)
			child.free()
		}

		return
	}

	// If node has no keys then just remove it.
	if n.numChildren() == 0 {
		n.parent.del(n.key)
		n.parent.removeChild(n)
		delete(n.bucket.nodes, n.pgid)
		n.free()
		n.parent.rebalance()
		return
	}

	_assert(n.parent.numChildren() > 1, "parent must have at least 2 children")

	// Destination node is right sibling if idx == 0, otherwise left sibling.
	var target *node
	var useNextSibling = (n.parent.childIndex(n) == 0)
	if useNextSibling {
		target = n.nextSibling()
	} else {
		target = n.prevSibling()
	}

	// Merge this node with the chosen sibling.
	if useNextSibling {
		// Reparent all child nodes being moved.
		for _, inode := range target.inodes {
			if child, ok := n.bucket.nodes[inode.pgid]; ok {
				child.parent.removeChild(child)
				child.parent = n
				child.parent.children = append(child.parent.children, child)
			}
		}

		// Copy over inodes from target and remove target.
		n.inodes = append(n.inodes, target.inodes...)
		n.parent.del(target.key)
		n.parent.removeChild(target)
		delete(n.bucket.nodes, target.pgid)
		target.free()
	} else {
		// Reparent all child nodes being moved.
		for _, inode := range n.inodes {
			if child, ok := n.bucket.nodes[inode.pgid]; ok {
				child.parent.removeChild(child)
				child.parent = target
				child.parent.children = append(child.parent.children, child)
			}
		}

		// Copy over inodes to target and remove node.
		target.inodes = append(target.inodes, n.inodes...)
		n.parent.del(n.key)
		n.parent.removeChild(n)
		delete(n.bucket.nodes, n.pgid)
		n.free()
	}

	// Either this node or the target node was deleted from the parent so rebalance it.
	n.parent.rebalance()
}
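
// Rebalancing can cascade upward: each merge deletes one entry from the
// parent (which marks it unbalanced), so the parent may in turn fall
// below the 25% fill threshold and be merged on the recursive call.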

// removeChild removes a node from the list of in-memory children.
// This does not affect the inodes.
func (n *node) removeChild(target *node) {
	for i, child := range n.children {
		if child == target {
			n.children = append(n.children[:i], n.children[i+1:]...)
			return
		}
	}
}

// dereference causes the node to copy all its inode key/value references to heap memory.
// This is required when the mmap is reallocated so inodes are not pointing to stale data.
func (n *node) dereference() {
	if n.key != nil {
		key := make([]byte, len(n.key))
		copy(key, n.key)
		n.key = key
		_assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
	}

	for i := range n.inodes {
		inode := &n.inodes[i]

		key := make([]byte, len(inode.key))
		copy(key, inode.key)
		inode.key = key
		_assert(len(inode.key) > 0, "dereference: zero-length inode key")

		value := make([]byte, len(inode.value))
		copy(value, inode.value)
		inode.value = value
	}

	// Recursively dereference children.
	for _, child := range n.children {
		child.dereference()
	}

	// Update statistics.
	n.bucket.tx.stats.NodeDeref++
}

// free adds the node's underlying page to the freelist.
func (n *node) free() {
	if n.pgid != 0 {
		n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid))
		n.pgid = 0
	}
}

// dump writes the contents of the node to STDERR for debugging purposes.
/*
func (n *node) dump() {
	// Write node header.
	var typ = "branch"
	if n.isLeaf {
		typ = "leaf"
	}
	warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))

	// Write out abbreviated version of each item.
	for _, item := range n.inodes {
		if n.isLeaf {
			if item.flags&bucketLeafFlag != 0 {
				bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
				warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
			} else {
				warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
			}
		} else {
			warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
		}
	}
	warn("")
}
*/

// nodes is a sortable list of nodes, ordered by each node's first key;
// spill sorts a node's children with it before writing them out.
type nodes []*node

func (s nodes) Len() int      { return len(s) }
func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s nodes) Less(i, j int) bool {
	return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1
}

// inode represents an internal node inside of a node.
// It can be used to point to elements in a page or point
// to an element which hasn't been added to a page yet.
type inode struct {
	flags uint32
	pgid  pgid
	key   []byte
	value []byte
}

type inodes []inode