Blame - vendor/go.etcd.io/bbolt/node.go - voltha-go

blob: 6c3fa553ea68d762f9392c825775856168c28429 [file] [log] [blame]

khenaidoo	ab1f7bd	2019-11-14 14:00:27 -0500	[diff] [blame]	1	package bbolt
				2
				3	import (
				4	"bytes"
				5	"fmt"
				6	"sort"
				7	"unsafe"
				8	)
				9
				10	// node represents an in-memory, deserialized page.
				11	type node struct {
				12	bucket *Bucket
				13	isLeaf bool
				14	unbalanced bool
				15	spilled bool
				16	key []byte
				17	pgid pgid
				18	parent *node
				19	children nodes
				20	inodes inodes
				21	}
				22
				23	// root returns the top-level node this node is attached to.
				24	func (n node) root() node {
				25	if n.parent == nil {
				26	return n
				27	}
				28	return n.parent.root()
				29	}
				30
				31	// minKeys returns the minimum number of inodes this node should have.
				32	func (n *node) minKeys() int {
				33	if n.isLeaf {
				34	return 1
				35	}
				36	return 2
				37	}
				38
				39	// size returns the size of the node after serialization.
				40	func (n *node) size() int {
				41	sz, elsz := pageHeaderSize, n.pageElementSize()
				42	for i := 0; i < len(n.inodes); i++ {
				43	item := &n.inodes[i]
				44	sz += elsz + len(item.key) + len(item.value)
				45	}
				46	return sz
				47	}
				48
				49	// sizeLessThan returns true if the node is less than a given size.
				50	// This is an optimization to avoid calculating a large node when we only need
				51	// to know if it fits inside a certain page size.
				52	func (n *node) sizeLessThan(v int) bool {
				53	sz, elsz := pageHeaderSize, n.pageElementSize()
				54	for i := 0; i < len(n.inodes); i++ {
				55	item := &n.inodes[i]
				56	sz += elsz + len(item.key) + len(item.value)
				57	if sz >= v {
				58	return false
				59	}
				60	}
				61	return true
				62	}
				63
				64	// pageElementSize returns the size of each page element based on the type of node.
				65	func (n *node) pageElementSize() int {
				66	if n.isLeaf {
				67	return leafPageElementSize
				68	}
				69	return branchPageElementSize
				70	}
				71
				72	// childAt returns the child node at a given index.
				73	func (n node) childAt(index int) node {
				74	if n.isLeaf {
				75	panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
				76	}
				77	return n.bucket.node(n.inodes[index].pgid, n)
				78	}
				79
				80	// childIndex returns the index of a given child node.
				81	func (n node) childIndex(child node) int {
				82	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 })
				83	return index
				84	}
				85
				86	// numChildren returns the number of children.
				87	func (n *node) numChildren() int {
				88	return len(n.inodes)
				89	}
				90
				91	// nextSibling returns the next node with the same parent.
				92	func (n node) nextSibling() node {
				93	if n.parent == nil {
				94	return nil
				95	}
				96	index := n.parent.childIndex(n)
				97	if index >= n.parent.numChildren()-1 {
				98	return nil
				99	}
				100	return n.parent.childAt(index + 1)
				101	}
				102
				103	// prevSibling returns the previous node with the same parent.
				104	func (n node) prevSibling() node {
				105	if n.parent == nil {
				106	return nil
				107	}
				108	index := n.parent.childIndex(n)
				109	if index == 0 {
				110	return nil
				111	}
				112	return n.parent.childAt(index - 1)
				113	}
				114
				115	// put inserts a key/value.
				116	func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
				117	if pgid >= n.bucket.tx.meta.pgid {
				118	panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
				119	} else if len(oldKey) <= 0 {
				120	panic("put: zero-length old key")
				121	} else if len(newKey) <= 0 {
				122	panic("put: zero-length new key")
				123	}
				124
				125	// Find insertion index.
				126	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })
				127
				128	// Add capacity and shift nodes if we don't have an exact match and need to insert.
				129	exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
				130	if !exact {
				131	n.inodes = append(n.inodes, inode{})
				132	copy(n.inodes[index+1:], n.inodes[index:])
				133	}
				134
				135	inode := &n.inodes[index]
				136	inode.flags = flags
				137	inode.key = newKey
				138	inode.value = value
				139	inode.pgid = pgid
				140	_assert(len(inode.key) > 0, "put: zero-length inode key")
				141	}
				142
				143	// del removes a key from the node.
				144	func (n *node) del(key []byte) {
				145	// Find index of key.
				146	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })
				147
				148	// Exit if the key isn't found.
				149	if index >= len(n.inodes) \|\| !bytes.Equal(n.inodes[index].key, key) {
				150	return
				151	}
				152
				153	// Delete inode from the node.
				154	n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)
				155
				156	// Mark the node as needing rebalancing.
				157	n.unbalanced = true
				158	}
				159
				160	// read initializes the node from a page.
				161	func (n node) read(p page) {
				162	n.pgid = p.id
				163	n.isLeaf = ((p.flags & leafPageFlag) != 0)
				164	n.inodes = make(inodes, int(p.count))
				165
				166	for i := 0; i < int(p.count); i++ {
				167	inode := &n.inodes[i]
				168	if n.isLeaf {
				169	elem := p.leafPageElement(uint16(i))
				170	inode.flags = elem.flags
				171	inode.key = elem.key()
				172	inode.value = elem.value()
				173	} else {
				174	elem := p.branchPageElement(uint16(i))
				175	inode.pgid = elem.pgid
				176	inode.key = elem.key()
				177	}
				178	_assert(len(inode.key) > 0, "read: zero-length inode key")
				179	}
				180
				181	// Save first key so we can find the node in the parent when we spill.
				182	if len(n.inodes) > 0 {
				183	n.key = n.inodes[0].key
				184	_assert(len(n.key) > 0, "read: zero-length node key")
				185	} else {
				186	n.key = nil
				187	}
				188	}
				189
				190	// write writes the items onto one or more pages.
				191	func (n node) write(p page) {
				192	// Initialize page.
				193	if n.isLeaf {
				194	p.flags \|= leafPageFlag
				195	} else {
				196	p.flags \|= branchPageFlag
				197	}
				198
				199	if len(n.inodes) >= 0xFFFF {
				200	panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
				201	}
				202	p.count = uint16(len(n.inodes))
				203
				204	// Stop here if there are no items to write.
				205	if p.count == 0 {
				206	return
				207	}
				208
				209	// Loop over each item and write it to the page.
				210	b := ([maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()len(n.inodes):]
				211	for i, item := range n.inodes {
				212	_assert(len(item.key) > 0, "write: zero-length inode key")
				213
				214	// Write the page element.
				215	if n.isLeaf {
				216	elem := p.leafPageElement(uint16(i))
				217	elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
				218	elem.flags = item.flags
				219	elem.ksize = uint32(len(item.key))
				220	elem.vsize = uint32(len(item.value))
				221	} else {
				222	elem := p.branchPageElement(uint16(i))
				223	elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
				224	elem.ksize = uint32(len(item.key))
				225	elem.pgid = item.pgid
				226	_assert(elem.pgid != p.id, "write: circular dependency occurred")
				227	}
				228
				229	// If the length of key+value is larger than the max allocation size
				230	// then we need to reallocate the byte array pointer.
				231	//
				232	// See: https://github.com/boltdb/bolt/pull/335
				233	klen, vlen := len(item.key), len(item.value)
				234	if len(b) < klen+vlen {
				235	b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0]))[:]
				236	}
				237
				238	// Write data for the element to the end of the page.
				239	copy(b[0:], item.key)
				240	b = b[klen:]
				241	copy(b[0:], item.value)
				242	b = b[vlen:]
				243	}
				244
				245	// DEBUG ONLY: n.dump()
				246	}
				247
				248	// split breaks up a node into multiple smaller nodes, if appropriate.
				249	// This should only be called from the spill() function.
				250	func (n node) split(pageSize int) []node {
				251	var nodes []*node
				252
				253	node := n
				254	for {
				255	// Split node into two.
				256	a, b := node.splitTwo(pageSize)
				257	nodes = append(nodes, a)
				258
				259	// If we can't split then exit the loop.
				260	if b == nil {
				261	break
				262	}
				263
				264	// Set node to b so it gets split on the next iteration.
				265	node = b
				266	}
				267
				268	return nodes
				269	}
				270
				271	// splitTwo breaks up a node into two smaller nodes, if appropriate.
				272	// This should only be called from the split() function.
				273	func (n node) splitTwo(pageSize int) (node, *node) {
				274	// Ignore the split if the page doesn't have at least enough nodes for
				275	// two pages or if the nodes can fit in a single page.
				276	if len(n.inodes) <= (minKeysPerPage*2) \|\| n.sizeLessThan(pageSize) {
				277	return n, nil
				278	}
				279
				280	// Determine the threshold before starting a new node.
				281	var fillPercent = n.bucket.FillPercent
				282	if fillPercent < minFillPercent {
				283	fillPercent = minFillPercent
				284	} else if fillPercent > maxFillPercent {
				285	fillPercent = maxFillPercent
				286	}
				287	threshold := int(float64(pageSize) * fillPercent)
				288
				289	// Determine split position and sizes of the two pages.
				290	splitIndex, _ := n.splitIndex(threshold)
				291
				292	// Split node into two separate nodes.
				293	// If there's no parent then we'll need to create one.
				294	if n.parent == nil {
				295	n.parent = &node{bucket: n.bucket, children: []*node{n}}
				296	}
				297
				298	// Create a new node and add it to the parent.
				299	next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
				300	n.parent.children = append(n.parent.children, next)
				301
				302	// Split inodes across two nodes.
				303	next.inodes = n.inodes[splitIndex:]
				304	n.inodes = n.inodes[:splitIndex]
				305
				306	// Update the statistics.
				307	n.bucket.tx.stats.Split++
				308
				309	return n, next
				310	}
				311
				312	// splitIndex finds the position where a page will fill a given threshold.
				313	// It returns the index as well as the size of the first page.
				314	// This is only be called from split().
				315	func (n *node) splitIndex(threshold int) (index, sz int) {
				316	sz = pageHeaderSize
				317
				318	// Loop until we only have the minimum number of keys required for the second page.
				319	for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
				320	index = i
				321	inode := n.inodes[i]
				322	elsize := n.pageElementSize() + len(inode.key) + len(inode.value)
				323
				324	// If we have at least the minimum number of keys and adding another
				325	// node would put us over the threshold then exit and return.
				326	if i >= minKeysPerPage && sz+elsize > threshold {
				327	break
				328	}
				329
				330	// Add the element size to the total size.
				331	sz += elsize
				332	}
				333
				334	return
				335	}
				336
				337	// spill writes the nodes to dirty pages and splits nodes as it goes.
				338	// Returns an error if dirty pages cannot be allocated.
				339	func (n *node) spill() error {
				340	var tx = n.bucket.tx
				341	if n.spilled {
				342	return nil
				343	}
				344
				345	// Spill child nodes first. Child nodes can materialize sibling nodes in
				346	// the case of split-merge so we cannot use a range loop. We have to check
				347	// the children size on every loop iteration.
				348	sort.Sort(n.children)
				349	for i := 0; i < len(n.children); i++ {
				350	if err := n.children[i].spill(); err != nil {
				351	return err
				352	}
				353	}
				354
				355	// We no longer need the child list because it's only used for spill tracking.
				356	n.children = nil
				357
				358	// Split nodes into appropriate sizes. The first node will always be n.
				359	var nodes = n.split(tx.db.pageSize)
				360	for _, node := range nodes {
				361	// Add node's page to the freelist if it's not new.
				362	if node.pgid > 0 {
				363	tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
				364	node.pgid = 0
				365	}
				366
				367	// Allocate contiguous space for the node.
				368	p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize)
				369	if err != nil {
				370	return err
				371	}
				372
				373	// Write the node.
				374	if p.id >= tx.meta.pgid {
				375	panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
				376	}
				377	node.pgid = p.id
				378	node.write(p)
				379	node.spilled = true
				380
				381	// Insert into parent inodes.
				382	if node.parent != nil {
				383	var key = node.key
				384	if key == nil {
				385	key = node.inodes[0].key
				386	}
				387
				388	node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
				389	node.key = node.inodes[0].key
				390	_assert(len(node.key) > 0, "spill: zero-length node key")
				391	}
				392
				393	// Update the statistics.
				394	tx.stats.Spill++
				395	}
				396
				397	// If the root node split and created a new root then we need to spill that
				398	// as well. We'll clear out the children to make sure it doesn't try to respill.
				399	if n.parent != nil && n.parent.pgid == 0 {
				400	n.children = nil
				401	return n.parent.spill()
				402	}
				403
				404	return nil
				405	}
				406
				407	// rebalance attempts to combine the node with sibling nodes if the node fill
				408	// size is below a threshold or if there are not enough keys.
				409	func (n *node) rebalance() {
				410	if !n.unbalanced {
				411	return
				412	}
				413	n.unbalanced = false
				414
				415	// Update statistics.
				416	n.bucket.tx.stats.Rebalance++
				417
				418	// Ignore if node is above threshold (25%) and has enough keys.
				419	var threshold = n.bucket.tx.db.pageSize / 4
				420	if n.size() > threshold && len(n.inodes) > n.minKeys() {
				421	return
				422	}
				423
				424	// Root node has special handling.
				425	if n.parent == nil {
				426	// If root node is a branch and only has one node then collapse it.
				427	if !n.isLeaf && len(n.inodes) == 1 {
				428	// Move root's child up.
				429	child := n.bucket.node(n.inodes[0].pgid, n)
				430	n.isLeaf = child.isLeaf
				431	n.inodes = child.inodes[:]
				432	n.children = child.children
				433
				434	// Reparent all child nodes being moved.
				435	for _, inode := range n.inodes {
				436	if child, ok := n.bucket.nodes[inode.pgid]; ok {
				437	child.parent = n
				438	}
				439	}
				440
				441	// Remove old child.
				442	child.parent = nil
				443	delete(n.bucket.nodes, child.pgid)
				444	child.free()
				445	}
				446
				447	return
				448	}
				449
				450	// If node has no keys then just remove it.
				451	if n.numChildren() == 0 {
				452	n.parent.del(n.key)
				453	n.parent.removeChild(n)
				454	delete(n.bucket.nodes, n.pgid)
				455	n.free()
				456	n.parent.rebalance()
				457	return
				458	}
				459
				460	_assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
				461
				462	// Destination node is right sibling if idx == 0, otherwise left sibling.
				463	var target *node
				464	var useNextSibling = (n.parent.childIndex(n) == 0)
				465	if useNextSibling {
				466	target = n.nextSibling()
				467	} else {
				468	target = n.prevSibling()
				469	}
				470
				471	// If both this node and the target node are too small then merge them.
				472	if useNextSibling {
				473	// Reparent all child nodes being moved.
				474	for _, inode := range target.inodes {
				475	if child, ok := n.bucket.nodes[inode.pgid]; ok {
				476	child.parent.removeChild(child)
				477	child.parent = n
				478	child.parent.children = append(child.parent.children, child)
				479	}
				480	}
				481
				482	// Copy over inodes from target and remove target.
				483	n.inodes = append(n.inodes, target.inodes...)
				484	n.parent.del(target.key)
				485	n.parent.removeChild(target)
				486	delete(n.bucket.nodes, target.pgid)
				487	target.free()
				488	} else {
				489	// Reparent all child nodes being moved.
				490	for _, inode := range n.inodes {
				491	if child, ok := n.bucket.nodes[inode.pgid]; ok {
				492	child.parent.removeChild(child)
				493	child.parent = target
				494	child.parent.children = append(child.parent.children, child)
				495	}
				496	}
				497
				498	// Copy over inodes to target and remove node.
				499	target.inodes = append(target.inodes, n.inodes...)
				500	n.parent.del(n.key)
				501	n.parent.removeChild(n)
				502	delete(n.bucket.nodes, n.pgid)
				503	n.free()
				504	}
				505
				506	// Either this node or the target node was deleted from the parent so rebalance it.
				507	n.parent.rebalance()
				508	}
				509
				510	// removes a node from the list of in-memory children.
				511	// This does not affect the inodes.
				512	func (n node) removeChild(target node) {
				513	for i, child := range n.children {
				514	if child == target {
				515	n.children = append(n.children[:i], n.children[i+1:]...)
				516	return
				517	}
				518	}
				519	}
				520
				521	// dereference causes the node to copy all its inode key/value references to heap memory.
				522	// This is required when the mmap is reallocated so inodes are not pointing to stale data.
				523	func (n *node) dereference() {
				524	if n.key != nil {
				525	key := make([]byte, len(n.key))
				526	copy(key, n.key)
				527	n.key = key
				528	_assert(n.pgid == 0 \|\| len(n.key) > 0, "dereference: zero-length node key on existing node")
				529	}
				530
				531	for i := range n.inodes {
				532	inode := &n.inodes[i]
				533
				534	key := make([]byte, len(inode.key))
				535	copy(key, inode.key)
				536	inode.key = key
				537	_assert(len(inode.key) > 0, "dereference: zero-length inode key")
				538
				539	value := make([]byte, len(inode.value))
				540	copy(value, inode.value)
				541	inode.value = value
				542	}
				543
				544	// Recursively dereference children.
				545	for _, child := range n.children {
				546	child.dereference()
				547	}
				548
				549	// Update statistics.
				550	n.bucket.tx.stats.NodeDeref++
				551	}
				552
				553	// free adds the node's underlying page to the freelist.
				554	func (n *node) free() {
				555	if n.pgid != 0 {
				556	n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid))
				557	n.pgid = 0
				558	}
				559	}
				560
				561	// dump writes the contents of the node to STDERR for debugging purposes.
				562	/*
				563	func (n *node) dump() {
				564	// Write node header.
				565	var typ = "branch"
				566	if n.isLeaf {
				567	typ = "leaf"
				568	}
				569	warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))
				570
				571	// Write out abbreviated version of each item.
				572	for _, item := range n.inodes {
				573	if n.isLeaf {
				574	if item.flags&bucketLeafFlag != 0 {
				575	bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
				576	warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
				577	} else {
				578	warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
				579	}
				580	} else {
				581	warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
				582	}
				583	}
				584	warn("")
				585	}
				586	*/
				587
				588	type nodes []*node
				589
				590	func (s nodes) Len() int { return len(s) }
				591	func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
				592	func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 }
				593
				594	// inode represents an internal node inside of a node.
				595	// It can be used to point to elements in a page or point
				596	// to an element which hasn't been added to a page yet.
				597	type inode struct {
				598	flags uint32
				599	pgid pgid
				600	key []byte
				601	value []byte
				602	}
				603
				604	type inodes []inode