vendor/go.etcd.io/bbolt/node.go - voltha-go - Gitiles

 package bbolt

 import (
 	"bytes"
 	"fmt"
 	"sort"
 	"unsafe"
 )

 // node represents an in-memory, deserialized page.
 type node struct {
 	bucket     *Bucket
 	isLeaf     bool
 	unbalanced bool
 	spilled    bool
 	key        []byte
 	pgid       pgid
 	parent     *node
 	children   nodes
 	inodes     inodes
 }

 // root returns the top-level node this node is attached to.
 func (n *node) root() *node {
 	if n.parent == nil {
 		return n
 	}
 	return n.parent.root()
 }

 // minKeys returns the minimum number of inodes this node should have.
 func (n *node) minKeys() int {
 	if n.isLeaf {
 		return 1
 	}
 	return 2
 }

 // size returns the size of the node after serialization.
 func (n *node) size() int {
 	sz, elsz := pageHeaderSize, n.pageElementSize()
 	for i := 0; i < len(n.inodes); i++ {
 		item := &n.inodes[i]
 		sz += elsz + len(item.key) + len(item.value)
 	}
 	return sz
 }

 // sizeLessThan returns true if the node is less than a given size.
 // This is an optimization to avoid calculating a large node when we only need
 // to know if it fits inside a certain page size.
 func (n *node) sizeLessThan(v int) bool {
 	sz, elsz := pageHeaderSize, n.pageElementSize()
 	for i := 0; i < len(n.inodes); i++ {
 		item := &n.inodes[i]
 		sz += elsz + len(item.key) + len(item.value)
 		if sz >= v {
 			return false
 		}
 	}
 	return true
 }

 // pageElementSize returns the size of each page element based on the type of node.
 func (n *node) pageElementSize() int {
 	if n.isLeaf {
 		return leafPageElementSize
 	}
 	return branchPageElementSize
 }

 // childAt returns the child node at a given index.
 func (n *node) childAt(index int) *node {
 	if n.isLeaf {
 		panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
 	}
 	return n.bucket.node(n.inodes[index].pgid, n)
 }

 // childIndex returns the index of a given child node.
 func (n *node) childIndex(child *node) int {
 	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 })
 	return index
 }

 // numChildren returns the number of children.
 func (n *node) numChildren() int {
 	return len(n.inodes)
 }

 // nextSibling returns the next node with the same parent.
 func (n *node) nextSibling() *node {
 	if n.parent == nil {
 		return nil
 	}
 	index := n.parent.childIndex(n)
 	if index >= n.parent.numChildren()-1 {
 		return nil
 	}
 	return n.parent.childAt(index + 1)
 }

 // prevSibling returns the previous node with the same parent.
 func (n *node) prevSibling() *node {
 	if n.parent == nil {
 		return nil
 	}
 	index := n.parent.childIndex(n)
 	if index == 0 {
 		return nil
 	}
 	return n.parent.childAt(index - 1)
 }

 // put inserts a key/value.
 func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
 	if pgid >= n.bucket.tx.meta.pgid {
 		panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
 	} else if len(oldKey) <= 0 {
 		panic("put: zero-length old key")
 	} else if len(newKey) <= 0 {
 		panic("put: zero-length new key")
 	}

 	// Find insertion index.
 	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })

 	// Add capacity and shift nodes if we don't have an exact match and need to insert.
 	exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
 	if !exact {
 		n.inodes = append(n.inodes, inode{})
 		copy(n.inodes[index+1:], n.inodes[index:])
 	}

 	inode := &n.inodes[index]
 	inode.flags = flags
 	inode.key = newKey
 	inode.value = value
 	inode.pgid = pgid
 	_assert(len(inode.key) > 0, "put: zero-length inode key")
 }

 // del removes a key from the node.
 func (n *node) del(key []byte) {
 	// Find index of key.
 	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })

 	// Exit if the key isn't found.
 	if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) {
 		return
 	}

 	// Delete inode from the node.
 	n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)

 	// Mark the node as needing rebalancing.
 	n.unbalanced = true
 }

 // read initializes the node from a page.
 func (n *node) read(p *page) {
 	n.pgid = p.id
 	n.isLeaf = ((p.flags & leafPageFlag) != 0)
 	n.inodes = make(inodes, int(p.count))

 	for i := 0; i < int(p.count); i++ {
 		inode := &n.inodes[i]
 		if n.isLeaf {
 			elem := p.leafPageElement(uint16(i))
 			inode.flags = elem.flags
 			inode.key = elem.key()
 			inode.value = elem.value()
 		} else {
 			elem := p.branchPageElement(uint16(i))
 			inode.pgid = elem.pgid
 			inode.key = elem.key()
 		}
 		_assert(len(inode.key) > 0, "read: zero-length inode key")
 	}

 	// Save first key so we can find the node in the parent when we spill.
 	if len(n.inodes) > 0 {
 		n.key = n.inodes[0].key
 		_assert(len(n.key) > 0, "read: zero-length node key")
 	} else {
 		n.key = nil
 	}
 }

 // write writes the items onto one or more pages.
 func (n *node) write(p *page) {
 	// Initialize page.
 	if n.isLeaf {
 		p.flags |= leafPageFlag
 	} else {
 		p.flags |= branchPageFlag
 	}

 	if len(n.inodes) >= 0xFFFF {
 		panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
 	}
 	p.count = uint16(len(n.inodes))

 	// Stop here if there are no items to write.
 	if p.count == 0 {
 		return
 	}

 	// Loop over each item and write it to the page.
 	b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):]
 	for i, item := range n.inodes {
 		_assert(len(item.key) > 0, "write: zero-length inode key")

 		// Write the page element.
 		if n.isLeaf {
 			elem := p.leafPageElement(uint16(i))
 			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
 			elem.flags = item.flags
 			elem.ksize = uint32(len(item.key))
 			elem.vsize = uint32(len(item.value))
 		} else {
 			elem := p.branchPageElement(uint16(i))
 			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
 			elem.ksize = uint32(len(item.key))
 			elem.pgid = item.pgid
 			_assert(elem.pgid != p.id, "write: circular dependency occurred")
 		}

 		// If the length of key+value is larger than the max allocation size
 		// then we need to reallocate the byte array pointer.
 		//
 		// See: https://github.com/boltdb/bolt/pull/335
 		klen, vlen := len(item.key), len(item.value)
 		if len(b) < klen+vlen {
 			b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0]))[:]
 		}

 		// Write data for the element to the end of the page.
 		copy(b[0:], item.key)
 		b = b[klen:]
 		copy(b[0:], item.value)
 		b = b[vlen:]
 	}

 	// DEBUG ONLY: n.dump()
 }

 // split breaks up a node into multiple smaller nodes, if appropriate.
 // This should only be called from the spill() function.
 func (n *node) split(pageSize int) []*node {
 	var nodes []*node

 	node := n
 	for {
 		// Split node into two.
 		a, b := node.splitTwo(pageSize)
 		nodes = append(nodes, a)

 		// If we can't split then exit the loop.
 		if b == nil {
 			break
 		}

 		// Set node to b so it gets split on the next iteration.
 		node = b
 	}

 	return nodes
 }

 // splitTwo breaks up a node into two smaller nodes, if appropriate.
 // This should only be called from the split() function.
 func (n *node) splitTwo(pageSize int) (*node, *node) {
 	// Ignore the split if the page doesn't have at least enough nodes for
 	// two pages or if the nodes can fit in a single page.
 	if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
 		return n, nil
 	}

 	// Determine the threshold before starting a new node.
 	var fillPercent = n.bucket.FillPercent
 	if fillPercent < minFillPercent {
 		fillPercent = minFillPercent
 	} else if fillPercent > maxFillPercent {
 		fillPercent = maxFillPercent
 	}
 	threshold := int(float64(pageSize) * fillPercent)

 	// Determine split position and sizes of the two pages.
 	splitIndex, _ := n.splitIndex(threshold)

 	// Split node into two separate nodes.
 	// If there's no parent then we'll need to create one.
 	if n.parent == nil {
 		n.parent = &node{bucket: n.bucket, children: []*node{n}}
 	}

 	// Create a new node and add it to the parent.
 	next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
 	n.parent.children = append(n.parent.children, next)

 	// Split inodes across two nodes.
 	next.inodes = n.inodes[splitIndex:]
 	n.inodes = n.inodes[:splitIndex]

 	// Update the statistics.
 	n.bucket.tx.stats.Split++

 	return n, next
 }

 // splitIndex finds the position where a page will fill a given threshold.
 // It returns the index as well as the size of the first page.
 // This is only be called from split().
 func (n *node) splitIndex(threshold int) (index, sz int) {
 	sz = pageHeaderSize

 	// Loop until we only have the minimum number of keys required for the second page.
 	for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
 		index = i
 		inode := n.inodes[i]
 		elsize := n.pageElementSize() + len(inode.key) + len(inode.value)

 		// If we have at least the minimum number of keys and adding another
 		// node would put us over the threshold then exit and return.
 		if i >= minKeysPerPage && sz+elsize > threshold {
 			break
 		}

 		// Add the element size to the total size.
 		sz += elsize
 	}

 	return
 }

 // spill writes the nodes to dirty pages and splits nodes as it goes.
 // Returns an error if dirty pages cannot be allocated.
 func (n *node) spill() error {
 	var tx = n.bucket.tx
 	if n.spilled {
 		return nil
 	}

 	// Spill child nodes first. Child nodes can materialize sibling nodes in
 	// the case of split-merge so we cannot use a range loop. We have to check
 	// the children size on every loop iteration.
 	sort.Sort(n.children)
 	for i := 0; i < len(n.children); i++ {
 		if err := n.children[i].spill(); err != nil {
 			return err
 		}
 	}

 	// We no longer need the child list because it's only used for spill tracking.
 	n.children = nil

 	// Split nodes into appropriate sizes. The first node will always be n.
 	var nodes = n.split(tx.db.pageSize)
 	for _, node := range nodes {
 		// Add node's page to the freelist if it's not new.
 		if node.pgid > 0 {
 			tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
 			node.pgid = 0
 		}

 		// Allocate contiguous space for the node.
 		p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize)
 		if err != nil {
 			return err
 		}

 		// Write the node.
 		if p.id >= tx.meta.pgid {
 			panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
 		}
 		node.pgid = p.id
 		node.write(p)
 		node.spilled = true

 		// Insert into parent inodes.
 		if node.parent != nil {
 			var key = node.key
 			if key == nil {
 				key = node.inodes[0].key
 			}

 			node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
 			node.key = node.inodes[0].key
 			_assert(len(node.key) > 0, "spill: zero-length node key")
 		}

 		// Update the statistics.
 		tx.stats.Spill++
 	}

 	// If the root node split and created a new root then we need to spill that
 	// as well. We'll clear out the children to make sure it doesn't try to respill.
 	if n.parent != nil && n.parent.pgid == 0 {
 		n.children = nil
 		return n.parent.spill()
 	}

 	return nil
 }

 // rebalance attempts to combine the node with sibling nodes if the node fill
 // size is below a threshold or if there are not enough keys.
 func (n *node) rebalance() {
 	if !n.unbalanced {
 		return
 	}
 	n.unbalanced = false

 	// Update statistics.
 	n.bucket.tx.stats.Rebalance++

 	// Ignore if node is above threshold (25%) and has enough keys.
 	var threshold = n.bucket.tx.db.pageSize / 4
 	if n.size() > threshold && len(n.inodes) > n.minKeys() {
 		return
 	}

 	// Root node has special handling.
 	if n.parent == nil {
 		// If root node is a branch and only has one node then collapse it.
 		if !n.isLeaf && len(n.inodes) == 1 {
 			// Move root's child up.
 			child := n.bucket.node(n.inodes[0].pgid, n)
 			n.isLeaf = child.isLeaf
 			n.inodes = child.inodes[:]
 			n.children = child.children

 			// Reparent all child nodes being moved.
 			for _, inode := range n.inodes {
 				if child, ok := n.bucket.nodes[inode.pgid]; ok {
 					child.parent = n
 				}
 			}

 			// Remove old child.
 			child.parent = nil
 			delete(n.bucket.nodes, child.pgid)
 			child.free()
 		}

 		return
 	}

 	// If node has no keys then just remove it.
 	if n.numChildren() == 0 {
 		n.parent.del(n.key)
 		n.parent.removeChild(n)
 		delete(n.bucket.nodes, n.pgid)
 		n.free()
 		n.parent.rebalance()
 		return
 	}

 	_assert(n.parent.numChildren() > 1, "parent must have at least 2 children")

 	// Destination node is right sibling if idx == 0, otherwise left sibling.
 	var target *node
 	var useNextSibling = (n.parent.childIndex(n) == 0)
 	if useNextSibling {
 		target = n.nextSibling()
 	} else {
 		target = n.prevSibling()
 	}

 	// If both this node and the target node are too small then merge them.
 	if useNextSibling {
 		// Reparent all child nodes being moved.
 		for _, inode := range target.inodes {
 			if child, ok := n.bucket.nodes[inode.pgid]; ok {
 				child.parent.removeChild(child)
 				child.parent = n
 				child.parent.children = append(child.parent.children, child)
 			}
 		}

 		// Copy over inodes from target and remove target.
 		n.inodes = append(n.inodes, target.inodes...)
 		n.parent.del(target.key)
 		n.parent.removeChild(target)
 		delete(n.bucket.nodes, target.pgid)
 		target.free()
 	} else {
 		// Reparent all child nodes being moved.
 		for _, inode := range n.inodes {
 			if child, ok := n.bucket.nodes[inode.pgid]; ok {
 				child.parent.removeChild(child)
 				child.parent = target
 				child.parent.children = append(child.parent.children, child)
 			}
 		}

 		// Copy over inodes to target and remove node.
 		target.inodes = append(target.inodes, n.inodes...)
 		n.parent.del(n.key)
 		n.parent.removeChild(n)
 		delete(n.bucket.nodes, n.pgid)
 		n.free()
 	}

 	// Either this node or the target node was deleted from the parent so rebalance it.
 	n.parent.rebalance()
 }

 // removes a node from the list of in-memory children.
 // This does not affect the inodes.
 func (n *node) removeChild(target *node) {
 	for i, child := range n.children {
 		if child == target {
 			n.children = append(n.children[:i], n.children[i+1:]...)
 			return
 		}
 	}
 }

 // dereference causes the node to copy all its inode key/value references to heap memory.
 // This is required when the mmap is reallocated so inodes are not pointing to stale data.
 func (n *node) dereference() {
 	if n.key != nil {
 		key := make([]byte, len(n.key))
 		copy(key, n.key)
 		n.key = key
 		_assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
 	}

 	for i := range n.inodes {
 		inode := &n.inodes[i]

 		key := make([]byte, len(inode.key))
 		copy(key, inode.key)
 		inode.key = key
 		_assert(len(inode.key) > 0, "dereference: zero-length inode key")

 		value := make([]byte, len(inode.value))
 		copy(value, inode.value)
 		inode.value = value
 	}

 	// Recursively dereference children.
 	for _, child := range n.children {
 		child.dereference()
 	}

 	// Update statistics.
 	n.bucket.tx.stats.NodeDeref++
 }

 // free adds the node's underlying page to the freelist.
 func (n *node) free() {
 	if n.pgid != 0 {
 		n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid))
 		n.pgid = 0
 	}
 }

 // dump writes the contents of the node to STDERR for debugging purposes.
 /*
 func (n *node) dump() {
 	// Write node header.
 	var typ = "branch"
 	if n.isLeaf {
 		typ = "leaf"
 	}
 	warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))

 	// Write out abbreviated version of each item.
 	for _, item := range n.inodes {
 		if n.isLeaf {
 			if item.flags&bucketLeafFlag != 0 {
 				bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
 				warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
 			} else {
 				warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
 			}
 		} else {
 			warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
 		}
 	}
 	warn("")
 }
 */

 type nodes []*node

 func (s nodes) Len() int           { return len(s) }
 func (s nodes) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
 func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 }

 // inode represents an internal node inside of a node.
 // It can be used to point to elements in a page or point
 // to an element which hasn't been added to a page yet.
 type inode struct {
 	flags uint32
 	pgid  pgid
 	key   []byte
 	value []byte
 }

 type inodes []inode
	package bbolt

	import (
	"bytes"
	"fmt"
	"sort"
	"unsafe"
	)

	// node represents an in-memory, deserialized page.
	type node struct {
	bucket *Bucket
	isLeaf bool
	unbalanced bool
	spilled bool
	key []byte
	pgid pgid
	parent *node
	children nodes
	inodes inodes
	}

	// root returns the top-level node this node is attached to.
	func (n node) root() node {
	if n.parent == nil {
	return n
	}
	return n.parent.root()
	}

	// minKeys returns the minimum number of inodes this node should have.
	func (n *node) minKeys() int {
	if n.isLeaf {
	return 1
	}
	return 2
	}

	// size returns the size of the node after serialization.
	func (n *node) size() int {
	sz, elsz := pageHeaderSize, n.pageElementSize()
	for i := 0; i < len(n.inodes); i++ {
	item := &n.inodes[i]
	sz += elsz + len(item.key) + len(item.value)
	}
	return sz
	}

	// sizeLessThan returns true if the node is less than a given size.
	// This is an optimization to avoid calculating a large node when we only need
	// to know if it fits inside a certain page size.
	func (n *node) sizeLessThan(v int) bool {
	sz, elsz := pageHeaderSize, n.pageElementSize()
	for i := 0; i < len(n.inodes); i++ {
	item := &n.inodes[i]
	sz += elsz + len(item.key) + len(item.value)
	if sz >= v {
	return false
	}
	}
	return true
	}

	// pageElementSize returns the size of each page element based on the type of node.
	func (n *node) pageElementSize() int {
	if n.isLeaf {
	return leafPageElementSize
	}
	return branchPageElementSize
	}

	// childAt returns the child node at a given index.
	func (n node) childAt(index int) node {
	if n.isLeaf {
	panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
	}
	return n.bucket.node(n.inodes[index].pgid, n)
	}

	// childIndex returns the index of a given child node.
	func (n node) childIndex(child node) int {
	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 })
	return index
	}

	// numChildren returns the number of children.
	func (n *node) numChildren() int {
	return len(n.inodes)
	}

	// nextSibling returns the next node with the same parent.
	func (n node) nextSibling() node {
	if n.parent == nil {
	return nil
	}
	index := n.parent.childIndex(n)
	if index >= n.parent.numChildren()-1 {
	return nil
	}
	return n.parent.childAt(index + 1)
	}

	// prevSibling returns the previous node with the same parent.
	func (n node) prevSibling() node {
	if n.parent == nil {
	return nil
	}
	index := n.parent.childIndex(n)
	if index == 0 {
	return nil
	}
	return n.parent.childAt(index - 1)
	}

	// put inserts a key/value.
	func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
	if pgid >= n.bucket.tx.meta.pgid {
	panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
	} else if len(oldKey) <= 0 {
	panic("put: zero-length old key")
	} else if len(newKey) <= 0 {
	panic("put: zero-length new key")
	}

	// Find insertion index.
	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })

	// Add capacity and shift nodes if we don't have an exact match and need to insert.
	exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
	if !exact {
	n.inodes = append(n.inodes, inode{})
	copy(n.inodes[index+1:], n.inodes[index:])
	}

	inode := &n.inodes[index]
	inode.flags = flags
	inode.key = newKey
	inode.value = value
	inode.pgid = pgid
	_assert(len(inode.key) > 0, "put: zero-length inode key")
	}

	// del removes a key from the node.
	func (n *node) del(key []byte) {
	// Find index of key.
	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })

	// Exit if the key isn't found.
	if index >= len(n.inodes) \|\| !bytes.Equal(n.inodes[index].key, key) {
	return
	}

	// Delete inode from the node.
	n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)

	// Mark the node as needing rebalancing.
	n.unbalanced = true
	}

	// read initializes the node from a page.
	func (n node) read(p page) {
	n.pgid = p.id
	n.isLeaf = ((p.flags & leafPageFlag) != 0)
	n.inodes = make(inodes, int(p.count))

	for i := 0; i < int(p.count); i++ {
	inode := &n.inodes[i]
	if n.isLeaf {
	elem := p.leafPageElement(uint16(i))
	inode.flags = elem.flags
	inode.key = elem.key()
	inode.value = elem.value()
	} else {
	elem := p.branchPageElement(uint16(i))
	inode.pgid = elem.pgid
	inode.key = elem.key()
	}
	_assert(len(inode.key) > 0, "read: zero-length inode key")
	}

	// Save first key so we can find the node in the parent when we spill.
	if len(n.inodes) > 0 {
	n.key = n.inodes[0].key
	_assert(len(n.key) > 0, "read: zero-length node key")
	} else {
	n.key = nil
	}
	}

	// write writes the items onto one or more pages.
	func (n node) write(p page) {
	// Initialize page.
	if n.isLeaf {
	p.flags \|= leafPageFlag
	} else {
	p.flags \|= branchPageFlag
	}

	if len(n.inodes) >= 0xFFFF {
	panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
	}
	p.count = uint16(len(n.inodes))

	// Stop here if there are no items to write.
	if p.count == 0 {
	return
	}

	// Loop over each item and write it to the page.
	b := ([maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()len(n.inodes):]
	for i, item := range n.inodes {
	_assert(len(item.key) > 0, "write: zero-length inode key")

	// Write the page element.
	if n.isLeaf {
	elem := p.leafPageElement(uint16(i))
	elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
	elem.flags = item.flags
	elem.ksize = uint32(len(item.key))
	elem.vsize = uint32(len(item.value))
	} else {
	elem := p.branchPageElement(uint16(i))
	elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
	elem.ksize = uint32(len(item.key))
	elem.pgid = item.pgid
	_assert(elem.pgid != p.id, "write: circular dependency occurred")
	}

	// If the length of key+value is larger than the max allocation size
	// then we need to reallocate the byte array pointer.
	//
	// See: https://github.com/boltdb/bolt/pull/335
	klen, vlen := len(item.key), len(item.value)
	if len(b) < klen+vlen {
	b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0]))[:]
	}

	// Write data for the element to the end of the page.
	copy(b[0:], item.key)
	b = b[klen:]
	copy(b[0:], item.value)
	b = b[vlen:]
	}

	// DEBUG ONLY: n.dump()
	}

	// split breaks up a node into multiple smaller nodes, if appropriate.
	// This should only be called from the spill() function.
	func (n node) split(pageSize int) []node {
	var nodes []*node

	node := n
	for {
	// Split node into two.
	a, b := node.splitTwo(pageSize)
	nodes = append(nodes, a)

	// If we can't split then exit the loop.
	if b == nil {
	break
	}

	// Set node to b so it gets split on the next iteration.
	node = b
	}

	return nodes
	}

	// splitTwo breaks up a node into two smaller nodes, if appropriate.
	// This should only be called from the split() function.
	func (n node) splitTwo(pageSize int) (node, *node) {
	// Ignore the split if the page doesn't have at least enough nodes for
	// two pages or if the nodes can fit in a single page.
	if len(n.inodes) <= (minKeysPerPage*2) \|\| n.sizeLessThan(pageSize) {
	return n, nil
	}

	// Determine the threshold before starting a new node.
	var fillPercent = n.bucket.FillPercent
	if fillPercent < minFillPercent {
	fillPercent = minFillPercent
	} else if fillPercent > maxFillPercent {
	fillPercent = maxFillPercent
	}
	threshold := int(float64(pageSize) * fillPercent)

	// Determine split position and sizes of the two pages.
	splitIndex, _ := n.splitIndex(threshold)

	// Split node into two separate nodes.
	// If there's no parent then we'll need to create one.
	if n.parent == nil {
	n.parent = &node{bucket: n.bucket, children: []*node{n}}
	}

	// Create a new node and add it to the parent.
	next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
	n.parent.children = append(n.parent.children, next)

	// Split inodes across two nodes.
	next.inodes = n.inodes[splitIndex:]
	n.inodes = n.inodes[:splitIndex]

	// Update the statistics.
	n.bucket.tx.stats.Split++

	return n, next
	}

	// splitIndex finds the position where a page will fill a given threshold.
	// It returns the index as well as the size of the first page.
	// This is only be called from split().
	func (n *node) splitIndex(threshold int) (index, sz int) {
	sz = pageHeaderSize

	// Loop until we only have the minimum number of keys required for the second page.
	for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
	index = i
	inode := n.inodes[i]
	elsize := n.pageElementSize() + len(inode.key) + len(inode.value)

	// If we have at least the minimum number of keys and adding another
	// node would put us over the threshold then exit and return.
	if i >= minKeysPerPage && sz+elsize > threshold {
	break
	}

	// Add the element size to the total size.
	sz += elsize
	}

	return
	}

	// spill writes the nodes to dirty pages and splits nodes as it goes.
	// Returns an error if dirty pages cannot be allocated.
	func (n *node) spill() error {
	var tx = n.bucket.tx
	if n.spilled {
	return nil
	}

	// Spill child nodes first. Child nodes can materialize sibling nodes in
	// the case of split-merge so we cannot use a range loop. We have to check
	// the children size on every loop iteration.
	sort.Sort(n.children)
	for i := 0; i < len(n.children); i++ {
	if err := n.children[i].spill(); err != nil {
	return err
	}
	}

	// We no longer need the child list because it's only used for spill tracking.
	n.children = nil

	// Split nodes into appropriate sizes. The first node will always be n.
	var nodes = n.split(tx.db.pageSize)
	for _, node := range nodes {
	// Add node's page to the freelist if it's not new.
	if node.pgid > 0 {
	tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
	node.pgid = 0
	}

	// Allocate contiguous space for the node.
	p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize)
	if err != nil {
	return err
	}

	// Write the node.
	if p.id >= tx.meta.pgid {
	panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
	}
	node.pgid = p.id
	node.write(p)
	node.spilled = true

	// Insert into parent inodes.
	if node.parent != nil {
	var key = node.key
	if key == nil {
	key = node.inodes[0].key
	}

	node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
	node.key = node.inodes[0].key
	_assert(len(node.key) > 0, "spill: zero-length node key")
	}

	// Update the statistics.
	tx.stats.Spill++
	}

	// If the root node split and created a new root then we need to spill that
	// as well. We'll clear out the children to make sure it doesn't try to respill.
	if n.parent != nil && n.parent.pgid == 0 {
	n.children = nil
	return n.parent.spill()
	}

	return nil
	}

	// rebalance attempts to combine the node with sibling nodes if the node fill
	// size is below a threshold or if there are not enough keys.
	func (n *node) rebalance() {
	if !n.unbalanced {
	return
	}
	n.unbalanced = false

	// Update statistics.
	n.bucket.tx.stats.Rebalance++

	// Ignore if node is above threshold (25%) and has enough keys.
	var threshold = n.bucket.tx.db.pageSize / 4
	if n.size() > threshold && len(n.inodes) > n.minKeys() {
	return
	}

	// Root node has special handling.
	if n.parent == nil {
	// If root node is a branch and only has one node then collapse it.
	if !n.isLeaf && len(n.inodes) == 1 {
	// Move root's child up.
	child := n.bucket.node(n.inodes[0].pgid, n)
	n.isLeaf = child.isLeaf
	n.inodes = child.inodes[:]
	n.children = child.children

	// Reparent all child nodes being moved.
	for _, inode := range n.inodes {
	if child, ok := n.bucket.nodes[inode.pgid]; ok {
	child.parent = n
	}
	}

	// Remove old child.
	child.parent = nil
	delete(n.bucket.nodes, child.pgid)
	child.free()
	}

	return
	}

	// If node has no keys then just remove it.
	if n.numChildren() == 0 {
	n.parent.del(n.key)
	n.parent.removeChild(n)
	delete(n.bucket.nodes, n.pgid)
	n.free()
	n.parent.rebalance()
	return
	}

	_assert(n.parent.numChildren() > 1, "parent must have at least 2 children")

	// Destination node is right sibling if idx == 0, otherwise left sibling.
	var target *node
	var useNextSibling = (n.parent.childIndex(n) == 0)
	if useNextSibling {
	target = n.nextSibling()
	} else {
	target = n.prevSibling()
	}

	// If both this node and the target node are too small then merge them.
	if useNextSibling {
	// Reparent all child nodes being moved.
	for _, inode := range target.inodes {
	if child, ok := n.bucket.nodes[inode.pgid]; ok {
	child.parent.removeChild(child)
	child.parent = n
	child.parent.children = append(child.parent.children, child)
	}
	}

	// Copy over inodes from target and remove target.
	n.inodes = append(n.inodes, target.inodes...)
	n.parent.del(target.key)
	n.parent.removeChild(target)
	delete(n.bucket.nodes, target.pgid)
	target.free()
	} else {
	// Reparent all child nodes being moved.
	for _, inode := range n.inodes {
	if child, ok := n.bucket.nodes[inode.pgid]; ok {
	child.parent.removeChild(child)
	child.parent = target
	child.parent.children = append(child.parent.children, child)
	}
	}

	// Copy over inodes to target and remove node.
	target.inodes = append(target.inodes, n.inodes...)
	n.parent.del(n.key)
	n.parent.removeChild(n)
	delete(n.bucket.nodes, n.pgid)
	n.free()
	}

	// Either this node or the target node was deleted from the parent so rebalance it.
	n.parent.rebalance()
	}

	// removes a node from the list of in-memory children.
	// This does not affect the inodes.
	func (n node) removeChild(target node) {
	for i, child := range n.children {
	if child == target {
	n.children = append(n.children[:i], n.children[i+1:]...)
	return
	}
	}
	}

	// dereference causes the node to copy all its inode key/value references to heap memory.
	// This is required when the mmap is reallocated so inodes are not pointing to stale data.
	func (n *node) dereference() {
	if n.key != nil {
	key := make([]byte, len(n.key))
	copy(key, n.key)
	n.key = key
	_assert(n.pgid == 0 \|\| len(n.key) > 0, "dereference: zero-length node key on existing node")
	}

	for i := range n.inodes {
	inode := &n.inodes[i]

	key := make([]byte, len(inode.key))
	copy(key, inode.key)
	inode.key = key
	_assert(len(inode.key) > 0, "dereference: zero-length inode key")

	value := make([]byte, len(inode.value))
	copy(value, inode.value)
	inode.value = value
	}

	// Recursively dereference children.
	for _, child := range n.children {
	child.dereference()
	}

	// Update statistics.
	n.bucket.tx.stats.NodeDeref++
	}

	// free adds the node's underlying page to the freelist.
	func (n *node) free() {
	if n.pgid != 0 {
	n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid))
	n.pgid = 0
	}
	}

	// dump writes the contents of the node to STDERR for debugging purposes.
	/*
	func (n *node) dump() {
	// Write node header.
	var typ = "branch"
	if n.isLeaf {
	typ = "leaf"
	}
	warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))

	// Write out abbreviated version of each item.
	for _, item := range n.inodes {
	if n.isLeaf {
	if item.flags&bucketLeafFlag != 0 {
	bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
	warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
	} else {
	warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
	}
	} else {
	warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
	}
	}
	warn("")
	}
	*/

	type nodes []*node

	func (s nodes) Len() int { return len(s) }
	func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
	func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 }

	// inode represents an internal node inside of a node.
	// It can be used to point to elements in a page or point
	// to an element which hasn't been added to a page yet.
	type inode struct {
	flags uint32
	pgid pgid
	key []byte
	value []byte
	}

	type inodes []inode