| package bbolt |
| |
| import ( |
| "bytes" |
| "fmt" |
| "sort" |
| "unsafe" |
| ) |
| |
| // node represents an in-memory, deserialized page. |
| type node struct { |
| bucket *Bucket |
| isLeaf bool |
| unbalanced bool |
| spilled bool |
| key []byte |
| pgid pgid |
| parent *node |
| children nodes |
| inodes inodes |
| } |
| |
| // root returns the top-level node this node is attached to. |
| func (n *node) root() *node { |
| if n.parent == nil { |
| return n |
| } |
| return n.parent.root() |
| } |
| |
| // minKeys returns the minimum number of inodes this node should have. |
| func (n *node) minKeys() int { |
| if n.isLeaf { |
| return 1 |
| } |
| return 2 |
| } |
| |
| // size returns the size of the node after serialization. |
| func (n *node) size() int { |
| sz, elsz := pageHeaderSize, n.pageElementSize() |
| for i := 0; i < len(n.inodes); i++ { |
| item := &n.inodes[i] |
| sz += elsz + len(item.key) + len(item.value) |
| } |
| return sz |
| } |
| |
| // sizeLessThan returns true if the node is less than a given size. |
| // This is an optimization to avoid calculating a large node when we only need |
| // to know if it fits inside a certain page size. |
| func (n *node) sizeLessThan(v int) bool { |
| sz, elsz := pageHeaderSize, n.pageElementSize() |
| for i := 0; i < len(n.inodes); i++ { |
| item := &n.inodes[i] |
| sz += elsz + len(item.key) + len(item.value) |
| if sz >= v { |
| return false |
| } |
| } |
| return true |
| } |
| |
| // pageElementSize returns the size of each page element based on the type of node. |
| func (n *node) pageElementSize() int { |
| if n.isLeaf { |
| return leafPageElementSize |
| } |
| return branchPageElementSize |
| } |
| |
| // childAt returns the child node at a given index. |
| func (n *node) childAt(index int) *node { |
| if n.isLeaf { |
| panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index)) |
| } |
| return n.bucket.node(n.inodes[index].pgid, n) |
| } |
| |
| // childIndex returns the index of a given child node. |
| func (n *node) childIndex(child *node) int { |
| index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 }) |
| return index |
| } |
| |
| // numChildren returns the number of children. |
| func (n *node) numChildren() int { |
| return len(n.inodes) |
| } |
| |
| // nextSibling returns the next node with the same parent. |
| func (n *node) nextSibling() *node { |
| if n.parent == nil { |
| return nil |
| } |
| index := n.parent.childIndex(n) |
| if index >= n.parent.numChildren()-1 { |
| return nil |
| } |
| return n.parent.childAt(index + 1) |
| } |
| |
| // prevSibling returns the previous node with the same parent. |
| func (n *node) prevSibling() *node { |
| if n.parent == nil { |
| return nil |
| } |
| index := n.parent.childIndex(n) |
| if index == 0 { |
| return nil |
| } |
| return n.parent.childAt(index - 1) |
| } |
| |
| // put inserts a key/value. |
| func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) { |
| if pgid >= n.bucket.tx.meta.pgid { |
| panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid)) |
| } else if len(oldKey) <= 0 { |
| panic("put: zero-length old key") |
| } else if len(newKey) <= 0 { |
| panic("put: zero-length new key") |
| } |
| |
| // Find insertion index. |
| index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 }) |
| |
| // Add capacity and shift nodes if we don't have an exact match and need to insert. |
| exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey)) |
| if !exact { |
| n.inodes = append(n.inodes, inode{}) |
| copy(n.inodes[index+1:], n.inodes[index:]) |
| } |
| |
| inode := &n.inodes[index] |
| inode.flags = flags |
| inode.key = newKey |
| inode.value = value |
| inode.pgid = pgid |
| _assert(len(inode.key) > 0, "put: zero-length inode key") |
| } |
| |
| // del removes a key from the node. |
| func (n *node) del(key []byte) { |
| // Find index of key. |
| index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 }) |
| |
| // Exit if the key isn't found. |
| if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) { |
| return |
| } |
| |
| // Delete inode from the node. |
| n.inodes = append(n.inodes[:index], n.inodes[index+1:]...) |
| |
| // Mark the node as needing rebalancing. |
| n.unbalanced = true |
| } |
| |
| // read initializes the node from a page. |
| func (n *node) read(p *page) { |
| n.pgid = p.id |
| n.isLeaf = ((p.flags & leafPageFlag) != 0) |
| n.inodes = make(inodes, int(p.count)) |
| |
| for i := 0; i < int(p.count); i++ { |
| inode := &n.inodes[i] |
| if n.isLeaf { |
| elem := p.leafPageElement(uint16(i)) |
| inode.flags = elem.flags |
| inode.key = elem.key() |
| inode.value = elem.value() |
| } else { |
| elem := p.branchPageElement(uint16(i)) |
| inode.pgid = elem.pgid |
| inode.key = elem.key() |
| } |
| _assert(len(inode.key) > 0, "read: zero-length inode key") |
| } |
| |
| // Save first key so we can find the node in the parent when we spill. |
| if len(n.inodes) > 0 { |
| n.key = n.inodes[0].key |
| _assert(len(n.key) > 0, "read: zero-length node key") |
| } else { |
| n.key = nil |
| } |
| } |
| |
| // write writes the items onto one or more pages. |
| func (n *node) write(p *page) { |
| // Initialize page. |
| if n.isLeaf { |
| p.flags |= leafPageFlag |
| } else { |
| p.flags |= branchPageFlag |
| } |
| |
| if len(n.inodes) >= 0xFFFF { |
| panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id)) |
| } |
| p.count = uint16(len(n.inodes)) |
| |
| // Stop here if there are no items to write. |
| if p.count == 0 { |
| return |
| } |
| |
| // Loop over each item and write it to the page. |
| b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):] |
| for i, item := range n.inodes { |
| _assert(len(item.key) > 0, "write: zero-length inode key") |
| |
| // Write the page element. |
| if n.isLeaf { |
| elem := p.leafPageElement(uint16(i)) |
| elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem))) |
| elem.flags = item.flags |
| elem.ksize = uint32(len(item.key)) |
| elem.vsize = uint32(len(item.value)) |
| } else { |
| elem := p.branchPageElement(uint16(i)) |
| elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem))) |
| elem.ksize = uint32(len(item.key)) |
| elem.pgid = item.pgid |
| _assert(elem.pgid != p.id, "write: circular dependency occurred") |
| } |
| |
| // If the length of key+value is larger than the max allocation size |
| // then we need to reallocate the byte array pointer. |
| // |
| // See: https://github.com/boltdb/bolt/pull/335 |
| klen, vlen := len(item.key), len(item.value) |
| if len(b) < klen+vlen { |
| b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0]))[:] |
| } |
| |
| // Write data for the element to the end of the page. |
| copy(b[0:], item.key) |
| b = b[klen:] |
| copy(b[0:], item.value) |
| b = b[vlen:] |
| } |
| |
| // DEBUG ONLY: n.dump() |
| } |
| |
| // split breaks up a node into multiple smaller nodes, if appropriate. |
| // This should only be called from the spill() function. |
| func (n *node) split(pageSize int) []*node { |
| var nodes []*node |
| |
| node := n |
| for { |
| // Split node into two. |
| a, b := node.splitTwo(pageSize) |
| nodes = append(nodes, a) |
| |
| // If we can't split then exit the loop. |
| if b == nil { |
| break |
| } |
| |
| // Set node to b so it gets split on the next iteration. |
| node = b |
| } |
| |
| return nodes |
| } |
| |
| // splitTwo breaks up a node into two smaller nodes, if appropriate. |
| // This should only be called from the split() function. |
| func (n *node) splitTwo(pageSize int) (*node, *node) { |
| // Ignore the split if the page doesn't have at least enough nodes for |
| // two pages or if the nodes can fit in a single page. |
| if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) { |
| return n, nil |
| } |
| |
| // Determine the threshold before starting a new node. |
| var fillPercent = n.bucket.FillPercent |
| if fillPercent < minFillPercent { |
| fillPercent = minFillPercent |
| } else if fillPercent > maxFillPercent { |
| fillPercent = maxFillPercent |
| } |
| threshold := int(float64(pageSize) * fillPercent) |
| |
| // Determine split position and sizes of the two pages. |
| splitIndex, _ := n.splitIndex(threshold) |
| |
| // Split node into two separate nodes. |
| // If there's no parent then we'll need to create one. |
| if n.parent == nil { |
| n.parent = &node{bucket: n.bucket, children: []*node{n}} |
| } |
| |
| // Create a new node and add it to the parent. |
| next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent} |
| n.parent.children = append(n.parent.children, next) |
| |
| // Split inodes across two nodes. |
| next.inodes = n.inodes[splitIndex:] |
| n.inodes = n.inodes[:splitIndex] |
| |
| // Update the statistics. |
| n.bucket.tx.stats.Split++ |
| |
| return n, next |
| } |
| |
| // splitIndex finds the position where a page will fill a given threshold. |
| // It returns the index as well as the size of the first page. |
| // This is only be called from split(). |
| func (n *node) splitIndex(threshold int) (index, sz int) { |
| sz = pageHeaderSize |
| |
| // Loop until we only have the minimum number of keys required for the second page. |
| for i := 0; i < len(n.inodes)-minKeysPerPage; i++ { |
| index = i |
| inode := n.inodes[i] |
| elsize := n.pageElementSize() + len(inode.key) + len(inode.value) |
| |
| // If we have at least the minimum number of keys and adding another |
| // node would put us over the threshold then exit and return. |
| if i >= minKeysPerPage && sz+elsize > threshold { |
| break |
| } |
| |
| // Add the element size to the total size. |
| sz += elsize |
| } |
| |
| return |
| } |
| |
| // spill writes the nodes to dirty pages and splits nodes as it goes. |
| // Returns an error if dirty pages cannot be allocated. |
| func (n *node) spill() error { |
| var tx = n.bucket.tx |
| if n.spilled { |
| return nil |
| } |
| |
| // Spill child nodes first. Child nodes can materialize sibling nodes in |
| // the case of split-merge so we cannot use a range loop. We have to check |
| // the children size on every loop iteration. |
| sort.Sort(n.children) |
| for i := 0; i < len(n.children); i++ { |
| if err := n.children[i].spill(); err != nil { |
| return err |
| } |
| } |
| |
| // We no longer need the child list because it's only used for spill tracking. |
| n.children = nil |
| |
| // Split nodes into appropriate sizes. The first node will always be n. |
| var nodes = n.split(tx.db.pageSize) |
| for _, node := range nodes { |
| // Add node's page to the freelist if it's not new. |
| if node.pgid > 0 { |
| tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid)) |
| node.pgid = 0 |
| } |
| |
| // Allocate contiguous space for the node. |
| p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize) |
| if err != nil { |
| return err |
| } |
| |
| // Write the node. |
| if p.id >= tx.meta.pgid { |
| panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid)) |
| } |
| node.pgid = p.id |
| node.write(p) |
| node.spilled = true |
| |
| // Insert into parent inodes. |
| if node.parent != nil { |
| var key = node.key |
| if key == nil { |
| key = node.inodes[0].key |
| } |
| |
| node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0) |
| node.key = node.inodes[0].key |
| _assert(len(node.key) > 0, "spill: zero-length node key") |
| } |
| |
| // Update the statistics. |
| tx.stats.Spill++ |
| } |
| |
| // If the root node split and created a new root then we need to spill that |
| // as well. We'll clear out the children to make sure it doesn't try to respill. |
| if n.parent != nil && n.parent.pgid == 0 { |
| n.children = nil |
| return n.parent.spill() |
| } |
| |
| return nil |
| } |
| |
| // rebalance attempts to combine the node with sibling nodes if the node fill |
| // size is below a threshold or if there are not enough keys. |
| func (n *node) rebalance() { |
| if !n.unbalanced { |
| return |
| } |
| n.unbalanced = false |
| |
| // Update statistics. |
| n.bucket.tx.stats.Rebalance++ |
| |
| // Ignore if node is above threshold (25%) and has enough keys. |
| var threshold = n.bucket.tx.db.pageSize / 4 |
| if n.size() > threshold && len(n.inodes) > n.minKeys() { |
| return |
| } |
| |
| // Root node has special handling. |
| if n.parent == nil { |
| // If root node is a branch and only has one node then collapse it. |
| if !n.isLeaf && len(n.inodes) == 1 { |
| // Move root's child up. |
| child := n.bucket.node(n.inodes[0].pgid, n) |
| n.isLeaf = child.isLeaf |
| n.inodes = child.inodes[:] |
| n.children = child.children |
| |
| // Reparent all child nodes being moved. |
| for _, inode := range n.inodes { |
| if child, ok := n.bucket.nodes[inode.pgid]; ok { |
| child.parent = n |
| } |
| } |
| |
| // Remove old child. |
| child.parent = nil |
| delete(n.bucket.nodes, child.pgid) |
| child.free() |
| } |
| |
| return |
| } |
| |
| // If node has no keys then just remove it. |
| if n.numChildren() == 0 { |
| n.parent.del(n.key) |
| n.parent.removeChild(n) |
| delete(n.bucket.nodes, n.pgid) |
| n.free() |
| n.parent.rebalance() |
| return |
| } |
| |
| _assert(n.parent.numChildren() > 1, "parent must have at least 2 children") |
| |
| // Destination node is right sibling if idx == 0, otherwise left sibling. |
| var target *node |
| var useNextSibling = (n.parent.childIndex(n) == 0) |
| if useNextSibling { |
| target = n.nextSibling() |
| } else { |
| target = n.prevSibling() |
| } |
| |
| // If both this node and the target node are too small then merge them. |
| if useNextSibling { |
| // Reparent all child nodes being moved. |
| for _, inode := range target.inodes { |
| if child, ok := n.bucket.nodes[inode.pgid]; ok { |
| child.parent.removeChild(child) |
| child.parent = n |
| child.parent.children = append(child.parent.children, child) |
| } |
| } |
| |
| // Copy over inodes from target and remove target. |
| n.inodes = append(n.inodes, target.inodes...) |
| n.parent.del(target.key) |
| n.parent.removeChild(target) |
| delete(n.bucket.nodes, target.pgid) |
| target.free() |
| } else { |
| // Reparent all child nodes being moved. |
| for _, inode := range n.inodes { |
| if child, ok := n.bucket.nodes[inode.pgid]; ok { |
| child.parent.removeChild(child) |
| child.parent = target |
| child.parent.children = append(child.parent.children, child) |
| } |
| } |
| |
| // Copy over inodes to target and remove node. |
| target.inodes = append(target.inodes, n.inodes...) |
| n.parent.del(n.key) |
| n.parent.removeChild(n) |
| delete(n.bucket.nodes, n.pgid) |
| n.free() |
| } |
| |
| // Either this node or the target node was deleted from the parent so rebalance it. |
| n.parent.rebalance() |
| } |
| |
| // removes a node from the list of in-memory children. |
| // This does not affect the inodes. |
| func (n *node) removeChild(target *node) { |
| for i, child := range n.children { |
| if child == target { |
| n.children = append(n.children[:i], n.children[i+1:]...) |
| return |
| } |
| } |
| } |
| |
| // dereference causes the node to copy all its inode key/value references to heap memory. |
| // This is required when the mmap is reallocated so inodes are not pointing to stale data. |
| func (n *node) dereference() { |
| if n.key != nil { |
| key := make([]byte, len(n.key)) |
| copy(key, n.key) |
| n.key = key |
| _assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node") |
| } |
| |
| for i := range n.inodes { |
| inode := &n.inodes[i] |
| |
| key := make([]byte, len(inode.key)) |
| copy(key, inode.key) |
| inode.key = key |
| _assert(len(inode.key) > 0, "dereference: zero-length inode key") |
| |
| value := make([]byte, len(inode.value)) |
| copy(value, inode.value) |
| inode.value = value |
| } |
| |
| // Recursively dereference children. |
| for _, child := range n.children { |
| child.dereference() |
| } |
| |
| // Update statistics. |
| n.bucket.tx.stats.NodeDeref++ |
| } |
| |
| // free adds the node's underlying page to the freelist. |
| func (n *node) free() { |
| if n.pgid != 0 { |
| n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid)) |
| n.pgid = 0 |
| } |
| } |
| |
| // dump writes the contents of the node to STDERR for debugging purposes. |
| /* |
| func (n *node) dump() { |
| // Write node header. |
| var typ = "branch" |
| if n.isLeaf { |
| typ = "leaf" |
| } |
| warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes)) |
| |
| // Write out abbreviated version of each item. |
| for _, item := range n.inodes { |
| if n.isLeaf { |
| if item.flags&bucketLeafFlag != 0 { |
| bucket := (*bucket)(unsafe.Pointer(&item.value[0])) |
| warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root) |
| } else { |
| warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4)) |
| } |
| } else { |
| warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid) |
| } |
| } |
| warn("") |
| } |
| */ |
| |
| type nodes []*node |
| |
| func (s nodes) Len() int { return len(s) } |
| func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] } |
| func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 } |
| |
| // inode represents an internal node inside of a node. |
| // It can be used to point to elements in a page or point |
| // to an element which hasn't been added to a page yet. |
| type inode struct { |
| flags uint32 |
| pgid pgid |
| key []byte |
| value []byte |
| } |
| |
| type inodes []inode |