package bbolt

import (
	"bytes"
	"fmt"
	"reflect"
	"sort"
	"unsafe"
)

// node represents an in-memory, deserialized page.
type node struct {
	bucket     *Bucket
	isLeaf     bool
	unbalanced bool
	spilled    bool
	key        []byte
	pgid       pgid
	parent     *node
	children   nodes
	inodes     inodes
}

// root returns the top-level node this node is attached to.
func (n *node) root() *node {
	if n.parent == nil {
		return n
	}
	return n.parent.root()
}

// minKeys returns the minimum number of inodes this node should have.
func (n *node) minKeys() int {
	if n.isLeaf {
		return 1
	}
	return 2
}

// size returns the size of the node after serialization.
func (n *node) size() int {
	sz, elsz := pageHeaderSize, n.pageElementSize()
	for i := 0; i < len(n.inodes); i++ {
		item := &n.inodes[i]
		sz += elsz + uintptr(len(item.key)) + uintptr(len(item.value))
	}
	return int(sz)
}
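
// A serialized node occupies the page header, one fixed-size element
// header per inode, and the raw key/value bytes. For example, a leaf
// holding two inodes with 8-byte keys and 16-byte values needs
// pageHeaderSize + 2*leafPageElementSize + 2*(8+16) bytes.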

// sizeLessThan returns true if the node is less than a given size.
// This is an optimization to avoid calculating a large node when we only need
// to know if it fits inside a certain page size.
func (n *node) sizeLessThan(v uintptr) bool {
	sz, elsz := pageHeaderSize, n.pageElementSize()
	for i := 0; i < len(n.inodes); i++ {
		item := &n.inodes[i]
		sz += elsz + uintptr(len(item.key)) + uintptr(len(item.value))
		if sz >= v {
			return false
		}
	}
	return true
}

// pageElementSize returns the size of each page element based on the type of node.
func (n *node) pageElementSize() uintptr {
	if n.isLeaf {
		return leafPageElementSize
	}
	return branchPageElementSize
}

// childAt returns the child node at a given index.
func (n *node) childAt(index int) *node {
	if n.isLeaf {
		panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
	}
	return n.bucket.node(n.inodes[index].pgid, n)
}

// childIndex returns the index of a given child node.
func (n *node) childIndex(child *node) int {
	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 })
	return index
}
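
// childIndex relies on sort.Search's binary-search contract: it returns
// the first index i at which n.inodes[i].key >= child.key, so an existing
// key yields its exact position and a missing key yields its would-be
// insertion point. put and del below use the same pattern to locate slots.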

// numChildren returns the number of children.
func (n *node) numChildren() int {
	return len(n.inodes)
}

// nextSibling returns the next node with the same parent.
func (n *node) nextSibling() *node {
	if n.parent == nil {
		return nil
	}
	index := n.parent.childIndex(n)
	if index >= n.parent.numChildren()-1 {
		return nil
	}
	return n.parent.childAt(index + 1)
}

// prevSibling returns the previous node with the same parent.
func (n *node) prevSibling() *node {
	if n.parent == nil {
		return nil
	}
	index := n.parent.childIndex(n)
	if index == 0 {
		return nil
	}
	return n.parent.childAt(index - 1)
}

// put inserts a key/value pair. oldKey locates the slot: if it already
// exists, its inode is overwritten with newKey/value; otherwise a new
// inode is inserted at oldKey's sorted position.
func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
	if pgid >= n.bucket.tx.meta.pgid {
		panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
	} else if len(oldKey) <= 0 {
		panic("put: zero-length old key")
	} else if len(newKey) <= 0 {
		panic("put: zero-length new key")
	}

	// Find insertion index.
	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })

	// Add capacity and shift nodes if we don't have an exact match and need to insert.
	exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
	if !exact {
		n.inodes = append(n.inodes, inode{})
		copy(n.inodes[index+1:], n.inodes[index:])
	}

	inode := &n.inodes[index]
	inode.flags = flags
	inode.key = newKey
	inode.value = value
	inode.pgid = pgid
	_assert(len(inode.key) > 0, "put: zero-length inode key")
}
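
// Passing distinct old and new keys renames an entry in place; spill
// below uses exactly this to repoint a parent at a rewritten child's
// first key:
//
//	node.parent.put(oldKey, node.inodes[0].key, nil, node.pgid, 0)
//
// where oldKey stands for whatever key the parent currently stores
// for that child.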

// del removes a key from the node.
func (n *node) del(key []byte) {
	// Find index of key.
	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })

	// Exit if the key isn't found.
	if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) {
		return
	}

	// Delete inode from the node.
	n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)

	// Mark the node as needing rebalancing.
	n.unbalanced = true
}

// read initializes the node from a page.
func (n *node) read(p *page) {
	n.pgid = p.id
	n.isLeaf = ((p.flags & leafPageFlag) != 0)
	n.inodes = make(inodes, int(p.count))

	for i := 0; i < int(p.count); i++ {
		inode := &n.inodes[i]
		if n.isLeaf {
			elem := p.leafPageElement(uint16(i))
			inode.flags = elem.flags
			inode.key = elem.key()
			inode.value = elem.value()
		} else {
			elem := p.branchPageElement(uint16(i))
			inode.pgid = elem.pgid
			inode.key = elem.key()
		}
		_assert(len(inode.key) > 0, "read: zero-length inode key")
	}

	// Save first key so we can find the node in the parent when we spill.
	if len(n.inodes) > 0 {
		n.key = n.inodes[0].key
		_assert(len(n.key) > 0, "read: zero-length node key")
	} else {
		n.key = nil
	}
}
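
// Note that elem.key() and elem.value() return slices that alias the
// underlying mmap'd page rather than copies, which is why dereference()
// below must copy them to the heap before the mmap is remapped.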

// write writes the items onto one or more pages.
func (n *node) write(p *page) {
	// Initialize page.
	if n.isLeaf {
		p.flags |= leafPageFlag
	} else {
		p.flags |= branchPageFlag
	}

	if len(n.inodes) >= 0xFFFF {
		panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
	}
	p.count = uint16(len(n.inodes))

	// Stop here if there are no items to write.
	if p.count == 0 {
		return
	}

	// Loop over each item and write it to the page.
	bp := uintptr(unsafe.Pointer(p)) + unsafe.Sizeof(*p) + n.pageElementSize()*uintptr(len(n.inodes))
	for i, item := range n.inodes {
		_assert(len(item.key) > 0, "write: zero-length inode key")

		// Write the page element.
		if n.isLeaf {
			elem := p.leafPageElement(uint16(i))
			elem.pos = uint32(bp - uintptr(unsafe.Pointer(elem)))
			elem.flags = item.flags
			elem.ksize = uint32(len(item.key))
			elem.vsize = uint32(len(item.value))
		} else {
			elem := p.branchPageElement(uint16(i))
			elem.pos = uint32(bp - uintptr(unsafe.Pointer(elem)))
			elem.ksize = uint32(len(item.key))
			elem.pgid = item.pgid
			_assert(elem.pgid != p.id, "write: circular dependency occurred")
		}

		// Create a slice to write into of needed size and advance
		// byte pointer for next iteration.
		klen, vlen := len(item.key), len(item.value)
		sz := klen + vlen
		b := *(*[]byte)(unsafe.Pointer(&reflect.SliceHeader{
			Data: bp,
			Len:  sz,
			Cap:  sz,
		}))
		bp += uintptr(sz)

		// Write data for the element to the end of the page.
		l := copy(b, item.key)
		copy(b[l:], item.value)
	}

	// DEBUG ONLY: n.dump()
}
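
// The on-page layout produced by write is:
//
//	| page header | element headers (fixed size) | key0 val0 key1 val1 ... |
//
// Each element header records the location of its key/value data as a
// pos offset relative to the header's own address, so readers can find
// the data without absolute pointers.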

// split breaks up a node into multiple smaller nodes, if appropriate.
// This should only be called from the spill() function.
func (n *node) split(pageSize uintptr) []*node {
	var nodes []*node

	node := n
	for {
		// Split node into two.
		a, b := node.splitTwo(pageSize)
		nodes = append(nodes, a)

		// If we can't split then exit the loop.
		if b == nil {
			break
		}

		// Set node to b so it gets split on the next iteration.
		node = b
	}

	return nodes
}

// splitTwo breaks up a node into two smaller nodes, if appropriate.
// This should only be called from the split() function.
func (n *node) splitTwo(pageSize uintptr) (*node, *node) {
	// Ignore the split if the page doesn't have at least enough nodes for
	// two pages or if the nodes can fit in a single page.
	if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
		return n, nil
	}

	// Determine the threshold before starting a new node.
	var fillPercent = n.bucket.FillPercent
	if fillPercent < minFillPercent {
		fillPercent = minFillPercent
	} else if fillPercent > maxFillPercent {
		fillPercent = maxFillPercent
	}
	threshold := int(float64(pageSize) * fillPercent)

	// Determine split position and sizes of the two pages.
	splitIndex, _ := n.splitIndex(threshold)

	// Split node into two separate nodes.
	// If there's no parent then we'll need to create one.
	if n.parent == nil {
		n.parent = &node{bucket: n.bucket, children: []*node{n}}
	}

	// Create a new node and add it to the parent.
	next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
	n.parent.children = append(n.parent.children, next)

	// Split inodes across two nodes.
	next.inodes = n.inodes[splitIndex:]
	n.inodes = n.inodes[:splitIndex]

	// Update the statistics.
	n.bucket.tx.stats.Split++

	return n, next
}
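
// For example, with a 4096-byte page and the default FillPercent of 0.5,
// the threshold is 2048 bytes: the left node keeps inodes until adding
// another would push its serialized size past 2048, and the remainder
// moves to the new right node.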

// splitIndex finds the position where a page will fill a given threshold.
// It returns the index as well as the size of the first page.
// This is only called from split().
func (n *node) splitIndex(threshold int) (index, sz uintptr) {
	sz = pageHeaderSize

	// Loop until we only have the minimum number of keys required for the second page.
	for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
		index = uintptr(i)
		inode := n.inodes[i]
		elsize := n.pageElementSize() + uintptr(len(inode.key)) + uintptr(len(inode.value))

		// If we have at least the minimum number of keys and adding another
		// node would put us over the threshold then exit and return.
		if index >= minKeysPerPage && sz+elsize > uintptr(threshold) {
			break
		}

		// Add the element size to the total size.
		sz += elsize
	}

	return
}

// spill writes the nodes to dirty pages and splits nodes as it goes.
// Returns an error if dirty pages cannot be allocated.
func (n *node) spill() error {
	var tx = n.bucket.tx
	if n.spilled {
		return nil
	}

	// Spill child nodes first. Child nodes can materialize sibling nodes in
	// the case of split-merge so we cannot use a range loop. We have to check
	// the children size on every loop iteration.
	sort.Sort(n.children)
	for i := 0; i < len(n.children); i++ {
		if err := n.children[i].spill(); err != nil {
			return err
		}
	}

	// We no longer need the child list because it's only used for spill tracking.
	n.children = nil

	// Split nodes into appropriate sizes. The first node will always be n.
	var nodes = n.split(uintptr(tx.db.pageSize))
	for _, node := range nodes {
		// Add node's page to the freelist if it's not new.
		if node.pgid > 0 {
			tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
			node.pgid = 0
		}

		// Allocate contiguous space for the node.
		p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize)
		if err != nil {
			return err
		}

		// Write the node.
		if p.id >= tx.meta.pgid {
			panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
		}
		node.pgid = p.id
		node.write(p)
		node.spilled = true

		// Insert into parent inodes.
		if node.parent != nil {
			var key = node.key
			if key == nil {
				key = node.inodes[0].key
			}

			node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
			node.key = node.inodes[0].key
			_assert(len(node.key) > 0, "spill: zero-length node key")
		}

		// Update the statistics.
		tx.stats.Spill++
	}

	// If the root node split and created a new root then we need to spill that
	// as well. We'll clear out the children to make sure it doesn't try to respill.
	if n.parent != nil && n.parent.pgid == 0 {
		n.children = nil
		return n.parent.spill()
	}

	return nil
}
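
// Spilling proceeds bottom-up: children are written before their parent
// so that each parent records final child pgids, and a brand-new root
// created by a split (recognizable by its zero pgid) is spilled last.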

// rebalance attempts to combine the node with sibling nodes if the node fill
// size is below a threshold or if there are not enough keys.
func (n *node) rebalance() {
	if !n.unbalanced {
		return
	}
	n.unbalanced = false

	// Update statistics.
	n.bucket.tx.stats.Rebalance++

	// Ignore if node is above threshold (25%) and has enough keys.
	var threshold = n.bucket.tx.db.pageSize / 4
	if n.size() > threshold && len(n.inodes) > n.minKeys() {
		return
	}

	// Root node has special handling.
	if n.parent == nil {
		// If root node is a branch and only has one node then collapse it.
		if !n.isLeaf && len(n.inodes) == 1 {
			// Move root's child up.
			child := n.bucket.node(n.inodes[0].pgid, n)
			n.isLeaf = child.isLeaf
			n.inodes = child.inodes[:]
			n.children = child.children

			// Reparent all child nodes being moved.
			for _, inode := range n.inodes {
				if child, ok := n.bucket.nodes[inode.pgid]; ok {
					child.parent = n
				}
			}

			// Remove old child.
			child.parent = nil
			delete(n.bucket.nodes, child.pgid)
			child.free()
		}

		return
	}

	// If node has no keys then just remove it.
	if n.numChildren() == 0 {
		n.parent.del(n.key)
		n.parent.removeChild(n)
		delete(n.bucket.nodes, n.pgid)
		n.free()
		n.parent.rebalance()
		return
	}

	_assert(n.parent.numChildren() > 1, "parent must have at least 2 children")

	// Destination node is right sibling if idx == 0, otherwise left sibling.
	var target *node
	var useNextSibling = (n.parent.childIndex(n) == 0)
	if useNextSibling {
		target = n.nextSibling()
	} else {
		target = n.prevSibling()
	}

	// Merge this node with the chosen sibling.
	if useNextSibling {
		// Reparent all child nodes being moved.
		for _, inode := range target.inodes {
			if child, ok := n.bucket.nodes[inode.pgid]; ok {
				child.parent.removeChild(child)
				child.parent = n
				child.parent.children = append(child.parent.children, child)
			}
		}

		// Copy over inodes from target and remove target.
		n.inodes = append(n.inodes, target.inodes...)
		n.parent.del(target.key)
		n.parent.removeChild(target)
		delete(n.bucket.nodes, target.pgid)
		target.free()
	} else {
		// Reparent all child nodes being moved.
		for _, inode := range n.inodes {
			if child, ok := n.bucket.nodes[inode.pgid]; ok {
				child.parent.removeChild(child)
				child.parent = target
				child.parent.children = append(child.parent.children, child)
			}
		}

		// Copy over inodes to target and remove node.
		target.inodes = append(target.inodes, n.inodes...)
		n.parent.del(n.key)
		n.parent.removeChild(n)
		delete(n.bucket.nodes, n.pgid)
		n.free()
	}

	// Either this node or the target node was deleted from the parent so rebalance it.
	n.parent.rebalance()
}
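
// Rebalancing can cascade upward: each merge deletes one entry from the
// parent (which marks it unbalanced), so the parent may in turn fall
// below the 25% fill threshold and be merged on the recursive call.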

// removeChild removes a node from the list of in-memory children.
// This does not affect the inodes.
func (n *node) removeChild(target *node) {
	for i, child := range n.children {
		if child == target {
			n.children = append(n.children[:i], n.children[i+1:]...)
			return
		}
	}
}

// dereference causes the node to copy all its inode key/value references to heap memory.
// This is required when the mmap is reallocated so inodes are not pointing to stale data.
func (n *node) dereference() {
	if n.key != nil {
		key := make([]byte, len(n.key))
		copy(key, n.key)
		n.key = key
		_assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
	}

	for i := range n.inodes {
		inode := &n.inodes[i]

		key := make([]byte, len(inode.key))
		copy(key, inode.key)
		inode.key = key
		_assert(len(inode.key) > 0, "dereference: zero-length inode key")

		value := make([]byte, len(inode.value))
		copy(value, inode.value)
		inode.value = value
	}

	// Recursively dereference children.
	for _, child := range n.children {
		child.dereference()
	}

	// Update statistics.
	n.bucket.tx.stats.NodeDeref++
}

// free adds the node's underlying page to the freelist.
func (n *node) free() {
	if n.pgid != 0 {
		n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid))
		n.pgid = 0
	}
}

// dump writes the contents of the node to STDERR for debugging purposes.
/*
func (n *node) dump() {
	// Write node header.
	var typ = "branch"
	if n.isLeaf {
		typ = "leaf"
	}
	warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))

	// Write out abbreviated version of each item.
	for _, item := range n.inodes {
		if n.isLeaf {
			if item.flags&bucketLeafFlag != 0 {
				bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
				warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
			} else {
				warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
			}
		} else {
			warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
		}
	}
	warn("")
}
*/

// nodes is a sortable list of nodes, ordered by each node's first key;
// spill sorts a node's children with it before writing them out.
type nodes []*node

func (s nodes) Len() int      { return len(s) }
func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s nodes) Less(i, j int) bool {
	return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1
}

// inode represents an internal node inside of a node.
// It can be used to point to elements in a page or point
// to an element which hasn't been added to a page yet.
type inode struct {
	flags uint32
	pgid  pgid
	key   []byte
	value []byte
}

type inodes []inode