blob: 6c3fa553ea68d762f9392c825775856168c28429 [file] [log] [blame]
khenaidooab1f7bd2019-11-14 14:00:27 -05001package bbolt
2
3import (
4 "bytes"
5 "fmt"
6 "sort"
7 "unsafe"
8)
9
10// node represents an in-memory, deserialized page.
11type node struct {
12 bucket *Bucket
13 isLeaf bool
14 unbalanced bool
15 spilled bool
16 key []byte
17 pgid pgid
18 parent *node
19 children nodes
20 inodes inodes
21}
22
23// root returns the top-level node this node is attached to.
24func (n *node) root() *node {
25 if n.parent == nil {
26 return n
27 }
28 return n.parent.root()
29}
30
31// minKeys returns the minimum number of inodes this node should have.
32func (n *node) minKeys() int {
33 if n.isLeaf {
34 return 1
35 }
36 return 2
37}
38
39// size returns the size of the node after serialization.
40func (n *node) size() int {
41 sz, elsz := pageHeaderSize, n.pageElementSize()
42 for i := 0; i < len(n.inodes); i++ {
43 item := &n.inodes[i]
44 sz += elsz + len(item.key) + len(item.value)
45 }
46 return sz
47}
48
49// sizeLessThan returns true if the node is less than a given size.
50// This is an optimization to avoid calculating a large node when we only need
51// to know if it fits inside a certain page size.
52func (n *node) sizeLessThan(v int) bool {
53 sz, elsz := pageHeaderSize, n.pageElementSize()
54 for i := 0; i < len(n.inodes); i++ {
55 item := &n.inodes[i]
56 sz += elsz + len(item.key) + len(item.value)
57 if sz >= v {
58 return false
59 }
60 }
61 return true
62}
63
64// pageElementSize returns the size of each page element based on the type of node.
65func (n *node) pageElementSize() int {
66 if n.isLeaf {
67 return leafPageElementSize
68 }
69 return branchPageElementSize
70}
71
72// childAt returns the child node at a given index.
73func (n *node) childAt(index int) *node {
74 if n.isLeaf {
75 panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
76 }
77 return n.bucket.node(n.inodes[index].pgid, n)
78}
79
80// childIndex returns the index of a given child node.
81func (n *node) childIndex(child *node) int {
82 index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 })
83 return index
84}
85
86// numChildren returns the number of children.
87func (n *node) numChildren() int {
88 return len(n.inodes)
89}
90
91// nextSibling returns the next node with the same parent.
92func (n *node) nextSibling() *node {
93 if n.parent == nil {
94 return nil
95 }
96 index := n.parent.childIndex(n)
97 if index >= n.parent.numChildren()-1 {
98 return nil
99 }
100 return n.parent.childAt(index + 1)
101}
102
103// prevSibling returns the previous node with the same parent.
104func (n *node) prevSibling() *node {
105 if n.parent == nil {
106 return nil
107 }
108 index := n.parent.childIndex(n)
109 if index == 0 {
110 return nil
111 }
112 return n.parent.childAt(index - 1)
113}
114
115// put inserts a key/value.
116func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
117 if pgid >= n.bucket.tx.meta.pgid {
118 panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
119 } else if len(oldKey) <= 0 {
120 panic("put: zero-length old key")
121 } else if len(newKey) <= 0 {
122 panic("put: zero-length new key")
123 }
124
125 // Find insertion index.
126 index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })
127
128 // Add capacity and shift nodes if we don't have an exact match and need to insert.
129 exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
130 if !exact {
131 n.inodes = append(n.inodes, inode{})
132 copy(n.inodes[index+1:], n.inodes[index:])
133 }
134
135 inode := &n.inodes[index]
136 inode.flags = flags
137 inode.key = newKey
138 inode.value = value
139 inode.pgid = pgid
140 _assert(len(inode.key) > 0, "put: zero-length inode key")
141}
142
143// del removes a key from the node.
144func (n *node) del(key []byte) {
145 // Find index of key.
146 index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })
147
148 // Exit if the key isn't found.
149 if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) {
150 return
151 }
152
153 // Delete inode from the node.
154 n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)
155
156 // Mark the node as needing rebalancing.
157 n.unbalanced = true
158}
159
160// read initializes the node from a page.
161func (n *node) read(p *page) {
162 n.pgid = p.id
163 n.isLeaf = ((p.flags & leafPageFlag) != 0)
164 n.inodes = make(inodes, int(p.count))
165
166 for i := 0; i < int(p.count); i++ {
167 inode := &n.inodes[i]
168 if n.isLeaf {
169 elem := p.leafPageElement(uint16(i))
170 inode.flags = elem.flags
171 inode.key = elem.key()
172 inode.value = elem.value()
173 } else {
174 elem := p.branchPageElement(uint16(i))
175 inode.pgid = elem.pgid
176 inode.key = elem.key()
177 }
178 _assert(len(inode.key) > 0, "read: zero-length inode key")
179 }
180
181 // Save first key so we can find the node in the parent when we spill.
182 if len(n.inodes) > 0 {
183 n.key = n.inodes[0].key
184 _assert(len(n.key) > 0, "read: zero-length node key")
185 } else {
186 n.key = nil
187 }
188}
189
190// write writes the items onto one or more pages.
191func (n *node) write(p *page) {
192 // Initialize page.
193 if n.isLeaf {
194 p.flags |= leafPageFlag
195 } else {
196 p.flags |= branchPageFlag
197 }
198
199 if len(n.inodes) >= 0xFFFF {
200 panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
201 }
202 p.count = uint16(len(n.inodes))
203
204 // Stop here if there are no items to write.
205 if p.count == 0 {
206 return
207 }
208
209 // Loop over each item and write it to the page.
210 b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):]
211 for i, item := range n.inodes {
212 _assert(len(item.key) > 0, "write: zero-length inode key")
213
214 // Write the page element.
215 if n.isLeaf {
216 elem := p.leafPageElement(uint16(i))
217 elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
218 elem.flags = item.flags
219 elem.ksize = uint32(len(item.key))
220 elem.vsize = uint32(len(item.value))
221 } else {
222 elem := p.branchPageElement(uint16(i))
223 elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
224 elem.ksize = uint32(len(item.key))
225 elem.pgid = item.pgid
226 _assert(elem.pgid != p.id, "write: circular dependency occurred")
227 }
228
229 // If the length of key+value is larger than the max allocation size
230 // then we need to reallocate the byte array pointer.
231 //
232 // See: https://github.com/boltdb/bolt/pull/335
233 klen, vlen := len(item.key), len(item.value)
234 if len(b) < klen+vlen {
235 b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0]))[:]
236 }
237
238 // Write data for the element to the end of the page.
239 copy(b[0:], item.key)
240 b = b[klen:]
241 copy(b[0:], item.value)
242 b = b[vlen:]
243 }
244
245 // DEBUG ONLY: n.dump()
246}
247
248// split breaks up a node into multiple smaller nodes, if appropriate.
249// This should only be called from the spill() function.
250func (n *node) split(pageSize int) []*node {
251 var nodes []*node
252
253 node := n
254 for {
255 // Split node into two.
256 a, b := node.splitTwo(pageSize)
257 nodes = append(nodes, a)
258
259 // If we can't split then exit the loop.
260 if b == nil {
261 break
262 }
263
264 // Set node to b so it gets split on the next iteration.
265 node = b
266 }
267
268 return nodes
269}
270
271// splitTwo breaks up a node into two smaller nodes, if appropriate.
272// This should only be called from the split() function.
273func (n *node) splitTwo(pageSize int) (*node, *node) {
274 // Ignore the split if the page doesn't have at least enough nodes for
275 // two pages or if the nodes can fit in a single page.
276 if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
277 return n, nil
278 }
279
280 // Determine the threshold before starting a new node.
281 var fillPercent = n.bucket.FillPercent
282 if fillPercent < minFillPercent {
283 fillPercent = minFillPercent
284 } else if fillPercent > maxFillPercent {
285 fillPercent = maxFillPercent
286 }
287 threshold := int(float64(pageSize) * fillPercent)
288
289 // Determine split position and sizes of the two pages.
290 splitIndex, _ := n.splitIndex(threshold)
291
292 // Split node into two separate nodes.
293 // If there's no parent then we'll need to create one.
294 if n.parent == nil {
295 n.parent = &node{bucket: n.bucket, children: []*node{n}}
296 }
297
298 // Create a new node and add it to the parent.
299 next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
300 n.parent.children = append(n.parent.children, next)
301
302 // Split inodes across two nodes.
303 next.inodes = n.inodes[splitIndex:]
304 n.inodes = n.inodes[:splitIndex]
305
306 // Update the statistics.
307 n.bucket.tx.stats.Split++
308
309 return n, next
310}
311
312// splitIndex finds the position where a page will fill a given threshold.
313// It returns the index as well as the size of the first page.
314// This is only be called from split().
315func (n *node) splitIndex(threshold int) (index, sz int) {
316 sz = pageHeaderSize
317
318 // Loop until we only have the minimum number of keys required for the second page.
319 for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
320 index = i
321 inode := n.inodes[i]
322 elsize := n.pageElementSize() + len(inode.key) + len(inode.value)
323
324 // If we have at least the minimum number of keys and adding another
325 // node would put us over the threshold then exit and return.
326 if i >= minKeysPerPage && sz+elsize > threshold {
327 break
328 }
329
330 // Add the element size to the total size.
331 sz += elsize
332 }
333
334 return
335}
336
337// spill writes the nodes to dirty pages and splits nodes as it goes.
338// Returns an error if dirty pages cannot be allocated.
339func (n *node) spill() error {
340 var tx = n.bucket.tx
341 if n.spilled {
342 return nil
343 }
344
345 // Spill child nodes first. Child nodes can materialize sibling nodes in
346 // the case of split-merge so we cannot use a range loop. We have to check
347 // the children size on every loop iteration.
348 sort.Sort(n.children)
349 for i := 0; i < len(n.children); i++ {
350 if err := n.children[i].spill(); err != nil {
351 return err
352 }
353 }
354
355 // We no longer need the child list because it's only used for spill tracking.
356 n.children = nil
357
358 // Split nodes into appropriate sizes. The first node will always be n.
359 var nodes = n.split(tx.db.pageSize)
360 for _, node := range nodes {
361 // Add node's page to the freelist if it's not new.
362 if node.pgid > 0 {
363 tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
364 node.pgid = 0
365 }
366
367 // Allocate contiguous space for the node.
368 p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize)
369 if err != nil {
370 return err
371 }
372
373 // Write the node.
374 if p.id >= tx.meta.pgid {
375 panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
376 }
377 node.pgid = p.id
378 node.write(p)
379 node.spilled = true
380
381 // Insert into parent inodes.
382 if node.parent != nil {
383 var key = node.key
384 if key == nil {
385 key = node.inodes[0].key
386 }
387
388 node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
389 node.key = node.inodes[0].key
390 _assert(len(node.key) > 0, "spill: zero-length node key")
391 }
392
393 // Update the statistics.
394 tx.stats.Spill++
395 }
396
397 // If the root node split and created a new root then we need to spill that
398 // as well. We'll clear out the children to make sure it doesn't try to respill.
399 if n.parent != nil && n.parent.pgid == 0 {
400 n.children = nil
401 return n.parent.spill()
402 }
403
404 return nil
405}
406
407// rebalance attempts to combine the node with sibling nodes if the node fill
408// size is below a threshold or if there are not enough keys.
409func (n *node) rebalance() {
410 if !n.unbalanced {
411 return
412 }
413 n.unbalanced = false
414
415 // Update statistics.
416 n.bucket.tx.stats.Rebalance++
417
418 // Ignore if node is above threshold (25%) and has enough keys.
419 var threshold = n.bucket.tx.db.pageSize / 4
420 if n.size() > threshold && len(n.inodes) > n.minKeys() {
421 return
422 }
423
424 // Root node has special handling.
425 if n.parent == nil {
426 // If root node is a branch and only has one node then collapse it.
427 if !n.isLeaf && len(n.inodes) == 1 {
428 // Move root's child up.
429 child := n.bucket.node(n.inodes[0].pgid, n)
430 n.isLeaf = child.isLeaf
431 n.inodes = child.inodes[:]
432 n.children = child.children
433
434 // Reparent all child nodes being moved.
435 for _, inode := range n.inodes {
436 if child, ok := n.bucket.nodes[inode.pgid]; ok {
437 child.parent = n
438 }
439 }
440
441 // Remove old child.
442 child.parent = nil
443 delete(n.bucket.nodes, child.pgid)
444 child.free()
445 }
446
447 return
448 }
449
450 // If node has no keys then just remove it.
451 if n.numChildren() == 0 {
452 n.parent.del(n.key)
453 n.parent.removeChild(n)
454 delete(n.bucket.nodes, n.pgid)
455 n.free()
456 n.parent.rebalance()
457 return
458 }
459
460 _assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
461
462 // Destination node is right sibling if idx == 0, otherwise left sibling.
463 var target *node
464 var useNextSibling = (n.parent.childIndex(n) == 0)
465 if useNextSibling {
466 target = n.nextSibling()
467 } else {
468 target = n.prevSibling()
469 }
470
471 // If both this node and the target node are too small then merge them.
472 if useNextSibling {
473 // Reparent all child nodes being moved.
474 for _, inode := range target.inodes {
475 if child, ok := n.bucket.nodes[inode.pgid]; ok {
476 child.parent.removeChild(child)
477 child.parent = n
478 child.parent.children = append(child.parent.children, child)
479 }
480 }
481
482 // Copy over inodes from target and remove target.
483 n.inodes = append(n.inodes, target.inodes...)
484 n.parent.del(target.key)
485 n.parent.removeChild(target)
486 delete(n.bucket.nodes, target.pgid)
487 target.free()
488 } else {
489 // Reparent all child nodes being moved.
490 for _, inode := range n.inodes {
491 if child, ok := n.bucket.nodes[inode.pgid]; ok {
492 child.parent.removeChild(child)
493 child.parent = target
494 child.parent.children = append(child.parent.children, child)
495 }
496 }
497
498 // Copy over inodes to target and remove node.
499 target.inodes = append(target.inodes, n.inodes...)
500 n.parent.del(n.key)
501 n.parent.removeChild(n)
502 delete(n.bucket.nodes, n.pgid)
503 n.free()
504 }
505
506 // Either this node or the target node was deleted from the parent so rebalance it.
507 n.parent.rebalance()
508}
509
510// removes a node from the list of in-memory children.
511// This does not affect the inodes.
512func (n *node) removeChild(target *node) {
513 for i, child := range n.children {
514 if child == target {
515 n.children = append(n.children[:i], n.children[i+1:]...)
516 return
517 }
518 }
519}
520
521// dereference causes the node to copy all its inode key/value references to heap memory.
522// This is required when the mmap is reallocated so inodes are not pointing to stale data.
523func (n *node) dereference() {
524 if n.key != nil {
525 key := make([]byte, len(n.key))
526 copy(key, n.key)
527 n.key = key
528 _assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
529 }
530
531 for i := range n.inodes {
532 inode := &n.inodes[i]
533
534 key := make([]byte, len(inode.key))
535 copy(key, inode.key)
536 inode.key = key
537 _assert(len(inode.key) > 0, "dereference: zero-length inode key")
538
539 value := make([]byte, len(inode.value))
540 copy(value, inode.value)
541 inode.value = value
542 }
543
544 // Recursively dereference children.
545 for _, child := range n.children {
546 child.dereference()
547 }
548
549 // Update statistics.
550 n.bucket.tx.stats.NodeDeref++
551}
552
553// free adds the node's underlying page to the freelist.
554func (n *node) free() {
555 if n.pgid != 0 {
556 n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid))
557 n.pgid = 0
558 }
559}
560
561// dump writes the contents of the node to STDERR for debugging purposes.
562/*
563func (n *node) dump() {
564 // Write node header.
565 var typ = "branch"
566 if n.isLeaf {
567 typ = "leaf"
568 }
569 warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))
570
571 // Write out abbreviated version of each item.
572 for _, item := range n.inodes {
573 if n.isLeaf {
574 if item.flags&bucketLeafFlag != 0 {
575 bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
576 warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
577 } else {
578 warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
579 }
580 } else {
581 warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
582 }
583 }
584 warn("")
585}
586*/
587
588type nodes []*node
589
590func (s nodes) Len() int { return len(s) }
591func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
592func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 }
593
594// inode represents an internal node inside of a node.
595// It can be used to point to elements in a page or point
596// to an element which hasn't been added to a page yet.
597type inode struct {
598 flags uint32
599 pgid pgid
600 key []byte
601 value []byte
602}
603
604type inodes []inode