blob: d441b69256dad669342d4d71933b21e3005487c7 [file] [log] [blame]
khenaidood948f772021-08-11 17:49:24 -04001package bbolt
2
3import (
4 "fmt"
5 "reflect"
6 "sort"
7 "unsafe"
8)
9
10// txPending holds a list of pgids and corresponding allocation txns
11// that are pending to be freed.
12type txPending struct {
13 ids []pgid
14 alloctx []txid // txids allocating the ids
15 lastReleaseBegin txid // beginning txid of last matching releaseRange
16}
17
18// pidSet holds the set of starting pgids which have the same span size
19type pidSet map[pgid]struct{}
20
21// freelist represents a list of all pages that are available for allocation.
22// It also tracks pages that have been freed but are still in use by open transactions.
23type freelist struct {
24 freelistType FreelistType // freelist type
25 ids []pgid // all free and available free page ids.
26 allocs map[pgid]txid // mapping of txid that allocated a pgid.
27 pending map[txid]*txPending // mapping of soon-to-be free page ids by tx.
28 cache map[pgid]bool // fast lookup of all free and pending page ids.
29 freemaps map[uint64]pidSet // key is the size of continuous pages(span), value is a set which contains the starting pgids of same size
30 forwardMap map[pgid]uint64 // key is start pgid, value is its span size
31 backwardMap map[pgid]uint64 // key is end pgid, value is its span size
32 allocate func(txid txid, n int) pgid // the freelist allocate func
33 free_count func() int // the function which gives you free page number
34 mergeSpans func(ids pgids) // the mergeSpan func
35 getFreePageIDs func() []pgid // get free pgids func
36 readIDs func(pgids []pgid) // readIDs func reads list of pages and init the freelist
37}
38
39// newFreelist returns an empty, initialized freelist.
40func newFreelist(freelistType FreelistType) *freelist {
41 f := &freelist{
42 freelistType: freelistType,
43 allocs: make(map[pgid]txid),
44 pending: make(map[txid]*txPending),
45 cache: make(map[pgid]bool),
46 freemaps: make(map[uint64]pidSet),
47 forwardMap: make(map[pgid]uint64),
48 backwardMap: make(map[pgid]uint64),
49 }
50
51 if freelistType == FreelistMapType {
52 f.allocate = f.hashmapAllocate
53 f.free_count = f.hashmapFreeCount
54 f.mergeSpans = f.hashmapMergeSpans
55 f.getFreePageIDs = f.hashmapGetFreePageIDs
56 f.readIDs = f.hashmapReadIDs
57 } else {
58 f.allocate = f.arrayAllocate
59 f.free_count = f.arrayFreeCount
60 f.mergeSpans = f.arrayMergeSpans
61 f.getFreePageIDs = f.arrayGetFreePageIDs
62 f.readIDs = f.arrayReadIDs
63 }
64
65 return f
66}
67
68// size returns the size of the page after serialization.
69func (f *freelist) size() int {
70 n := f.count()
71 if n >= 0xFFFF {
72 // The first element will be used to store the count. See freelist.write.
73 n++
74 }
75 return int(pageHeaderSize) + (int(unsafe.Sizeof(pgid(0))) * n)
76}
77
78// count returns count of pages on the freelist
79func (f *freelist) count() int {
80 return f.free_count() + f.pending_count()
81}
82
83// arrayFreeCount returns count of free pages(array version)
84func (f *freelist) arrayFreeCount() int {
85 return len(f.ids)
86}
87
88// pending_count returns count of pending pages
89func (f *freelist) pending_count() int {
90 var count int
91 for _, txp := range f.pending {
92 count += len(txp.ids)
93 }
94 return count
95}
96
97// copyallunsafe copies a list of all free ids and all pending ids in one sorted list.
98// f.count returns the minimum length required for dst.
99func (f *freelist) copyallunsafe(dstptr unsafe.Pointer) { // dstptr is []pgid data pointer
100 m := make(pgids, 0, f.pending_count())
101 for _, txp := range f.pending {
102 m = append(m, txp.ids...)
103 }
104 sort.Sort(m)
105 fpgids := f.getFreePageIDs()
106 sz := len(fpgids) + len(m)
107 dst := *(*[]pgid)(unsafe.Pointer(&reflect.SliceHeader{
108 Data: uintptr(dstptr),
109 Len: sz,
110 Cap: sz,
111 }))
112 mergepgids(dst, fpgids, m)
113}
114
115func (f *freelist) copyall(dst []pgid) {
116 m := make(pgids, 0, f.pending_count())
117 for _, txp := range f.pending {
118 m = append(m, txp.ids...)
119 }
120 sort.Sort(m)
121 mergepgids(dst, f.getFreePageIDs(), m)
122}
123
124// arrayAllocate returns the starting page id of a contiguous list of pages of a given size.
125// If a contiguous block cannot be found then 0 is returned.
126func (f *freelist) arrayAllocate(txid txid, n int) pgid {
127 if len(f.ids) == 0 {
128 return 0
129 }
130
131 var initial, previd pgid
132 for i, id := range f.ids {
133 if id <= 1 {
134 panic(fmt.Sprintf("invalid page allocation: %d", id))
135 }
136
137 // Reset initial page if this is not contiguous.
138 if previd == 0 || id-previd != 1 {
139 initial = id
140 }
141
142 // If we found a contiguous block then remove it and return it.
143 if (id-initial)+1 == pgid(n) {
144 // If we're allocating off the beginning then take the fast path
145 // and just adjust the existing slice. This will use extra memory
146 // temporarily but the append() in free() will realloc the slice
147 // as is necessary.
148 if (i + 1) == n {
149 f.ids = f.ids[i+1:]
150 } else {
151 copy(f.ids[i-n+1:], f.ids[i+1:])
152 f.ids = f.ids[:len(f.ids)-n]
153 }
154
155 // Remove from the free cache.
156 for i := pgid(0); i < pgid(n); i++ {
157 delete(f.cache, initial+i)
158 }
159 f.allocs[initial] = txid
160 return initial
161 }
162
163 previd = id
164 }
165 return 0
166}
167
168// free releases a page and its overflow for a given transaction id.
169// If the page is already free then a panic will occur.
170func (f *freelist) free(txid txid, p *page) {
171 if p.id <= 1 {
172 panic(fmt.Sprintf("cannot free page 0 or 1: %d", p.id))
173 }
174
175 // Free page and all its overflow pages.
176 txp := f.pending[txid]
177 if txp == nil {
178 txp = &txPending{}
179 f.pending[txid] = txp
180 }
181 allocTxid, ok := f.allocs[p.id]
182 if ok {
183 delete(f.allocs, p.id)
184 } else if (p.flags & freelistPageFlag) != 0 {
185 // Freelist is always allocated by prior tx.
186 allocTxid = txid - 1
187 }
188
189 for id := p.id; id <= p.id+pgid(p.overflow); id++ {
190 // Verify that page is not already free.
191 if f.cache[id] {
192 panic(fmt.Sprintf("page %d already freed", id))
193 }
194 // Add to the freelist and cache.
195 txp.ids = append(txp.ids, id)
196 txp.alloctx = append(txp.alloctx, allocTxid)
197 f.cache[id] = true
198 }
199}
200
201// release moves all page ids for a transaction id (or older) to the freelist.
202func (f *freelist) release(txid txid) {
203 m := make(pgids, 0)
204 for tid, txp := range f.pending {
205 if tid <= txid {
206 // Move transaction's pending pages to the available freelist.
207 // Don't remove from the cache since the page is still free.
208 m = append(m, txp.ids...)
209 delete(f.pending, tid)
210 }
211 }
212 f.mergeSpans(m)
213}
214
215// releaseRange moves pending pages allocated within an extent [begin,end] to the free list.
216func (f *freelist) releaseRange(begin, end txid) {
217 if begin > end {
218 return
219 }
220 var m pgids
221 for tid, txp := range f.pending {
222 if tid < begin || tid > end {
223 continue
224 }
225 // Don't recompute freed pages if ranges haven't updated.
226 if txp.lastReleaseBegin == begin {
227 continue
228 }
229 for i := 0; i < len(txp.ids); i++ {
230 if atx := txp.alloctx[i]; atx < begin || atx > end {
231 continue
232 }
233 m = append(m, txp.ids[i])
234 txp.ids[i] = txp.ids[len(txp.ids)-1]
235 txp.ids = txp.ids[:len(txp.ids)-1]
236 txp.alloctx[i] = txp.alloctx[len(txp.alloctx)-1]
237 txp.alloctx = txp.alloctx[:len(txp.alloctx)-1]
238 i--
239 }
240 txp.lastReleaseBegin = begin
241 if len(txp.ids) == 0 {
242 delete(f.pending, tid)
243 }
244 }
245 f.mergeSpans(m)
246}
247
248// rollback removes the pages from a given pending tx.
249func (f *freelist) rollback(txid txid) {
250 // Remove page ids from cache.
251 txp := f.pending[txid]
252 if txp == nil {
253 return
254 }
255 var m pgids
256 for i, pgid := range txp.ids {
257 delete(f.cache, pgid)
258 tx := txp.alloctx[i]
259 if tx == 0 {
260 continue
261 }
262 if tx != txid {
263 // Pending free aborted; restore page back to alloc list.
264 f.allocs[pgid] = tx
265 } else {
266 // Freed page was allocated by this txn; OK to throw away.
267 m = append(m, pgid)
268 }
269 }
270 // Remove pages from pending list and mark as free if allocated by txid.
271 delete(f.pending, txid)
272 f.mergeSpans(m)
273}
274
275// freed returns whether a given page is in the free list.
276func (f *freelist) freed(pgid pgid) bool {
277 return f.cache[pgid]
278}
279
280// read initializes the freelist from a freelist page.
281func (f *freelist) read(p *page) {
282 if (p.flags & freelistPageFlag) == 0 {
283 panic(fmt.Sprintf("invalid freelist page: %d, page type is %s", p.id, p.typ()))
284 }
285 // If the page.count is at the max uint16 value (64k) then it's considered
286 // an overflow and the size of the freelist is stored as the first element.
287 var idx, count uintptr = 0, uintptr(p.count)
288 if count == 0xFFFF {
289 idx = 1
290 count = uintptr(*(*pgid)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + unsafe.Sizeof(*p))))
291 }
292
293 // Copy the list of page ids from the freelist.
294 if count == 0 {
295 f.ids = nil
296 } else {
297 ids := *(*[]pgid)(unsafe.Pointer(&reflect.SliceHeader{
298 Data: uintptr(unsafe.Pointer(p)) + unsafe.Sizeof(*p) + idx*unsafe.Sizeof(pgid(0)),
299 Len: int(count),
300 Cap: int(count),
301 }))
302
303 // copy the ids, so we don't modify on the freelist page directly
304 idsCopy := make([]pgid, count)
305 copy(idsCopy, ids)
306 // Make sure they're sorted.
307 sort.Sort(pgids(idsCopy))
308
309 f.readIDs(idsCopy)
310 }
311}
312
313// arrayReadIDs initializes the freelist from a given list of ids.
314func (f *freelist) arrayReadIDs(ids []pgid) {
315 f.ids = ids
316 f.reindex()
317}
318
319func (f *freelist) arrayGetFreePageIDs() []pgid {
320 return f.ids
321}
322
323// write writes the page ids onto a freelist page. All free and pending ids are
324// saved to disk since in the event of a program crash, all pending ids will
325// become free.
326func (f *freelist) write(p *page) error {
327 // Combine the old free pgids and pgids waiting on an open transaction.
328
329 // Update the header flag.
330 p.flags |= freelistPageFlag
331
332 // The page.count can only hold up to 64k elements so if we overflow that
333 // number then we handle it by putting the size in the first element.
334 lenids := f.count()
335 if lenids == 0 {
336 p.count = uint16(lenids)
337 } else if lenids < 0xFFFF {
338 p.count = uint16(lenids)
339 f.copyallunsafe(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + unsafe.Sizeof(*p)))
340 } else {
341 p.count = 0xFFFF
342 *(*pgid)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + unsafe.Sizeof(*p))) = pgid(lenids)
343 f.copyallunsafe(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + unsafe.Sizeof(*p) + unsafe.Sizeof(pgid(0))))
344 }
345
346 return nil
347}
348
349// reload reads the freelist from a page and filters out pending items.
350func (f *freelist) reload(p *page) {
351 f.read(p)
352
353 // Build a cache of only pending pages.
354 pcache := make(map[pgid]bool)
355 for _, txp := range f.pending {
356 for _, pendingID := range txp.ids {
357 pcache[pendingID] = true
358 }
359 }
360
361 // Check each page in the freelist and build a new available freelist
362 // with any pages not in the pending lists.
363 var a []pgid
364 for _, id := range f.getFreePageIDs() {
365 if !pcache[id] {
366 a = append(a, id)
367 }
368 }
369
370 f.readIDs(a)
371}
372
373// noSyncReload reads the freelist from pgids and filters out pending items.
374func (f *freelist) noSyncReload(pgids []pgid) {
375 // Build a cache of only pending pages.
376 pcache := make(map[pgid]bool)
377 for _, txp := range f.pending {
378 for _, pendingID := range txp.ids {
379 pcache[pendingID] = true
380 }
381 }
382
383 // Check each page in the freelist and build a new available freelist
384 // with any pages not in the pending lists.
385 var a []pgid
386 for _, id := range pgids {
387 if !pcache[id] {
388 a = append(a, id)
389 }
390 }
391
392 f.readIDs(a)
393}
394
395// reindex rebuilds the free cache based on available and pending free lists.
396func (f *freelist) reindex() {
397 ids := f.getFreePageIDs()
398 f.cache = make(map[pgid]bool, len(ids))
399 for _, id := range ids {
400 f.cache[id] = true
401 }
402 for _, txp := range f.pending {
403 for _, pendingID := range txp.ids {
404 f.cache[pendingID] = true
405 }
406 }
407}
408
409// arrayMergeSpans try to merge list of pages(represented by pgids) with existing spans but using array
410func (f *freelist) arrayMergeSpans(ids pgids) {
411 sort.Sort(ids)
412 f.ids = pgids(f.ids).merge(ids)
413}