blob: 805922bfc9a3d6db39bc1c45bdca8012dd31a6e7 [file] [log] [blame]
khenaidood948f772021-08-11 17:49:24 -04001// Copyright 2015 The etcd Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package mvcc
16
17import (
18 "bytes"
19 "errors"
20 "fmt"
21
22 "github.com/google/btree"
23)
24
25var (
26 ErrRevisionNotFound = errors.New("mvcc: revision not found")
27)
28
29// keyIndex stores the revisions of a key in the backend.
30// Each keyIndex has at least one key generation.
31// Each generation might have several key versions.
32// Tombstone on a key appends an tombstone version at the end
33// of the current generation and creates a new empty generation.
34// Each version of a key has an index pointing to the backend.
35//
36// For example: put(1.0);put(2.0);tombstone(3.0);put(4.0);tombstone(5.0) on key "foo"
37// generate a keyIndex:
38// key: "foo"
39// rev: 5
40// generations:
41// {empty}
42// {4.0, 5.0(t)}
43// {1.0, 2.0, 3.0(t)}
44//
45// Compact a keyIndex removes the versions with smaller or equal to
46// rev except the largest one. If the generation becomes empty
47// during compaction, it will be removed. if all the generations get
48// removed, the keyIndex should be removed.
49//
50// For example:
51// compact(2) on the previous example
52// generations:
53// {empty}
54// {4.0, 5.0(t)}
55// {2.0, 3.0(t)}
56//
57// compact(4)
58// generations:
59// {empty}
60// {4.0, 5.0(t)}
61//
62// compact(5):
63// generations:
64// {empty} -> key SHOULD be removed.
65//
66// compact(6):
67// generations:
68// {empty} -> key SHOULD be removed.
69type keyIndex struct {
70 key []byte
71 modified revision // the main rev of the last modification
72 generations []generation
73}
74
75// put puts a revision to the keyIndex.
76func (ki *keyIndex) put(main int64, sub int64) {
77 rev := revision{main: main, sub: sub}
78
79 if !rev.GreaterThan(ki.modified) {
80 plog.Panicf("store.keyindex: put with unexpected smaller revision [%v / %v]", rev, ki.modified)
81 }
82 if len(ki.generations) == 0 {
83 ki.generations = append(ki.generations, generation{})
84 }
85 g := &ki.generations[len(ki.generations)-1]
86 if len(g.revs) == 0 { // create a new key
87 keysGauge.Inc()
88 g.created = rev
89 }
90 g.revs = append(g.revs, rev)
91 g.ver++
92 ki.modified = rev
93}
94
95func (ki *keyIndex) restore(created, modified revision, ver int64) {
96 if len(ki.generations) != 0 {
97 plog.Panicf("store.keyindex: cannot restore non-empty keyIndex")
98 }
99
100 ki.modified = modified
101 g := generation{created: created, ver: ver, revs: []revision{modified}}
102 ki.generations = append(ki.generations, g)
103 keysGauge.Inc()
104}
105
106// tombstone puts a revision, pointing to a tombstone, to the keyIndex.
107// It also creates a new empty generation in the keyIndex.
108// It returns ErrRevisionNotFound when tombstone on an empty generation.
109func (ki *keyIndex) tombstone(main int64, sub int64) error {
110 if ki.isEmpty() {
111 plog.Panicf("store.keyindex: unexpected tombstone on empty keyIndex %s", string(ki.key))
112 }
113 if ki.generations[len(ki.generations)-1].isEmpty() {
114 return ErrRevisionNotFound
115 }
116 ki.put(main, sub)
117 ki.generations = append(ki.generations, generation{})
118 keysGauge.Dec()
119 return nil
120}
121
122// get gets the modified, created revision and version of the key that satisfies the given atRev.
123// Rev must be higher than or equal to the given atRev.
124func (ki *keyIndex) get(atRev int64) (modified, created revision, ver int64, err error) {
125 if ki.isEmpty() {
126 plog.Panicf("store.keyindex: unexpected get on empty keyIndex %s", string(ki.key))
127 }
128 g := ki.findGeneration(atRev)
129 if g.isEmpty() {
130 return revision{}, revision{}, 0, ErrRevisionNotFound
131 }
132
133 n := g.walk(func(rev revision) bool { return rev.main > atRev })
134 if n != -1 {
135 return g.revs[n], g.created, g.ver - int64(len(g.revs)-n-1), nil
136 }
137
138 return revision{}, revision{}, 0, ErrRevisionNotFound
139}
140
141// since returns revisions since the given rev. Only the revision with the
142// largest sub revision will be returned if multiple revisions have the same
143// main revision.
144func (ki *keyIndex) since(rev int64) []revision {
145 if ki.isEmpty() {
146 plog.Panicf("store.keyindex: unexpected get on empty keyIndex %s", string(ki.key))
147 }
148 since := revision{rev, 0}
149 var gi int
150 // find the generations to start checking
151 for gi = len(ki.generations) - 1; gi > 0; gi-- {
152 g := ki.generations[gi]
153 if g.isEmpty() {
154 continue
155 }
156 if since.GreaterThan(g.created) {
157 break
158 }
159 }
160
161 var revs []revision
162 var last int64
163 for ; gi < len(ki.generations); gi++ {
164 for _, r := range ki.generations[gi].revs {
165 if since.GreaterThan(r) {
166 continue
167 }
168 if r.main == last {
169 // replace the revision with a new one that has higher sub value,
170 // because the original one should not be seen by external
171 revs[len(revs)-1] = r
172 continue
173 }
174 revs = append(revs, r)
175 last = r.main
176 }
177 }
178 return revs
179}
180
181// compact compacts a keyIndex by removing the versions with smaller or equal
182// revision than the given atRev except the largest one (If the largest one is
183// a tombstone, it will not be kept).
184// If a generation becomes empty during compaction, it will be removed.
185func (ki *keyIndex) compact(atRev int64, available map[revision]struct{}) {
186 if ki.isEmpty() {
187 plog.Panicf("store.keyindex: unexpected compact on empty keyIndex %s", string(ki.key))
188 }
189
190 genIdx, revIndex := ki.doCompact(atRev, available)
191
192 g := &ki.generations[genIdx]
193 if !g.isEmpty() {
194 // remove the previous contents.
195 if revIndex != -1 {
196 g.revs = g.revs[revIndex:]
197 }
198 // remove any tombstone
199 if len(g.revs) == 1 && genIdx != len(ki.generations)-1 {
200 delete(available, g.revs[0])
201 genIdx++
202 }
203 }
204
205 // remove the previous generations.
206 ki.generations = ki.generations[genIdx:]
207}
208
209// keep finds the revision to be kept if compact is called at given atRev.
210func (ki *keyIndex) keep(atRev int64, available map[revision]struct{}) {
211 if ki.isEmpty() {
212 return
213 }
214
215 genIdx, revIndex := ki.doCompact(atRev, available)
216 g := &ki.generations[genIdx]
217 if !g.isEmpty() {
218 // remove any tombstone
219 if revIndex == len(g.revs)-1 && genIdx != len(ki.generations)-1 {
220 delete(available, g.revs[revIndex])
221 }
222 }
223}
224
225func (ki *keyIndex) doCompact(atRev int64, available map[revision]struct{}) (genIdx int, revIndex int) {
226 // walk until reaching the first revision smaller or equal to "atRev",
227 // and add the revision to the available map
228 f := func(rev revision) bool {
229 if rev.main <= atRev {
230 available[rev] = struct{}{}
231 return false
232 }
233 return true
234 }
235
236 genIdx, g := 0, &ki.generations[0]
237 // find first generation includes atRev or created after atRev
238 for genIdx < len(ki.generations)-1 {
239 if tomb := g.revs[len(g.revs)-1].main; tomb > atRev {
240 break
241 }
242 genIdx++
243 g = &ki.generations[genIdx]
244 }
245
246 revIndex = g.walk(f)
247
248 return genIdx, revIndex
249}
250
251func (ki *keyIndex) isEmpty() bool {
252 return len(ki.generations) == 1 && ki.generations[0].isEmpty()
253}
254
255// findGeneration finds out the generation of the keyIndex that the
256// given rev belongs to. If the given rev is at the gap of two generations,
257// which means that the key does not exist at the given rev, it returns nil.
258func (ki *keyIndex) findGeneration(rev int64) *generation {
259 lastg := len(ki.generations) - 1
260 cg := lastg
261
262 for cg >= 0 {
263 if len(ki.generations[cg].revs) == 0 {
264 cg--
265 continue
266 }
267 g := ki.generations[cg]
268 if cg != lastg {
269 if tomb := g.revs[len(g.revs)-1].main; tomb <= rev {
270 return nil
271 }
272 }
273 if g.revs[0].main <= rev {
274 return &ki.generations[cg]
275 }
276 cg--
277 }
278 return nil
279}
280
281func (a *keyIndex) Less(b btree.Item) bool {
282 return bytes.Compare(a.key, b.(*keyIndex).key) == -1
283}
284
285func (a *keyIndex) equal(b *keyIndex) bool {
286 if !bytes.Equal(a.key, b.key) {
287 return false
288 }
289 if a.modified != b.modified {
290 return false
291 }
292 if len(a.generations) != len(b.generations) {
293 return false
294 }
295 for i := range a.generations {
296 ag, bg := a.generations[i], b.generations[i]
297 if !ag.equal(bg) {
298 return false
299 }
300 }
301 return true
302}
303
304func (ki *keyIndex) String() string {
305 var s string
306 for _, g := range ki.generations {
307 s += g.String()
308 }
309 return s
310}
311
312// generation contains multiple revisions of a key.
313type generation struct {
314 ver int64
315 created revision // when the generation is created (put in first revision).
316 revs []revision
317}
318
319func (g *generation) isEmpty() bool { return g == nil || len(g.revs) == 0 }
320
321// walk walks through the revisions in the generation in descending order.
322// It passes the revision to the given function.
323// walk returns until: 1. it finishes walking all pairs 2. the function returns false.
324// walk returns the position at where it stopped. If it stopped after
325// finishing walking, -1 will be returned.
326func (g *generation) walk(f func(rev revision) bool) int {
327 l := len(g.revs)
328 for i := range g.revs {
329 ok := f(g.revs[l-i-1])
330 if !ok {
331 return l - i - 1
332 }
333 }
334 return -1
335}
336
337func (g *generation) String() string {
338 return fmt.Sprintf("g: created[%d] ver[%d], revs %#v\n", g.created, g.ver, g.revs)
339}
340
341func (a generation) equal(b generation) bool {
342 if a.ver != b.ver {
343 return false
344 }
345 if len(a.revs) != len(b.revs) {
346 return false
347 }
348
349 for i := range a.revs {
350 ar, br := a.revs[i], b.revs[i]
351 if ar != br {
352 return false
353 }
354 }
355 return true
356}