// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mvcc

import (
	"context"
	"encoding/binary"
	"errors"
	"hash/crc32"
	"math"
	"sync"
	"sync/atomic"
	"time"

	"github.com/coreos/etcd/lease"
	"github.com/coreos/etcd/mvcc/backend"
	"github.com/coreos/etcd/mvcc/mvccpb"
	"github.com/coreos/etcd/pkg/schedule"
	"github.com/coreos/pkg/capnslog"
)

var (
	keyBucketName  = []byte("key")
	metaBucketName = []byte("meta")

	consistentIndexKeyName  = []byte("consistent_index")
	scheduledCompactKeyName = []byte("scheduledCompactRev")
	finishedCompactKeyName  = []byte("finishedCompactRev")

	ErrCompacted = errors.New("mvcc: required revision has been compacted")
	ErrFutureRev = errors.New("mvcc: required revision is a future revision")
	ErrCanceled  = errors.New("mvcc: watcher is canceled")
	ErrClosed    = errors.New("mvcc: closed")

	plog = capnslog.NewPackageLogger("github.com/coreos/etcd", "mvcc")
)

const (
	// markedRevBytesLen is the byte length of a marked revision.
	// The first `revBytesLen` bytes represent a normal revision. The last
	// byte is the mark.
	markedRevBytesLen       = revBytesLen + 1
	markBytePosition        = markedRevBytesLen - 1
	markTombstone      byte = 't'
)

var restoreChunkKeys = 10000 // non-const for testing

// ConsistentIndexGetter is an interface that wraps the ConsistentIndex method.
// The consistent index is the offset of an entry in a consistent replicated log.
type ConsistentIndexGetter interface {
	// ConsistentIndex returns the consistent index of the currently executing entry.
	ConsistentIndex() uint64
}

type store struct {
	ReadView
	WriteView

	// consistentIndex caches the "consistent_index" key's value. Accessed
	// through atomics so must be 64-bit aligned.
	consistentIndex uint64

	// mu is read-locked for txns and write-locked for non-txn store changes.
	mu sync.RWMutex

	ig ConsistentIndexGetter

	b       backend.Backend
	kvindex index

	le lease.Lessor

	// revMu protects currentRev and compactMainRev.
	// Locked at the end of a write txn and released after the write txn unlocks.
	// Locked before locking a read txn and released after locking.
	revMu sync.RWMutex
	// currentRev is the revision of the last completed transaction.
	currentRev int64
	// compactMainRev is the main revision of the last compaction.
	compactMainRev int64

	// bytesBuf8 is a byte slice of length 8
	// to avoid a repetitive allocation in saveIndex.
	bytesBuf8 []byte

	fifoSched schedule.Scheduler

	stopc chan struct{}
}

// NewStore returns a new store. It is useful for creating a store inside the
// mvcc package. Externally it should only be used for testing.
func NewStore(b backend.Backend, le lease.Lessor, ig ConsistentIndexGetter) *store {
	s := &store{
		b:       b,
		ig:      ig,
		kvindex: newTreeIndex(),

		le: le,

		currentRev:     1,
		compactMainRev: -1,

		bytesBuf8: make([]byte, 8),
		fifoSched: schedule.NewFIFOScheduler(),

		stopc: make(chan struct{}),
	}
	s.ReadView = &readView{s}
	s.WriteView = &writeView{s}
	if s.le != nil {
		s.le.SetRangeDeleter(func() lease.TxnDelete { return s.Write() })
	}

	tx := s.b.BatchTx()
	tx.Lock()
	tx.UnsafeCreateBucket(keyBucketName)
	tx.UnsafeCreateBucket(metaBucketName)
	tx.Unlock()
	s.b.ForceCommit()

	if err := s.restore(); err != nil {
		// TODO: return the error instead of panic here?
		panic("failed to recover store from backend")
	}

	return s
}

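// compactBarrier closes ch to signal that a compaction barrier has been
// reached. If the context is nil or already canceled, the barrier is instead
// re-scheduled on the FIFO scheduler (unless the store is stopping), so ch
// can still be closed by a later scheduler pass.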
func (s *store) compactBarrier(ctx context.Context, ch chan struct{}) {
	if ctx == nil || ctx.Err() != nil {
		select {
		case <-s.stopc:
		default:
			// Fix a deadlock in mvcc; for more information, please refer to pr 11817.
			// s.stopc is only updated in the restore operation, which is called by the
			// apply-snapshot path; compaction and apply-snapshot requests are serialized
			// by raft and do not happen at the same time.
			s.mu.Lock()
			f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
			s.fifoSched.Schedule(f)
			s.mu.Unlock()
		}
		return
	}
	close(ch)
}

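// Hash forces a commit of the backend and returns a hash over its entire
// keyspace (excluding keys in DefaultIgnores) along with the store's current
// revision.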
func (s *store) Hash() (hash uint32, revision int64, err error) {
	start := time.Now()

	s.b.ForceCommit()
	h, err := s.b.Hash(DefaultIgnores)

	hashDurations.Observe(time.Since(start).Seconds())
	return h, s.currentRev, err
}

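// HashByRev computes a hash over all key-value pairs with revisions at most
// rev, skipping revisions that compaction will remove. A rev of 0 means the
// current revision. It returns ErrCompacted if rev has already been compacted
// and ErrFutureRev if rev is larger than the current revision.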
func (s *store) HashByRev(rev int64) (hash uint32, currentRev int64, compactRev int64, err error) {
	start := time.Now()

	s.mu.RLock()
	s.revMu.RLock()
	compactRev, currentRev = s.compactMainRev, s.currentRev
	s.revMu.RUnlock()

	if rev > 0 && rev <= compactRev {
		s.mu.RUnlock()
		return 0, 0, compactRev, ErrCompacted
	} else if rev > 0 && rev > currentRev {
		s.mu.RUnlock()
		return 0, currentRev, 0, ErrFutureRev
	}

	if rev == 0 {
		rev = currentRev
	}
	keep := s.kvindex.Keep(rev)

	tx := s.b.ReadTx()
	tx.Lock()
	defer tx.Unlock()
	s.mu.RUnlock()

	upper := revision{main: rev + 1}
	lower := revision{main: compactRev + 1}
	h := crc32.New(crc32.MakeTable(crc32.Castagnoli))

	h.Write(keyBucketName)
	err = tx.UnsafeForEach(keyBucketName, func(k, v []byte) error {
		kr := bytesToRev(k)
		if !upper.GreaterThan(kr) {
			return nil
		}
		// skip revisions that are scheduled for deletion due to compaction;
		// don't skip anything if there is no keep set.
		if lower.GreaterThan(kr) && len(keep) > 0 {
			if _, ok := keep[kr]; !ok {
				return nil
			}
		}
		h.Write(k)
		h.Write(v)
		return nil
	})
	hash = h.Sum32()

	hashRevDurations.Observe(time.Since(start).Seconds())
	return hash, currentRev, compactRev, err
}

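// Compact schedules a compaction at rev. The returned channel is closed once
// the scheduled compaction finishes. It returns ErrCompacted if rev has
// already been compacted and ErrFutureRev if rev is larger than the current
// revision.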
func (s *store) Compact(rev int64) (<-chan struct{}, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.revMu.Lock()
	defer s.revMu.Unlock()

	if rev <= s.compactMainRev {
		ch := make(chan struct{})
		f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
		s.fifoSched.Schedule(f)
		return ch, ErrCompacted
	}
	if rev > s.currentRev {
		return nil, ErrFutureRev
	}

	start := time.Now()

	s.compactMainRev = rev

	rbytes := newRevBytes()
	revToBytes(revision{main: rev}, rbytes)

	tx := s.b.BatchTx()
	tx.Lock()
	tx.UnsafePut(metaBucketName, scheduledCompactKeyName, rbytes)
	tx.Unlock()
	// ensure that desired compaction is persisted
	s.b.ForceCommit()

	keep := s.kvindex.Compact(rev)
	ch := make(chan struct{})
	var j = func(ctx context.Context) {
		if ctx.Err() != nil {
			s.compactBarrier(ctx, ch)
			return
		}
		if !s.scheduleCompaction(rev, keep) {
			s.compactBarrier(nil, ch)
			return
		}
		close(ch)
	}

	s.fifoSched.Schedule(j)

	indexCompactionPauseDurations.Observe(float64(time.Since(start) / time.Millisecond))
	return ch, nil
}

// DefaultIgnores is a map of keys to ignore in hash checking.
var DefaultIgnores map[backend.IgnoreKey]struct{}

func init() {
	DefaultIgnores = map[backend.IgnoreKey]struct{}{
		// consistent index might be changed due to v2 internal sync, which
		// is not controllable by the user.
		{Bucket: string(metaBucketName), Key: string(consistentIndexKeyName)}: {},
	}
}

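// Commit persists the consistent index into the meta bucket and forces the
// backend to commit its pending writes.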
func (s *store) Commit() {
	s.mu.Lock()
	defer s.mu.Unlock()

	tx := s.b.BatchTx()
	tx.Lock()
	s.saveIndex(tx)
	tx.Unlock()
	s.b.ForceCommit()
}

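// Restore stops the running store, swaps in the given backend, resets the
// in-memory state, and rebuilds the key index and lease attachments from the
// new backend.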
func (s *store) Restore(b backend.Backend) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	close(s.stopc)
	s.fifoSched.Stop()

	atomic.StoreUint64(&s.consistentIndex, 0)
	s.b = b
	s.kvindex = newTreeIndex()
	s.currentRev = 1
	s.compactMainRev = -1
	s.fifoSched = schedule.NewFIFOScheduler()
	s.stopc = make(chan struct{})

	return s.restore()
}

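// restore rebuilds the in-memory tree index and the key-to-lease mapping by
// scanning the backend in chunks, and resumes any compaction that was
// scheduled but never finished.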
func (s *store) restore() error {
	s.setupMetricsReporter()

	min, max := newRevBytes(), newRevBytes()
	revToBytes(revision{main: 1}, min)
	revToBytes(revision{main: math.MaxInt64, sub: math.MaxInt64}, max)

	keyToLease := make(map[string]lease.LeaseID)

	// restore index
	tx := s.b.BatchTx()
	tx.Lock()

	_, finishedCompactBytes := tx.UnsafeRange(metaBucketName, finishedCompactKeyName, nil, 0)
	if len(finishedCompactBytes) != 0 {
		s.compactMainRev = bytesToRev(finishedCompactBytes[0]).main
		plog.Printf("restore compact to %d", s.compactMainRev)
	}
	_, scheduledCompactBytes := tx.UnsafeRange(metaBucketName, scheduledCompactKeyName, nil, 0)
	scheduledCompact := int64(0)
	if len(scheduledCompactBytes) != 0 {
		scheduledCompact = bytesToRev(scheduledCompactBytes[0]).main
	}

	// index keys concurrently as they're loaded in from tx
	keysGauge.Set(0)
	rkvc, revc := restoreIntoIndex(s.kvindex)
	for {
		keys, vals := tx.UnsafeRange(keyBucketName, min, max, int64(restoreChunkKeys))
		if len(keys) == 0 {
			break
		}
		// rkvc blocks if the number of pending keys exceeds the restore
		// chunk size, to keep keys from consuming too much memory.
		restoreChunk(rkvc, keys, vals, keyToLease)
		if len(keys) < restoreChunkKeys {
			// a partial set implies the final set
			break
		}
		// next set begins after where this one ended
		newMin := bytesToRev(keys[len(keys)-1][:revBytesLen])
		newMin.sub++
		revToBytes(newMin, min)
	}
	close(rkvc)
	s.currentRev = <-revc

	// keys in the range [compacted revision - N, compaction revision] might all
	// have been deleted due to compaction. In that case the current revision
	// should be set to the compaction revision, not to the largest revision
	// we have seen.
	if s.currentRev < s.compactMainRev {
		s.currentRev = s.compactMainRev
	}
	if scheduledCompact <= s.compactMainRev {
		scheduledCompact = 0
	}

	for key, lid := range keyToLease {
		if s.le == nil {
			panic("no lessor to attach lease")
		}
		err := s.le.Attach(lid, []lease.LeaseItem{{Key: key}})
		if err != nil {
			plog.Errorf("unexpected Attach error: %v", err)
		}
	}

	tx.Unlock()

	if scheduledCompact != 0 {
		s.Compact(scheduledCompact)
		plog.Printf("resume scheduled compaction at %d", scheduledCompact)
	}

	return nil
}

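// revKeyValue carries one backend entry during restore: the raw revision key,
// the decoded KeyValue, and the user key as a string for index lookups.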
type revKeyValue struct {
	key  []byte
	kv   mvccpb.KeyValue
	kstr string
}

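// restoreIntoIndex starts a goroutine that consumes revKeyValue entries from
// the returned channel and rebuilds the tree index from them. The second
// channel yields the largest main revision observed once the first channel
// is closed.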
func restoreIntoIndex(idx index) (chan<- revKeyValue, <-chan int64) {
	rkvc, revc := make(chan revKeyValue, restoreChunkKeys), make(chan int64, 1)
	go func() {
		currentRev := int64(1)
		defer func() { revc <- currentRev }()
		// restore the tree index from the stream of unordered key-value pairs.
		kiCache := make(map[string]*keyIndex, restoreChunkKeys)
		for rkv := range rkvc {
			ki, ok := kiCache[rkv.kstr]
			// purge kiCache if it holds many keys but this key is still missing
			if !ok && len(kiCache) >= restoreChunkKeys {
				i := 10
				for k := range kiCache {
					delete(kiCache, k)
					if i--; i == 0 {
						break
					}
				}
			}
			// cache miss; fetch from the tree index if the key is already there
			if !ok {
				ki = &keyIndex{key: rkv.kv.Key}
				if idxKey := idx.KeyIndex(ki); idxKey != nil {
					kiCache[rkv.kstr], ki = idxKey, idxKey
					ok = true
				}
			}
			rev := bytesToRev(rkv.key)
			currentRev = rev.main
			if ok {
				if isTombstone(rkv.key) {
					ki.tombstone(rev.main, rev.sub)
					continue
				}
				ki.put(rev.main, rev.sub)
			} else if !isTombstone(rkv.key) {
				ki.restore(revision{rkv.kv.CreateRevision, 0}, rev, rkv.kv.Version)
				idx.Insert(ki)
				kiCache[rkv.kstr] = ki
			}
		}
	}()
	return rkvc, revc
}

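// restoreChunk unmarshals one chunk of backend keys and values, updates the
// key-to-lease mapping, and sends the decoded entries to the index restore
// channel.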
func restoreChunk(kvc chan<- revKeyValue, keys, vals [][]byte, keyToLease map[string]lease.LeaseID) {
	for i, key := range keys {
		rkv := revKeyValue{key: key}
		if err := rkv.kv.Unmarshal(vals[i]); err != nil {
			plog.Fatalf("cannot unmarshal event: %v", err)
		}
		rkv.kstr = string(rkv.kv.Key)
		if isTombstone(key) {
			delete(keyToLease, rkv.kstr)
		} else if lid := lease.LeaseID(rkv.kv.Lease); lid != lease.NoLease {
			keyToLease[rkv.kstr] = lid
		} else {
			delete(keyToLease, rkv.kstr)
		}
		kvc <- rkv
	}
}

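// Close signals the store to stop and shuts down the FIFO scheduler.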
func (s *store) Close() error {
	close(s.stopc)
	s.fifoSched.Stop()
	return nil
}

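// saveIndex persists the consistent index reported by the
// ConsistentIndexGetter into the meta bucket and refreshes the cached value.
// It is a no-op when no getter is configured.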
func (s *store) saveIndex(tx backend.BatchTx) {
	if s.ig == nil {
		return
	}
	bs := s.bytesBuf8
	ci := s.ig.ConsistentIndex()
	binary.BigEndian.PutUint64(bs, ci)
	// put the index into the underlying backend
	// tx has been locked in TxnBegin, so there is no need to lock it again
	tx.UnsafePut(metaBucketName, consistentIndexKeyName, bs)
	atomic.StoreUint64(&s.consistentIndex, ci)
}

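// ConsistentIndex returns the cached consistent index if it is set; otherwise
// it reads the value from the backend, caches it, and returns it.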
func (s *store) ConsistentIndex() uint64 {
	if ci := atomic.LoadUint64(&s.consistentIndex); ci > 0 {
		return ci
	}
	tx := s.b.BatchTx()
	tx.Lock()
	defer tx.Unlock()
	_, vs := tx.UnsafeRange(metaBucketName, consistentIndexKeyName, nil, 0)
	if len(vs) == 0 {
		return 0
	}
	v := binary.BigEndian.Uint64(vs[0])
	atomic.StoreUint64(&s.consistentIndex, v)
	return v
}

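// setupMetricsReporter wires the backend size and the store's revision values
// into the metrics reporting closures.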
func (s *store) setupMetricsReporter() {
	b := s.b
	reportDbTotalSizeInBytesMu.Lock()
	reportDbTotalSizeInBytes = func() float64 { return float64(b.Size()) }
	reportDbTotalSizeInBytesMu.Unlock()
	reportDbTotalSizeInUseInBytesMu.Lock()
	reportDbTotalSizeInUseInBytes = func() float64 { return float64(b.SizeInUse()) }
	reportDbTotalSizeInUseInBytesMu.Unlock()
	reportCurrentRevMu.Lock()
	reportCurrentRev = func() float64 {
		s.revMu.RLock()
		defer s.revMu.RUnlock()
		return float64(s.currentRev)
	}
	reportCurrentRevMu.Unlock()
	reportCompactRevMu.Lock()
	reportCompactRev = func() float64 {
		s.revMu.RLock()
		defer s.revMu.RUnlock()
		return float64(s.compactMainRev)
	}
	reportCompactRevMu.Unlock()
}

// appendMarkTombstone appends the tombstone mark to normal revision bytes.
func appendMarkTombstone(b []byte) []byte {
	if len(b) != revBytesLen {
		plog.Panicf("cannot append mark to non normal revision bytes")
	}
	return append(b, markTombstone)
}

// isTombstone checks whether the revision bytes represent a tombstone.
func isTombstone(b []byte) bool {
	return len(b) == markedRevBytesLen && b[markBytePosition] == markTombstone
}