khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 1 | // Copyright 2015 The etcd Authors |
| 2 | // |
| 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | // you may not use this file except in compliance with the License. |
| 5 | // You may obtain a copy of the License at |
| 6 | // |
| 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | // |
| 9 | // Unless required by applicable law or agreed to in writing, software |
| 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | // See the License for the specific language governing permissions and |
| 13 | // limitations under the License. |
| 14 | |
| 15 | package mvcc |
| 16 | |
| 17 | import ( |
| 18 | "context" |
| 19 | "encoding/binary" |
| 20 | "errors" |
| 21 | "hash/crc32" |
| 22 | "math" |
| 23 | "sync" |
| 24 | "sync/atomic" |
| 25 | "time" |
| 26 | |
| 27 | "github.com/coreos/etcd/lease" |
| 28 | "github.com/coreos/etcd/mvcc/backend" |
| 29 | "github.com/coreos/etcd/mvcc/mvccpb" |
| 30 | "github.com/coreos/etcd/pkg/schedule" |
| 31 | "github.com/coreos/pkg/capnslog" |
| 32 | ) |
| 33 | |
| 34 | var ( |
| 35 | keyBucketName = []byte("key") |
| 36 | metaBucketName = []byte("meta") |
| 37 | |
| 38 | consistentIndexKeyName = []byte("consistent_index") |
| 39 | scheduledCompactKeyName = []byte("scheduledCompactRev") |
| 40 | finishedCompactKeyName = []byte("finishedCompactRev") |
| 41 | |
| 42 | ErrCompacted = errors.New("mvcc: required revision has been compacted") |
| 43 | ErrFutureRev = errors.New("mvcc: required revision is a future revision") |
| 44 | ErrCanceled = errors.New("mvcc: watcher is canceled") |
| 45 | ErrClosed = errors.New("mvcc: closed") |
| 46 | |
| 47 | plog = capnslog.NewPackageLogger("github.com/coreos/etcd", "mvcc") |
| 48 | ) |
| 49 | |
| 50 | const ( |
| 51 | // markedRevBytesLen is the byte length of marked revision. |
| 52 | // The first `revBytesLen` bytes represents a normal revision. The last |
| 53 | // one byte is the mark. |
| 54 | markedRevBytesLen = revBytesLen + 1 |
| 55 | markBytePosition = markedRevBytesLen - 1 |
| 56 | markTombstone byte = 't' |
| 57 | ) |
| 58 | |
| 59 | var restoreChunkKeys = 10000 // non-const for testing |
| 60 | |
| 61 | // ConsistentIndexGetter is an interface that wraps the Get method. |
| 62 | // Consistent index is the offset of an entry in a consistent replicated log. |
| 63 | type ConsistentIndexGetter interface { |
| 64 | // ConsistentIndex returns the consistent index of current executing entry. |
| 65 | ConsistentIndex() uint64 |
| 66 | } |
| 67 | |
| 68 | type store struct { |
| 69 | ReadView |
| 70 | WriteView |
| 71 | |
| 72 | // consistentIndex caches the "consistent_index" key's value. Accessed |
| 73 | // through atomics so must be 64-bit aligned. |
| 74 | consistentIndex uint64 |
| 75 | |
| 76 | // mu read locks for txns and write locks for non-txn store changes. |
| 77 | mu sync.RWMutex |
| 78 | |
| 79 | ig ConsistentIndexGetter |
| 80 | |
| 81 | b backend.Backend |
| 82 | kvindex index |
| 83 | |
| 84 | le lease.Lessor |
| 85 | |
| 86 | // revMuLock protects currentRev and compactMainRev. |
| 87 | // Locked at end of write txn and released after write txn unlock lock. |
| 88 | // Locked before locking read txn and released after locking. |
| 89 | revMu sync.RWMutex |
| 90 | // currentRev is the revision of the last completed transaction. |
| 91 | currentRev int64 |
| 92 | // compactMainRev is the main revision of the last compaction. |
| 93 | compactMainRev int64 |
| 94 | |
| 95 | // bytesBuf8 is a byte slice of length 8 |
| 96 | // to avoid a repetitive allocation in saveIndex. |
| 97 | bytesBuf8 []byte |
| 98 | |
| 99 | fifoSched schedule.Scheduler |
| 100 | |
| 101 | stopc chan struct{} |
| 102 | } |
| 103 | |
| 104 | // NewStore returns a new store. It is useful to create a store inside |
| 105 | // mvcc pkg. It should only be used for testing externally. |
| 106 | func NewStore(b backend.Backend, le lease.Lessor, ig ConsistentIndexGetter) *store { |
| 107 | s := &store{ |
| 108 | b: b, |
| 109 | ig: ig, |
| 110 | kvindex: newTreeIndex(), |
| 111 | |
| 112 | le: le, |
| 113 | |
| 114 | currentRev: 1, |
| 115 | compactMainRev: -1, |
| 116 | |
| 117 | bytesBuf8: make([]byte, 8), |
| 118 | fifoSched: schedule.NewFIFOScheduler(), |
| 119 | |
| 120 | stopc: make(chan struct{}), |
| 121 | } |
| 122 | s.ReadView = &readView{s} |
| 123 | s.WriteView = &writeView{s} |
| 124 | if s.le != nil { |
| 125 | s.le.SetRangeDeleter(func() lease.TxnDelete { return s.Write() }) |
| 126 | } |
| 127 | |
| 128 | tx := s.b.BatchTx() |
| 129 | tx.Lock() |
| 130 | tx.UnsafeCreateBucket(keyBucketName) |
| 131 | tx.UnsafeCreateBucket(metaBucketName) |
| 132 | tx.Unlock() |
| 133 | s.b.ForceCommit() |
| 134 | |
| 135 | if err := s.restore(); err != nil { |
| 136 | // TODO: return the error instead of panic here? |
| 137 | panic("failed to recover store from backend") |
| 138 | } |
| 139 | |
| 140 | return s |
| 141 | } |
| 142 | |
| 143 | func (s *store) compactBarrier(ctx context.Context, ch chan struct{}) { |
| 144 | if ctx == nil || ctx.Err() != nil { |
| 145 | select { |
| 146 | case <-s.stopc: |
| 147 | default: |
| 148 | // fix deadlock in mvcc,for more information, please refer to pr 11817. |
| 149 | // s.stopc is only updated in restore operation, which is called by apply |
| 150 | // snapshot call, compaction and apply snapshot requests are serialized by |
| 151 | // raft, and do not happen at the same time. |
| 152 | s.mu.Lock() |
| 153 | f := func(ctx context.Context) { s.compactBarrier(ctx, ch) } |
| 154 | s.fifoSched.Schedule(f) |
| 155 | s.mu.Unlock() |
| 156 | } |
| 157 | return |
| 158 | } |
| 159 | close(ch) |
| 160 | } |
| 161 | |
| 162 | func (s *store) Hash() (hash uint32, revision int64, err error) { |
| 163 | start := time.Now() |
| 164 | |
| 165 | s.b.ForceCommit() |
| 166 | h, err := s.b.Hash(DefaultIgnores) |
| 167 | |
| 168 | hashDurations.Observe(time.Since(start).Seconds()) |
| 169 | return h, s.currentRev, err |
| 170 | } |
| 171 | |
| 172 | func (s *store) HashByRev(rev int64) (hash uint32, currentRev int64, compactRev int64, err error) { |
| 173 | start := time.Now() |
| 174 | |
| 175 | s.mu.RLock() |
| 176 | s.revMu.RLock() |
| 177 | compactRev, currentRev = s.compactMainRev, s.currentRev |
| 178 | s.revMu.RUnlock() |
| 179 | |
| 180 | if rev > 0 && rev <= compactRev { |
| 181 | s.mu.RUnlock() |
| 182 | return 0, 0, compactRev, ErrCompacted |
| 183 | } else if rev > 0 && rev > currentRev { |
| 184 | s.mu.RUnlock() |
| 185 | return 0, currentRev, 0, ErrFutureRev |
| 186 | } |
| 187 | |
| 188 | if rev == 0 { |
| 189 | rev = currentRev |
| 190 | } |
| 191 | keep := s.kvindex.Keep(rev) |
| 192 | |
| 193 | tx := s.b.ReadTx() |
| 194 | tx.Lock() |
| 195 | defer tx.Unlock() |
| 196 | s.mu.RUnlock() |
| 197 | |
| 198 | upper := revision{main: rev + 1} |
| 199 | lower := revision{main: compactRev + 1} |
| 200 | h := crc32.New(crc32.MakeTable(crc32.Castagnoli)) |
| 201 | |
| 202 | h.Write(keyBucketName) |
| 203 | err = tx.UnsafeForEach(keyBucketName, func(k, v []byte) error { |
| 204 | kr := bytesToRev(k) |
| 205 | if !upper.GreaterThan(kr) { |
| 206 | return nil |
| 207 | } |
| 208 | // skip revisions that are scheduled for deletion |
| 209 | // due to compacting; don't skip if there isn't one. |
| 210 | if lower.GreaterThan(kr) && len(keep) > 0 { |
| 211 | if _, ok := keep[kr]; !ok { |
| 212 | return nil |
| 213 | } |
| 214 | } |
| 215 | h.Write(k) |
| 216 | h.Write(v) |
| 217 | return nil |
| 218 | }) |
| 219 | hash = h.Sum32() |
| 220 | |
| 221 | hashRevDurations.Observe(time.Since(start).Seconds()) |
| 222 | return hash, currentRev, compactRev, err |
| 223 | } |
| 224 | |
| 225 | func (s *store) Compact(rev int64) (<-chan struct{}, error) { |
| 226 | s.mu.Lock() |
| 227 | defer s.mu.Unlock() |
| 228 | s.revMu.Lock() |
| 229 | defer s.revMu.Unlock() |
| 230 | |
| 231 | if rev <= s.compactMainRev { |
| 232 | ch := make(chan struct{}) |
| 233 | f := func(ctx context.Context) { s.compactBarrier(ctx, ch) } |
| 234 | s.fifoSched.Schedule(f) |
| 235 | return ch, ErrCompacted |
| 236 | } |
| 237 | if rev > s.currentRev { |
| 238 | return nil, ErrFutureRev |
| 239 | } |
| 240 | |
| 241 | start := time.Now() |
| 242 | |
| 243 | s.compactMainRev = rev |
| 244 | |
| 245 | rbytes := newRevBytes() |
| 246 | revToBytes(revision{main: rev}, rbytes) |
| 247 | |
| 248 | tx := s.b.BatchTx() |
| 249 | tx.Lock() |
| 250 | tx.UnsafePut(metaBucketName, scheduledCompactKeyName, rbytes) |
| 251 | tx.Unlock() |
| 252 | // ensure that desired compaction is persisted |
| 253 | s.b.ForceCommit() |
| 254 | |
| 255 | keep := s.kvindex.Compact(rev) |
| 256 | ch := make(chan struct{}) |
| 257 | var j = func(ctx context.Context) { |
| 258 | if ctx.Err() != nil { |
| 259 | s.compactBarrier(ctx, ch) |
| 260 | return |
| 261 | } |
| 262 | if !s.scheduleCompaction(rev, keep) { |
| 263 | s.compactBarrier(nil, ch) |
| 264 | return |
| 265 | } |
| 266 | close(ch) |
| 267 | } |
| 268 | |
| 269 | s.fifoSched.Schedule(j) |
| 270 | |
| 271 | indexCompactionPauseDurations.Observe(float64(time.Since(start) / time.Millisecond)) |
| 272 | return ch, nil |
| 273 | } |
| 274 | |
| 275 | // DefaultIgnores is a map of keys to ignore in hash checking. |
| 276 | var DefaultIgnores map[backend.IgnoreKey]struct{} |
| 277 | |
| 278 | func init() { |
| 279 | DefaultIgnores = map[backend.IgnoreKey]struct{}{ |
| 280 | // consistent index might be changed due to v2 internal sync, which |
| 281 | // is not controllable by the user. |
| 282 | {Bucket: string(metaBucketName), Key: string(consistentIndexKeyName)}: {}, |
| 283 | } |
| 284 | } |
| 285 | |
| 286 | func (s *store) Commit() { |
| 287 | s.mu.Lock() |
| 288 | defer s.mu.Unlock() |
| 289 | |
| 290 | tx := s.b.BatchTx() |
| 291 | tx.Lock() |
| 292 | s.saveIndex(tx) |
| 293 | tx.Unlock() |
| 294 | s.b.ForceCommit() |
| 295 | } |
| 296 | |
| 297 | func (s *store) Restore(b backend.Backend) error { |
| 298 | s.mu.Lock() |
| 299 | defer s.mu.Unlock() |
| 300 | |
| 301 | close(s.stopc) |
| 302 | s.fifoSched.Stop() |
| 303 | |
| 304 | atomic.StoreUint64(&s.consistentIndex, 0) |
| 305 | s.b = b |
| 306 | s.kvindex = newTreeIndex() |
| 307 | s.currentRev = 1 |
| 308 | s.compactMainRev = -1 |
| 309 | s.fifoSched = schedule.NewFIFOScheduler() |
| 310 | s.stopc = make(chan struct{}) |
| 311 | |
| 312 | return s.restore() |
| 313 | } |
| 314 | |
| 315 | func (s *store) restore() error { |
| 316 | s.setupMetricsReporter() |
| 317 | |
| 318 | min, max := newRevBytes(), newRevBytes() |
| 319 | revToBytes(revision{main: 1}, min) |
| 320 | revToBytes(revision{main: math.MaxInt64, sub: math.MaxInt64}, max) |
| 321 | |
| 322 | keyToLease := make(map[string]lease.LeaseID) |
| 323 | |
| 324 | // restore index |
| 325 | tx := s.b.BatchTx() |
| 326 | tx.Lock() |
| 327 | |
| 328 | _, finishedCompactBytes := tx.UnsafeRange(metaBucketName, finishedCompactKeyName, nil, 0) |
| 329 | if len(finishedCompactBytes) != 0 { |
| 330 | s.compactMainRev = bytesToRev(finishedCompactBytes[0]).main |
| 331 | plog.Printf("restore compact to %d", s.compactMainRev) |
| 332 | } |
| 333 | _, scheduledCompactBytes := tx.UnsafeRange(metaBucketName, scheduledCompactKeyName, nil, 0) |
| 334 | scheduledCompact := int64(0) |
| 335 | if len(scheduledCompactBytes) != 0 { |
| 336 | scheduledCompact = bytesToRev(scheduledCompactBytes[0]).main |
| 337 | } |
| 338 | |
| 339 | // index keys concurrently as they're loaded in from tx |
| 340 | keysGauge.Set(0) |
| 341 | rkvc, revc := restoreIntoIndex(s.kvindex) |
| 342 | for { |
| 343 | keys, vals := tx.UnsafeRange(keyBucketName, min, max, int64(restoreChunkKeys)) |
| 344 | if len(keys) == 0 { |
| 345 | break |
| 346 | } |
| 347 | // rkvc blocks if the total pending keys exceeds the restore |
| 348 | // chunk size to keep keys from consuming too much memory. |
| 349 | restoreChunk(rkvc, keys, vals, keyToLease) |
| 350 | if len(keys) < restoreChunkKeys { |
| 351 | // partial set implies final set |
| 352 | break |
| 353 | } |
| 354 | // next set begins after where this one ended |
| 355 | newMin := bytesToRev(keys[len(keys)-1][:revBytesLen]) |
| 356 | newMin.sub++ |
| 357 | revToBytes(newMin, min) |
| 358 | } |
| 359 | close(rkvc) |
| 360 | s.currentRev = <-revc |
| 361 | |
| 362 | // keys in the range [compacted revision -N, compaction] might all be deleted due to compaction. |
| 363 | // the correct revision should be set to compaction revision in the case, not the largest revision |
| 364 | // we have seen. |
| 365 | if s.currentRev < s.compactMainRev { |
| 366 | s.currentRev = s.compactMainRev |
| 367 | } |
| 368 | if scheduledCompact <= s.compactMainRev { |
| 369 | scheduledCompact = 0 |
| 370 | } |
| 371 | |
| 372 | for key, lid := range keyToLease { |
| 373 | if s.le == nil { |
| 374 | panic("no lessor to attach lease") |
| 375 | } |
| 376 | err := s.le.Attach(lid, []lease.LeaseItem{{Key: key}}) |
| 377 | if err != nil { |
| 378 | plog.Errorf("unexpected Attach error: %v", err) |
| 379 | } |
| 380 | } |
| 381 | |
| 382 | tx.Unlock() |
| 383 | |
| 384 | if scheduledCompact != 0 { |
| 385 | s.Compact(scheduledCompact) |
| 386 | plog.Printf("resume scheduled compaction at %d", scheduledCompact) |
| 387 | } |
| 388 | |
| 389 | return nil |
| 390 | } |
| 391 | |
| 392 | type revKeyValue struct { |
| 393 | key []byte |
| 394 | kv mvccpb.KeyValue |
| 395 | kstr string |
| 396 | } |
| 397 | |
| 398 | func restoreIntoIndex(idx index) (chan<- revKeyValue, <-chan int64) { |
| 399 | rkvc, revc := make(chan revKeyValue, restoreChunkKeys), make(chan int64, 1) |
| 400 | go func() { |
| 401 | currentRev := int64(1) |
| 402 | defer func() { revc <- currentRev }() |
| 403 | // restore the tree index from streaming the unordered index. |
| 404 | kiCache := make(map[string]*keyIndex, restoreChunkKeys) |
| 405 | for rkv := range rkvc { |
| 406 | ki, ok := kiCache[rkv.kstr] |
| 407 | // purge kiCache if many keys but still missing in the cache |
| 408 | if !ok && len(kiCache) >= restoreChunkKeys { |
| 409 | i := 10 |
| 410 | for k := range kiCache { |
| 411 | delete(kiCache, k) |
| 412 | if i--; i == 0 { |
| 413 | break |
| 414 | } |
| 415 | } |
| 416 | } |
| 417 | // cache miss, fetch from tree index if there |
| 418 | if !ok { |
| 419 | ki = &keyIndex{key: rkv.kv.Key} |
| 420 | if idxKey := idx.KeyIndex(ki); idxKey != nil { |
| 421 | kiCache[rkv.kstr], ki = idxKey, idxKey |
| 422 | ok = true |
| 423 | } |
| 424 | } |
| 425 | rev := bytesToRev(rkv.key) |
| 426 | currentRev = rev.main |
| 427 | if ok { |
| 428 | if isTombstone(rkv.key) { |
| 429 | ki.tombstone(rev.main, rev.sub) |
| 430 | continue |
| 431 | } |
| 432 | ki.put(rev.main, rev.sub) |
| 433 | } else if !isTombstone(rkv.key) { |
| 434 | ki.restore(revision{rkv.kv.CreateRevision, 0}, rev, rkv.kv.Version) |
| 435 | idx.Insert(ki) |
| 436 | kiCache[rkv.kstr] = ki |
| 437 | } |
| 438 | } |
| 439 | }() |
| 440 | return rkvc, revc |
| 441 | } |
| 442 | |
| 443 | func restoreChunk(kvc chan<- revKeyValue, keys, vals [][]byte, keyToLease map[string]lease.LeaseID) { |
| 444 | for i, key := range keys { |
| 445 | rkv := revKeyValue{key: key} |
| 446 | if err := rkv.kv.Unmarshal(vals[i]); err != nil { |
| 447 | plog.Fatalf("cannot unmarshal event: %v", err) |
| 448 | } |
| 449 | rkv.kstr = string(rkv.kv.Key) |
| 450 | if isTombstone(key) { |
| 451 | delete(keyToLease, rkv.kstr) |
| 452 | } else if lid := lease.LeaseID(rkv.kv.Lease); lid != lease.NoLease { |
| 453 | keyToLease[rkv.kstr] = lid |
| 454 | } else { |
| 455 | delete(keyToLease, rkv.kstr) |
| 456 | } |
| 457 | kvc <- rkv |
| 458 | } |
| 459 | } |
| 460 | |
| 461 | func (s *store) Close() error { |
| 462 | close(s.stopc) |
| 463 | s.fifoSched.Stop() |
| 464 | return nil |
| 465 | } |
| 466 | |
| 467 | func (s *store) saveIndex(tx backend.BatchTx) { |
| 468 | if s.ig == nil { |
| 469 | return |
| 470 | } |
| 471 | bs := s.bytesBuf8 |
| 472 | ci := s.ig.ConsistentIndex() |
| 473 | binary.BigEndian.PutUint64(bs, ci) |
| 474 | // put the index into the underlying backend |
| 475 | // tx has been locked in TxnBegin, so there is no need to lock it again |
| 476 | tx.UnsafePut(metaBucketName, consistentIndexKeyName, bs) |
| 477 | atomic.StoreUint64(&s.consistentIndex, ci) |
| 478 | } |
| 479 | |
| 480 | func (s *store) ConsistentIndex() uint64 { |
| 481 | if ci := atomic.LoadUint64(&s.consistentIndex); ci > 0 { |
| 482 | return ci |
| 483 | } |
| 484 | tx := s.b.BatchTx() |
| 485 | tx.Lock() |
| 486 | defer tx.Unlock() |
| 487 | _, vs := tx.UnsafeRange(metaBucketName, consistentIndexKeyName, nil, 0) |
| 488 | if len(vs) == 0 { |
| 489 | return 0 |
| 490 | } |
| 491 | v := binary.BigEndian.Uint64(vs[0]) |
| 492 | atomic.StoreUint64(&s.consistentIndex, v) |
| 493 | return v |
| 494 | } |
| 495 | |
| 496 | func (s *store) setupMetricsReporter() { |
| 497 | b := s.b |
| 498 | reportDbTotalSizeInBytesMu.Lock() |
| 499 | reportDbTotalSizeInBytes = func() float64 { return float64(b.Size()) } |
| 500 | reportDbTotalSizeInBytesMu.Unlock() |
| 501 | reportDbTotalSizeInUseInBytesMu.Lock() |
| 502 | reportDbTotalSizeInUseInBytes = func() float64 { return float64(b.SizeInUse()) } |
| 503 | reportDbTotalSizeInUseInBytesMu.Unlock() |
| 504 | reportCurrentRevMu.Lock() |
| 505 | reportCurrentRev = func() float64 { |
| 506 | s.revMu.RLock() |
| 507 | defer s.revMu.RUnlock() |
| 508 | return float64(s.currentRev) |
| 509 | } |
| 510 | reportCurrentRevMu.Unlock() |
| 511 | reportCompactRevMu.Lock() |
| 512 | reportCompactRev = func() float64 { |
| 513 | s.revMu.RLock() |
| 514 | defer s.revMu.RUnlock() |
| 515 | return float64(s.compactMainRev) |
| 516 | } |
| 517 | reportCompactRevMu.Unlock() |
| 518 | } |
| 519 | |
| 520 | // appendMarkTombstone appends tombstone mark to normal revision bytes. |
| 521 | func appendMarkTombstone(b []byte) []byte { |
| 522 | if len(b) != revBytesLen { |
| 523 | plog.Panicf("cannot append mark to non normal revision bytes") |
| 524 | } |
| 525 | return append(b, markTombstone) |
| 526 | } |
| 527 | |
| 528 | // isTombstone checks whether the revision bytes is a tombstone. |
| 529 | func isTombstone(b []byte) bool { |
| 530 | return len(b) == markedRevBytesLen && b[markBytePosition] == markTombstone |
| 531 | } |