khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 1 | // Copyright 2015 The etcd Authors |
| 2 | // |
| 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | // you may not use this file except in compliance with the License. |
| 5 | // You may obtain a copy of the License at |
| 6 | // |
| 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | // |
| 9 | // Unless required by applicable law or agreed to in writing, software |
| 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | // See the License for the specific language governing permissions and |
| 13 | // limitations under the License. |
| 14 | |
| 15 | package backend |
| 16 | |
| 17 | import ( |
| 18 | "fmt" |
| 19 | "hash/crc32" |
| 20 | "io" |
| 21 | "io/ioutil" |
| 22 | "os" |
| 23 | "path/filepath" |
| 24 | "sync" |
| 25 | "sync/atomic" |
| 26 | "time" |
| 27 | |
| 28 | bolt "github.com/coreos/bbolt" |
| 29 | "github.com/coreos/pkg/capnslog" |
| 30 | ) |
| 31 | |
| 32 | var ( |
| 33 | defaultBatchLimit = 10000 |
| 34 | defaultBatchInterval = 100 * time.Millisecond |
| 35 | |
| 36 | defragLimit = 10000 |
| 37 | |
| 38 | // initialMmapSize is the initial size of the mmapped region. Setting this larger than |
| 39 | // the potential max db size can prevent writer from blocking reader. |
| 40 | // This only works for linux. |
| 41 | initialMmapSize = uint64(10 * 1024 * 1024 * 1024) |
| 42 | |
| 43 | plog = capnslog.NewPackageLogger("github.com/coreos/etcd", "mvcc/backend") |
| 44 | |
| 45 | // minSnapshotWarningTimeout is the minimum threshold to trigger a long running snapshot warning. |
| 46 | minSnapshotWarningTimeout = time.Duration(30 * time.Second) |
| 47 | ) |
| 48 | |
| 49 | type Backend interface { |
| 50 | ReadTx() ReadTx |
| 51 | BatchTx() BatchTx |
| 52 | |
| 53 | Snapshot() Snapshot |
| 54 | Hash(ignores map[IgnoreKey]struct{}) (uint32, error) |
| 55 | // Size returns the current size of the backend. |
| 56 | Size() int64 |
| 57 | // SizeInUse returns the current size of the backend logically in use. |
| 58 | // Since the backend can manage free space in a non-byte unit such as |
| 59 | // number of pages, the returned value can be not exactly accurate in bytes. |
| 60 | SizeInUse() int64 |
| 61 | Defrag() error |
| 62 | ForceCommit() |
| 63 | Close() error |
| 64 | } |
| 65 | |
// Snapshot gives access to a snapshot of the backend data.
type Snapshot interface {
	// WriteTo writes the snapshot into the given writer and reports the
	// number of bytes written.
	WriteTo(w io.Writer) (n int64, err error)
	// Size gets the size of the snapshot.
	Size() int64
	// Close closes the snapshot.
	Close() error
}
| 74 | |
// backend is the bolt-backed implementation of Backend.
type backend struct {
	// size and commits are used with atomic operations so they must be
	// 64-bit aligned, otherwise 32-bit tests will crash

	// size is the number of bytes in the backend
	size int64

	// sizeInUse is the number of bytes actually used in the backend
	sizeInUse int64

	// commits counts number of commits since start
	commits int64

	// mu guards db: it is read-locked while transactions are begun and
	// write-locked while defrag swaps the database.
	mu sync.RWMutex
	db *bolt.DB

	// batchInterval is the period at which run commits the batch tx.
	batchInterval time.Duration
	// batchLimit is copied from BackendConfig.BatchLimit.
	batchLimit int
	batchTx    *batchTxBuffered

	readTx *readTx

	// stopc is closed by Close to ask run to stop.
	stopc chan struct{}
	// donec is closed by run when it returns.
	donec chan struct{}
}
| 100 | |
// BackendConfig holds the settings used to create a backend.
type BackendConfig struct {
	// Path is the file path to the backend file.
	Path string
	// BatchInterval is the maximum time before flushing the BatchTx.
	BatchInterval time.Duration
	// BatchLimit is the maximum puts before flushing the BatchTx.
	BatchLimit int
	// MmapSize is the number of bytes to mmap for the backend.
	MmapSize uint64
}
| 111 | |
| 112 | func DefaultBackendConfig() BackendConfig { |
| 113 | return BackendConfig{ |
| 114 | BatchInterval: defaultBatchInterval, |
| 115 | BatchLimit: defaultBatchLimit, |
| 116 | MmapSize: initialMmapSize, |
| 117 | } |
| 118 | } |
| 119 | |
| 120 | func New(bcfg BackendConfig) Backend { |
| 121 | return newBackend(bcfg) |
| 122 | } |
| 123 | |
| 124 | func NewDefaultBackend(path string) Backend { |
| 125 | bcfg := DefaultBackendConfig() |
| 126 | bcfg.Path = path |
| 127 | return newBackend(bcfg) |
| 128 | } |
| 129 | |
| 130 | func newBackend(bcfg BackendConfig) *backend { |
| 131 | bopts := &bolt.Options{} |
| 132 | if boltOpenOptions != nil { |
| 133 | *bopts = *boltOpenOptions |
| 134 | } |
| 135 | bopts.InitialMmapSize = bcfg.mmapSize() |
| 136 | |
| 137 | db, err := bolt.Open(bcfg.Path, 0600, bopts) |
| 138 | if err != nil { |
| 139 | plog.Panicf("cannot open database at %s (%v)", bcfg.Path, err) |
| 140 | } |
| 141 | |
| 142 | // In future, may want to make buffering optional for low-concurrency systems |
| 143 | // or dynamically swap between buffered/non-buffered depending on workload. |
| 144 | b := &backend{ |
| 145 | db: db, |
| 146 | |
| 147 | batchInterval: bcfg.BatchInterval, |
| 148 | batchLimit: bcfg.BatchLimit, |
| 149 | |
| 150 | readTx: &readTx{ |
| 151 | buf: txReadBuffer{ |
| 152 | txBuffer: txBuffer{make(map[string]*bucketBuffer)}, |
| 153 | }, |
| 154 | buckets: make(map[string]*bolt.Bucket), |
| 155 | }, |
| 156 | |
| 157 | stopc: make(chan struct{}), |
| 158 | donec: make(chan struct{}), |
| 159 | } |
| 160 | b.batchTx = newBatchTxBuffered(b) |
| 161 | go b.run() |
| 162 | return b |
| 163 | } |
| 164 | |
| 165 | // BatchTx returns the current batch tx in coalescer. The tx can be used for read and |
| 166 | // write operations. The write result can be retrieved within the same tx immediately. |
| 167 | // The write result is isolated with other txs until the current one get committed. |
| 168 | func (b *backend) BatchTx() BatchTx { |
| 169 | return b.batchTx |
| 170 | } |
| 171 | |
| 172 | func (b *backend) ReadTx() ReadTx { return b.readTx } |
| 173 | |
| 174 | // ForceCommit forces the current batching tx to commit. |
| 175 | func (b *backend) ForceCommit() { |
| 176 | b.batchTx.Commit() |
| 177 | } |
| 178 | |
| 179 | func (b *backend) Snapshot() Snapshot { |
| 180 | b.batchTx.Commit() |
| 181 | |
| 182 | b.mu.RLock() |
| 183 | defer b.mu.RUnlock() |
| 184 | tx, err := b.db.Begin(false) |
| 185 | if err != nil { |
| 186 | plog.Fatalf("cannot begin tx (%s)", err) |
| 187 | } |
| 188 | |
| 189 | stopc, donec := make(chan struct{}), make(chan struct{}) |
| 190 | dbBytes := tx.Size() |
| 191 | go func() { |
| 192 | defer close(donec) |
| 193 | // sendRateBytes is based on transferring snapshot data over a 1 gigabit/s connection |
| 194 | // assuming a min tcp throughput of 100MB/s. |
| 195 | var sendRateBytes int64 = 100 * 1024 * 1014 |
| 196 | warningTimeout := time.Duration(int64((float64(dbBytes) / float64(sendRateBytes)) * float64(time.Second))) |
| 197 | if warningTimeout < minSnapshotWarningTimeout { |
| 198 | warningTimeout = minSnapshotWarningTimeout |
| 199 | } |
| 200 | start := time.Now() |
| 201 | ticker := time.NewTicker(warningTimeout) |
| 202 | defer ticker.Stop() |
| 203 | for { |
| 204 | select { |
| 205 | case <-ticker.C: |
| 206 | plog.Warningf("snapshotting is taking more than %v seconds to finish transferring %v MB [started at %v]", time.Since(start).Seconds(), float64(dbBytes)/float64(1024*1014), start) |
| 207 | case <-stopc: |
| 208 | snapshotDurations.Observe(time.Since(start).Seconds()) |
| 209 | return |
| 210 | } |
| 211 | } |
| 212 | }() |
| 213 | |
| 214 | return &snapshot{tx, stopc, donec} |
| 215 | } |
| 216 | |
// IgnoreKey identifies a key inside a bucket that Hash leaves out of the
// checksum.
type IgnoreKey struct {
	// Bucket is the name of the bucket holding the key.
	Bucket string
	// Key is the key to skip within Bucket.
	Key string
}
| 221 | |
| 222 | func (b *backend) Hash(ignores map[IgnoreKey]struct{}) (uint32, error) { |
| 223 | h := crc32.New(crc32.MakeTable(crc32.Castagnoli)) |
| 224 | |
| 225 | b.mu.RLock() |
| 226 | defer b.mu.RUnlock() |
| 227 | err := b.db.View(func(tx *bolt.Tx) error { |
| 228 | c := tx.Cursor() |
| 229 | for next, _ := c.First(); next != nil; next, _ = c.Next() { |
| 230 | b := tx.Bucket(next) |
| 231 | if b == nil { |
| 232 | return fmt.Errorf("cannot get hash of bucket %s", string(next)) |
| 233 | } |
| 234 | h.Write(next) |
| 235 | b.ForEach(func(k, v []byte) error { |
| 236 | bk := IgnoreKey{Bucket: string(next), Key: string(k)} |
| 237 | if _, ok := ignores[bk]; !ok { |
| 238 | h.Write(k) |
| 239 | h.Write(v) |
| 240 | } |
| 241 | return nil |
| 242 | }) |
| 243 | } |
| 244 | return nil |
| 245 | }) |
| 246 | |
| 247 | if err != nil { |
| 248 | return 0, err |
| 249 | } |
| 250 | |
| 251 | return h.Sum32(), nil |
| 252 | } |
| 253 | |
| 254 | func (b *backend) Size() int64 { |
| 255 | return atomic.LoadInt64(&b.size) |
| 256 | } |
| 257 | |
| 258 | func (b *backend) SizeInUse() int64 { |
| 259 | return atomic.LoadInt64(&b.sizeInUse) |
| 260 | } |
| 261 | |
| 262 | func (b *backend) run() { |
| 263 | defer close(b.donec) |
| 264 | t := time.NewTimer(b.batchInterval) |
| 265 | defer t.Stop() |
| 266 | for { |
| 267 | select { |
| 268 | case <-t.C: |
| 269 | case <-b.stopc: |
| 270 | b.batchTx.CommitAndStop() |
| 271 | return |
| 272 | } |
| 273 | b.batchTx.Commit() |
| 274 | t.Reset(b.batchInterval) |
| 275 | } |
| 276 | } |
| 277 | |
| 278 | func (b *backend) Close() error { |
| 279 | close(b.stopc) |
| 280 | <-b.donec |
| 281 | return b.db.Close() |
| 282 | } |
| 283 | |
| 284 | // Commits returns total number of commits since start |
| 285 | func (b *backend) Commits() int64 { |
| 286 | return atomic.LoadInt64(&b.commits) |
| 287 | } |
| 288 | |
| 289 | func (b *backend) Defrag() error { |
| 290 | return b.defrag() |
| 291 | } |
| 292 | |
| 293 | func (b *backend) defrag() error { |
| 294 | now := time.Now() |
| 295 | |
| 296 | // TODO: make this non-blocking? |
| 297 | // lock batchTx to ensure nobody is using previous tx, and then |
| 298 | // close previous ongoing tx. |
| 299 | b.batchTx.Lock() |
| 300 | defer b.batchTx.Unlock() |
| 301 | |
| 302 | // lock database after lock tx to avoid deadlock. |
| 303 | b.mu.Lock() |
| 304 | defer b.mu.Unlock() |
| 305 | |
| 306 | // block concurrent read requests while resetting tx |
| 307 | b.readTx.mu.Lock() |
| 308 | defer b.readTx.mu.Unlock() |
| 309 | |
| 310 | b.batchTx.unsafeCommit(true) |
| 311 | b.batchTx.tx = nil |
| 312 | |
| 313 | // Create a temporary file to ensure we start with a clean slate. |
| 314 | // Snapshotter.cleanupSnapdir cleans up any of these that are found during startup. |
| 315 | dir := filepath.Dir(b.db.Path()) |
| 316 | temp, err := ioutil.TempFile(dir, "db.tmp.*") |
| 317 | if err != nil { |
| 318 | return err |
| 319 | } |
| 320 | options := bolt.Options{} |
| 321 | if boltOpenOptions != nil { |
| 322 | options = *boltOpenOptions |
| 323 | } |
| 324 | options.OpenFile = func(path string, i int, mode os.FileMode) (file *os.File, err error) { |
| 325 | return temp, nil |
| 326 | } |
| 327 | tdbp := temp.Name() |
| 328 | tmpdb, err := bolt.Open(tdbp, 0600, &options) |
| 329 | if err != nil { |
| 330 | return err |
| 331 | } |
| 332 | |
| 333 | // gofail: var defragBeforeCopy struct{} |
| 334 | err = defragdb(b.db, tmpdb, defragLimit) |
| 335 | |
| 336 | if err != nil { |
| 337 | tmpdb.Close() |
| 338 | if rmErr := os.RemoveAll(tmpdb.Path()); rmErr != nil { |
| 339 | plog.Fatalf("failed to remove db.tmp after defragmentation completed: %v", rmErr) |
| 340 | } |
| 341 | return err |
| 342 | } |
| 343 | |
| 344 | dbp := b.db.Path() |
| 345 | |
| 346 | err = b.db.Close() |
| 347 | if err != nil { |
| 348 | plog.Fatalf("cannot close database (%s)", err) |
| 349 | } |
| 350 | err = tmpdb.Close() |
| 351 | if err != nil { |
| 352 | plog.Fatalf("cannot close database (%s)", err) |
| 353 | } |
| 354 | // gofail: var defragBeforeRename struct{} |
| 355 | err = os.Rename(tdbp, dbp) |
| 356 | if err != nil { |
| 357 | plog.Fatalf("cannot rename database (%s)", err) |
| 358 | } |
| 359 | |
| 360 | b.db, err = bolt.Open(dbp, 0600, boltOpenOptions) |
| 361 | if err != nil { |
| 362 | plog.Panicf("cannot open database at %s (%v)", dbp, err) |
| 363 | } |
| 364 | b.batchTx.tx, err = b.db.Begin(true) |
| 365 | if err != nil { |
| 366 | plog.Fatalf("cannot begin tx (%s)", err) |
| 367 | } |
| 368 | |
| 369 | b.readTx.reset() |
| 370 | b.readTx.tx = b.unsafeBegin(false) |
| 371 | |
| 372 | size := b.readTx.tx.Size() |
| 373 | db := b.db |
| 374 | atomic.StoreInt64(&b.size, size) |
| 375 | atomic.StoreInt64(&b.sizeInUse, size-(int64(db.Stats().FreePageN)*int64(db.Info().PageSize))) |
| 376 | |
| 377 | took := time.Since(now) |
| 378 | defragDurations.Observe(took.Seconds()) |
| 379 | |
| 380 | return nil |
| 381 | } |
| 382 | |
| 383 | func defragdb(odb, tmpdb *bolt.DB, limit int) error { |
| 384 | // open a tx on tmpdb for writes |
| 385 | tmptx, err := tmpdb.Begin(true) |
| 386 | if err != nil { |
| 387 | return err |
| 388 | } |
| 389 | |
| 390 | // open a tx on old db for read |
| 391 | tx, err := odb.Begin(false) |
| 392 | if err != nil { |
| 393 | return err |
| 394 | } |
| 395 | defer tx.Rollback() |
| 396 | |
| 397 | c := tx.Cursor() |
| 398 | |
| 399 | count := 0 |
| 400 | for next, _ := c.First(); next != nil; next, _ = c.Next() { |
| 401 | b := tx.Bucket(next) |
| 402 | if b == nil { |
| 403 | return fmt.Errorf("backend: cannot defrag bucket %s", string(next)) |
| 404 | } |
| 405 | |
| 406 | tmpb, berr := tmptx.CreateBucketIfNotExists(next) |
| 407 | if berr != nil { |
| 408 | return berr |
| 409 | } |
| 410 | tmpb.FillPercent = 0.9 // for seq write in for each |
| 411 | |
| 412 | b.ForEach(func(k, v []byte) error { |
| 413 | count++ |
| 414 | if count > limit { |
| 415 | err = tmptx.Commit() |
| 416 | if err != nil { |
| 417 | return err |
| 418 | } |
| 419 | tmptx, err = tmpdb.Begin(true) |
| 420 | if err != nil { |
| 421 | return err |
| 422 | } |
| 423 | tmpb = tmptx.Bucket(next) |
| 424 | tmpb.FillPercent = 0.9 // for seq write in for each |
| 425 | |
| 426 | count = 0 |
| 427 | } |
| 428 | return tmpb.Put(k, v) |
| 429 | }) |
| 430 | } |
| 431 | |
| 432 | return tmptx.Commit() |
| 433 | } |
| 434 | |
| 435 | func (b *backend) begin(write bool) *bolt.Tx { |
| 436 | b.mu.RLock() |
| 437 | tx := b.unsafeBegin(write) |
| 438 | b.mu.RUnlock() |
| 439 | |
| 440 | size := tx.Size() |
| 441 | db := tx.DB() |
| 442 | atomic.StoreInt64(&b.size, size) |
| 443 | atomic.StoreInt64(&b.sizeInUse, size-(int64(db.Stats().FreePageN)*int64(db.Info().PageSize))) |
| 444 | |
| 445 | return tx |
| 446 | } |
| 447 | |
| 448 | func (b *backend) unsafeBegin(write bool) *bolt.Tx { |
| 449 | tx, err := b.db.Begin(write) |
| 450 | if err != nil { |
| 451 | plog.Fatalf("cannot begin tx (%s)", err) |
| 452 | } |
| 453 | return tx |
| 454 | } |
| 455 | |
| 456 | // NewTmpBackend creates a backend implementation for testing. |
| 457 | func NewTmpBackend(batchInterval time.Duration, batchLimit int) (*backend, string) { |
| 458 | dir, err := ioutil.TempDir(os.TempDir(), "etcd_backend_test") |
| 459 | if err != nil { |
| 460 | plog.Fatal(err) |
| 461 | } |
| 462 | tmpPath := filepath.Join(dir, "database") |
| 463 | bcfg := DefaultBackendConfig() |
| 464 | bcfg.Path, bcfg.BatchInterval, bcfg.BatchLimit = tmpPath, batchInterval, batchLimit |
| 465 | return newBackend(bcfg), tmpPath |
| 466 | } |
| 467 | |
| 468 | func NewDefaultTmpBackend() (*backend, string) { |
| 469 | return NewTmpBackend(defaultBatchInterval, defaultBatchLimit) |
| 470 | } |
| 471 | |
// snapshot is the value returned by (*backend).Snapshot. It embeds the bolt
// transaction it was created from.
type snapshot struct {
	*bolt.Tx
	// stopc is closed by Close to stop the monitoring goroutine started in
	// (*backend).Snapshot.
	stopc chan struct{}
	// donec is closed by that goroutine when it exits.
	donec chan struct{}
}
| 477 | |
| 478 | func (s *snapshot) Close() error { |
| 479 | close(s.stopc) |
| 480 | <-s.donec |
| 481 | return s.Tx.Rollback() |
| 482 | } |