blob: 6be574590e0c8e1b1f59d3c24d4331ca9c28ab03 [file] [log] [blame]
Stephane Barbarie260a5632019-02-26 16:12:49 -05001// Copyright 2015 The etcd Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package raft
16
17import (
18 "errors"
19 "sync"
20
21 pb "go.etcd.io/etcd/raft/raftpb"
22)
23
24// ErrCompacted is returned by Storage.Entries/Compact when a requested
25// index is unavailable because it predates the last snapshot.
26var ErrCompacted = errors.New("requested index is unavailable due to compaction")
27
28// ErrSnapOutOfDate is returned by Storage.CreateSnapshot when a requested
29// index is older than the existing snapshot.
30var ErrSnapOutOfDate = errors.New("requested index is older than the existing snapshot")
31
32// ErrUnavailable is returned by Storage interface when the requested log entries
33// are unavailable.
34var ErrUnavailable = errors.New("requested entry at index is unavailable")
35
36// ErrSnapshotTemporarilyUnavailable is returned by the Storage interface when the required
37// snapshot is temporarily unavailable.
38var ErrSnapshotTemporarilyUnavailable = errors.New("snapshot is temporarily unavailable")
39
40// Storage is an interface that may be implemented by the application
41// to retrieve log entries from storage.
42//
43// If any Storage method returns an error, the raft instance will
44// become inoperable and refuse to participate in elections; the
45// application is responsible for cleanup and recovery in this case.
46type Storage interface {
Scott Bakerbeb3cfa2019-10-01 14:44:30 -070047 // TODO(tbg): split this into two interfaces, LogStorage and StateStorage.
48
Stephane Barbarie260a5632019-02-26 16:12:49 -050049 // InitialState returns the saved HardState and ConfState information.
50 InitialState() (pb.HardState, pb.ConfState, error)
51 // Entries returns a slice of log entries in the range [lo,hi).
52 // MaxSize limits the total size of the log entries returned, but
53 // Entries returns at least one entry if any.
54 Entries(lo, hi, maxSize uint64) ([]pb.Entry, error)
55 // Term returns the term of entry i, which must be in the range
56 // [FirstIndex()-1, LastIndex()]. The term of the entry before
57 // FirstIndex is retained for matching purposes even though the
58 // rest of that entry may not be available.
59 Term(i uint64) (uint64, error)
60 // LastIndex returns the index of the last entry in the log.
61 LastIndex() (uint64, error)
62 // FirstIndex returns the index of the first log entry that is
63 // possibly available via Entries (older entries have been incorporated
64 // into the latest Snapshot; if storage only contains the dummy entry the
65 // first log entry is not available).
66 FirstIndex() (uint64, error)
67 // Snapshot returns the most recent snapshot.
68 // If snapshot is temporarily unavailable, it should return ErrSnapshotTemporarilyUnavailable,
69 // so raft state machine could know that Storage needs some time to prepare
70 // snapshot and call Snapshot later.
71 Snapshot() (pb.Snapshot, error)
72}
73
74// MemoryStorage implements the Storage interface backed by an
75// in-memory array.
76type MemoryStorage struct {
77 // Protects access to all fields. Most methods of MemoryStorage are
78 // run on the raft goroutine, but Append() is run on an application
79 // goroutine.
80 sync.Mutex
81
82 hardState pb.HardState
83 snapshot pb.Snapshot
84 // ents[i] has raft log position i+snapshot.Metadata.Index
85 ents []pb.Entry
86}
87
88// NewMemoryStorage creates an empty MemoryStorage.
89func NewMemoryStorage() *MemoryStorage {
90 return &MemoryStorage{
91 // When starting from scratch populate the list with a dummy entry at term zero.
92 ents: make([]pb.Entry, 1),
93 }
94}
95
96// InitialState implements the Storage interface.
97func (ms *MemoryStorage) InitialState() (pb.HardState, pb.ConfState, error) {
98 return ms.hardState, ms.snapshot.Metadata.ConfState, nil
99}
100
101// SetHardState saves the current HardState.
102func (ms *MemoryStorage) SetHardState(st pb.HardState) error {
103 ms.Lock()
104 defer ms.Unlock()
105 ms.hardState = st
106 return nil
107}
108
109// Entries implements the Storage interface.
110func (ms *MemoryStorage) Entries(lo, hi, maxSize uint64) ([]pb.Entry, error) {
111 ms.Lock()
112 defer ms.Unlock()
113 offset := ms.ents[0].Index
114 if lo <= offset {
115 return nil, ErrCompacted
116 }
117 if hi > ms.lastIndex()+1 {
118 raftLogger.Panicf("entries' hi(%d) is out of bound lastindex(%d)", hi, ms.lastIndex())
119 }
120 // only contains dummy entries.
121 if len(ms.ents) == 1 {
122 return nil, ErrUnavailable
123 }
124
125 ents := ms.ents[lo-offset : hi-offset]
126 return limitSize(ents, maxSize), nil
127}
128
129// Term implements the Storage interface.
130func (ms *MemoryStorage) Term(i uint64) (uint64, error) {
131 ms.Lock()
132 defer ms.Unlock()
133 offset := ms.ents[0].Index
134 if i < offset {
135 return 0, ErrCompacted
136 }
137 if int(i-offset) >= len(ms.ents) {
138 return 0, ErrUnavailable
139 }
140 return ms.ents[i-offset].Term, nil
141}
142
143// LastIndex implements the Storage interface.
144func (ms *MemoryStorage) LastIndex() (uint64, error) {
145 ms.Lock()
146 defer ms.Unlock()
147 return ms.lastIndex(), nil
148}
149
150func (ms *MemoryStorage) lastIndex() uint64 {
151 return ms.ents[0].Index + uint64(len(ms.ents)) - 1
152}
153
154// FirstIndex implements the Storage interface.
155func (ms *MemoryStorage) FirstIndex() (uint64, error) {
156 ms.Lock()
157 defer ms.Unlock()
158 return ms.firstIndex(), nil
159}
160
161func (ms *MemoryStorage) firstIndex() uint64 {
162 return ms.ents[0].Index + 1
163}
164
165// Snapshot implements the Storage interface.
166func (ms *MemoryStorage) Snapshot() (pb.Snapshot, error) {
167 ms.Lock()
168 defer ms.Unlock()
169 return ms.snapshot, nil
170}
171
172// ApplySnapshot overwrites the contents of this Storage object with
173// those of the given snapshot.
174func (ms *MemoryStorage) ApplySnapshot(snap pb.Snapshot) error {
175 ms.Lock()
176 defer ms.Unlock()
177
178 //handle check for old snapshot being applied
179 msIndex := ms.snapshot.Metadata.Index
180 snapIndex := snap.Metadata.Index
181 if msIndex >= snapIndex {
182 return ErrSnapOutOfDate
183 }
184
185 ms.snapshot = snap
186 ms.ents = []pb.Entry{{Term: snap.Metadata.Term, Index: snap.Metadata.Index}}
187 return nil
188}
189
190// CreateSnapshot makes a snapshot which can be retrieved with Snapshot() and
191// can be used to reconstruct the state at that point.
192// If any configuration changes have been made since the last compaction,
193// the result of the last ApplyConfChange must be passed in.
194func (ms *MemoryStorage) CreateSnapshot(i uint64, cs *pb.ConfState, data []byte) (pb.Snapshot, error) {
195 ms.Lock()
196 defer ms.Unlock()
197 if i <= ms.snapshot.Metadata.Index {
198 return pb.Snapshot{}, ErrSnapOutOfDate
199 }
200
201 offset := ms.ents[0].Index
202 if i > ms.lastIndex() {
203 raftLogger.Panicf("snapshot %d is out of bound lastindex(%d)", i, ms.lastIndex())
204 }
205
206 ms.snapshot.Metadata.Index = i
207 ms.snapshot.Metadata.Term = ms.ents[i-offset].Term
208 if cs != nil {
209 ms.snapshot.Metadata.ConfState = *cs
210 }
211 ms.snapshot.Data = data
212 return ms.snapshot, nil
213}
214
215// Compact discards all log entries prior to compactIndex.
216// It is the application's responsibility to not attempt to compact an index
217// greater than raftLog.applied.
218func (ms *MemoryStorage) Compact(compactIndex uint64) error {
219 ms.Lock()
220 defer ms.Unlock()
221 offset := ms.ents[0].Index
222 if compactIndex <= offset {
223 return ErrCompacted
224 }
225 if compactIndex > ms.lastIndex() {
226 raftLogger.Panicf("compact %d is out of bound lastindex(%d)", compactIndex, ms.lastIndex())
227 }
228
229 i := compactIndex - offset
230 ents := make([]pb.Entry, 1, 1+uint64(len(ms.ents))-i)
231 ents[0].Index = ms.ents[i].Index
232 ents[0].Term = ms.ents[i].Term
233 ents = append(ents, ms.ents[i+1:]...)
234 ms.ents = ents
235 return nil
236}
237
238// Append the new entries to storage.
239// TODO (xiangli): ensure the entries are continuous and
240// entries[0].Index > ms.entries[0].Index
241func (ms *MemoryStorage) Append(entries []pb.Entry) error {
242 if len(entries) == 0 {
243 return nil
244 }
245
246 ms.Lock()
247 defer ms.Unlock()
248
249 first := ms.firstIndex()
250 last := entries[0].Index + uint64(len(entries)) - 1
251
252 // shortcut if there is no new entry.
253 if last < first {
254 return nil
255 }
256 // truncate compacted entries
257 if first > entries[0].Index {
258 entries = entries[first-entries[0].Index:]
259 }
260
261 offset := entries[0].Index - ms.ents[0].Index
262 switch {
263 case uint64(len(ms.ents)) > offset:
264 ms.ents = append([]pb.Entry{}, ms.ents[:offset]...)
265 ms.ents = append(ms.ents, entries...)
266 case uint64(len(ms.ents)) == offset:
267 ms.ents = append(ms.ents, entries...)
268 default:
269 raftLogger.Panicf("missing log entry [last: %d, append at: %d]",
270 ms.lastIndex(), entries[0].Index)
271 }
272 return nil
273}