blob: d0e0c81e2090d0fd819ac796bff741da7f460239 [file] [log] [blame]
khenaidooab1f7bd2019-11-14 14:00:27 -05001// Copyright 2015 The etcd Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package rafthttp
16
17import (
18 "context"
19 "errors"
20 "fmt"
21 "io/ioutil"
22 "net/http"
23 "path"
24 "strings"
25 "time"
26
27 "go.etcd.io/etcd/etcdserver/api/snap"
28 pioutil "go.etcd.io/etcd/pkg/ioutil"
29 "go.etcd.io/etcd/pkg/types"
30 "go.etcd.io/etcd/raft/raftpb"
31 "go.etcd.io/etcd/version"
32
33 humanize "github.com/dustin/go-humanize"
34 "go.uber.org/zap"
35)
36
const (
	// connReadLimitByte caps how many bytes a single read from a
	// connection may return.
	//
	// 64KB is meant to be big enough that it does not become a
	// throughput bottleneck, yet small enough that one read does not
	// run into a read timeout.
	connReadLimitByte = 64 << 10
)
46
// RaftPrefix is the root URL path of the raft endpoints; the remaining
// prefixes are sub-paths joined beneath it.
var (
	RaftPrefix         = "/raft"
	ProbingPrefix      = path.Join(RaftPrefix, "probing")
	RaftStreamPrefix   = path.Join(RaftPrefix, "stream")
	RaftSnapshotPrefix = path.Join(RaftPrefix, "snapshot")
)

// Sentinel errors returned by checkClusterCompatibilityFromHeader.
var (
	errIncompatibleVersion = errors.New("incompatible version")
	errClusterIDMismatch   = errors.New("cluster ID mismatch")
)
56
// peerGetter looks up a Peer by member ID.
//
// streamHandler uses it to find the peer that an incoming stream request
// claims to come from; a nil result is treated there as "sender not found".
type peerGetter interface {
	Get(id types.ID) Peer
}
60
// writerToResponse is implemented by errors that can render themselves
// onto an HTTP response.
//
// The handlers in this file test errors returned by Raft.Process against
// this interface and, on a match, let the error write the response itself.
type writerToResponse interface {
	WriteTo(w http.ResponseWriter)
}
64
65type pipelineHandler struct {
66 lg *zap.Logger
67 localID types.ID
68 tr Transporter
69 r Raft
70 cid types.ID
71}
72
73// newPipelineHandler returns a handler for handling raft messages
74// from pipeline for RaftPrefix.
75//
76// The handler reads out the raft message from request body,
77// and forwards it to the given raft state machine for processing.
78func newPipelineHandler(t *Transport, r Raft, cid types.ID) http.Handler {
79 return &pipelineHandler{
80 lg: t.Logger,
81 localID: t.ID,
82 tr: t,
83 r: r,
84 cid: cid,
85 }
86}
87
88func (h *pipelineHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
89 if r.Method != "POST" {
90 w.Header().Set("Allow", "POST")
91 http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
92 return
93 }
94
95 w.Header().Set("X-Etcd-Cluster-ID", h.cid.String())
96
97 if err := checkClusterCompatibilityFromHeader(h.lg, h.localID, r.Header, h.cid); err != nil {
98 http.Error(w, err.Error(), http.StatusPreconditionFailed)
99 return
100 }
101
102 addRemoteFromRequest(h.tr, r)
103
104 // Limit the data size that could be read from the request body, which ensures that read from
105 // connection will not time out accidentally due to possible blocking in underlying implementation.
106 limitedr := pioutil.NewLimitedBufferReader(r.Body, connReadLimitByte)
107 b, err := ioutil.ReadAll(limitedr)
108 if err != nil {
109 if h.lg != nil {
110 h.lg.Warn(
111 "failed to read Raft message",
112 zap.String("local-member-id", h.localID.String()),
113 zap.Error(err),
114 )
115 } else {
116 plog.Errorf("failed to read raft message (%v)", err)
117 }
118 http.Error(w, "error reading raft message", http.StatusBadRequest)
119 recvFailures.WithLabelValues(r.RemoteAddr).Inc()
120 return
121 }
122
123 var m raftpb.Message
124 if err := m.Unmarshal(b); err != nil {
125 if h.lg != nil {
126 h.lg.Warn(
127 "failed to unmarshal Raft message",
128 zap.String("local-member-id", h.localID.String()),
129 zap.Error(err),
130 )
131 } else {
132 plog.Errorf("failed to unmarshal raft message (%v)", err)
133 }
134 http.Error(w, "error unmarshalling raft message", http.StatusBadRequest)
135 recvFailures.WithLabelValues(r.RemoteAddr).Inc()
136 return
137 }
138
139 receivedBytes.WithLabelValues(types.ID(m.From).String()).Add(float64(len(b)))
140
141 if err := h.r.Process(context.TODO(), m); err != nil {
142 switch v := err.(type) {
143 case writerToResponse:
144 v.WriteTo(w)
145 default:
146 if h.lg != nil {
147 h.lg.Warn(
148 "failed to process Raft message",
149 zap.String("local-member-id", h.localID.String()),
150 zap.Error(err),
151 )
152 } else {
153 plog.Warningf("failed to process raft message (%v)", err)
154 }
155 http.Error(w, "error processing raft message", http.StatusInternalServerError)
156 w.(http.Flusher).Flush()
157 // disconnect the http stream
158 panic(err)
159 }
160 return
161 }
162
163 // Write StatusNoContent header after the message has been processed by
164 // raft, which facilitates the client to report MsgSnap status.
165 w.WriteHeader(http.StatusNoContent)
166}
167
168type snapshotHandler struct {
169 lg *zap.Logger
170 tr Transporter
171 r Raft
172 snapshotter *snap.Snapshotter
173
174 localID types.ID
175 cid types.ID
176}
177
178func newSnapshotHandler(t *Transport, r Raft, snapshotter *snap.Snapshotter, cid types.ID) http.Handler {
179 return &snapshotHandler{
180 lg: t.Logger,
181 tr: t,
182 r: r,
183 snapshotter: snapshotter,
184 localID: t.ID,
185 cid: cid,
186 }
187}
188
const (
	// unknownSnapshotSender is the label value used for the
	// snapshot-receive failure metric when a request is rejected before
	// the sender's ID has been decoded from the message.
	unknownSnapshotSender = "UNKNOWN_SNAPSHOT_SENDER"
)
190
191// ServeHTTP serves HTTP request to receive and process snapshot message.
192//
193// If request sender dies without closing underlying TCP connection,
194// the handler will keep waiting for the request body until TCP keepalive
195// finds out that the connection is broken after several minutes.
196// This is acceptable because
197// 1. snapshot messages sent through other TCP connections could still be
198// received and processed.
199// 2. this case should happen rarely, so no further optimization is done.
200func (h *snapshotHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
201 start := time.Now()
202
203 if r.Method != "POST" {
204 w.Header().Set("Allow", "POST")
205 http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
206 snapshotReceiveFailures.WithLabelValues(unknownSnapshotSender).Inc()
207 return
208 }
209
210 w.Header().Set("X-Etcd-Cluster-ID", h.cid.String())
211
212 if err := checkClusterCompatibilityFromHeader(h.lg, h.localID, r.Header, h.cid); err != nil {
213 http.Error(w, err.Error(), http.StatusPreconditionFailed)
214 snapshotReceiveFailures.WithLabelValues(unknownSnapshotSender).Inc()
215 return
216 }
217
218 addRemoteFromRequest(h.tr, r)
219
220 dec := &messageDecoder{r: r.Body}
221 // let snapshots be very large since they can exceed 512MB for large installations
222 m, err := dec.decodeLimit(uint64(1 << 63))
223 from := types.ID(m.From).String()
224 if err != nil {
225 msg := fmt.Sprintf("failed to decode raft message (%v)", err)
226 if h.lg != nil {
227 h.lg.Warn(
228 "failed to decode Raft message",
229 zap.String("local-member-id", h.localID.String()),
230 zap.String("remote-snapshot-sender-id", from),
231 zap.Error(err),
232 )
233 } else {
234 plog.Error(msg)
235 }
236 http.Error(w, msg, http.StatusBadRequest)
237 recvFailures.WithLabelValues(r.RemoteAddr).Inc()
238 snapshotReceiveFailures.WithLabelValues(from).Inc()
239 return
240 }
241
242 msgSize := m.Size()
243 receivedBytes.WithLabelValues(from).Add(float64(msgSize))
244
245 if m.Type != raftpb.MsgSnap {
246 if h.lg != nil {
247 h.lg.Warn(
248 "unexpected Raft message type",
249 zap.String("local-member-id", h.localID.String()),
250 zap.String("remote-snapshot-sender-id", from),
251 zap.String("message-type", m.Type.String()),
252 )
253 } else {
254 plog.Errorf("unexpected raft message type %s on snapshot path", m.Type)
255 }
256 http.Error(w, "wrong raft message type", http.StatusBadRequest)
257 snapshotReceiveFailures.WithLabelValues(from).Inc()
258 return
259 }
260
261 snapshotReceiveInflights.WithLabelValues(from).Inc()
262 defer func() {
263 snapshotReceiveInflights.WithLabelValues(from).Dec()
264 }()
265
266 if h.lg != nil {
267 h.lg.Info(
268 "receiving database snapshot",
269 zap.String("local-member-id", h.localID.String()),
270 zap.String("remote-snapshot-sender-id", from),
271 zap.Uint64("incoming-snapshot-index", m.Snapshot.Metadata.Index),
272 zap.Int("incoming-snapshot-message-size-bytes", msgSize),
273 zap.String("incoming-snapshot-message-size", humanize.Bytes(uint64(msgSize))),
274 )
275 } else {
276 plog.Infof("receiving database snapshot [index:%d, from %s] ...", m.Snapshot.Metadata.Index, types.ID(m.From))
277 }
278
279 // save incoming database snapshot.
280 n, err := h.snapshotter.SaveDBFrom(r.Body, m.Snapshot.Metadata.Index)
281 if err != nil {
282 msg := fmt.Sprintf("failed to save KV snapshot (%v)", err)
283 if h.lg != nil {
284 h.lg.Warn(
285 "failed to save incoming database snapshot",
286 zap.String("local-member-id", h.localID.String()),
287 zap.String("remote-snapshot-sender-id", from),
288 zap.Uint64("incoming-snapshot-index", m.Snapshot.Metadata.Index),
289 zap.Error(err),
290 )
291 } else {
292 plog.Error(msg)
293 }
294 http.Error(w, msg, http.StatusInternalServerError)
295 snapshotReceiveFailures.WithLabelValues(from).Inc()
296 return
297 }
298
299 receivedBytes.WithLabelValues(from).Add(float64(n))
300
301 if h.lg != nil {
302 h.lg.Info(
303 "received and saved database snapshot",
304 zap.String("local-member-id", h.localID.String()),
305 zap.String("remote-snapshot-sender-id", from),
306 zap.Uint64("incoming-snapshot-index", m.Snapshot.Metadata.Index),
307 zap.Int64("incoming-snapshot-size-bytes", n),
308 zap.String("incoming-snapshot-size", humanize.Bytes(uint64(n))),
309 )
310 } else {
311 plog.Infof("received and saved database snapshot [index: %d, from: %s] successfully", m.Snapshot.Metadata.Index, types.ID(m.From))
312 }
313
314 if err := h.r.Process(context.TODO(), m); err != nil {
315 switch v := err.(type) {
316 // Process may return writerToResponse error when doing some
317 // additional checks before calling raft.Node.Step.
318 case writerToResponse:
319 v.WriteTo(w)
320 default:
321 msg := fmt.Sprintf("failed to process raft message (%v)", err)
322 if h.lg != nil {
323 h.lg.Warn(
324 "failed to process Raft message",
325 zap.String("local-member-id", h.localID.String()),
326 zap.String("remote-snapshot-sender-id", from),
327 zap.Error(err),
328 )
329 } else {
330 plog.Error(msg)
331 }
332 http.Error(w, msg, http.StatusInternalServerError)
333 snapshotReceiveFailures.WithLabelValues(from).Inc()
334 }
335 return
336 }
337
338 // Write StatusNoContent header after the message has been processed by
339 // raft, which facilitates the client to report MsgSnap status.
340 w.WriteHeader(http.StatusNoContent)
341
342 snapshotReceive.WithLabelValues(from).Inc()
343 snapshotReceiveSeconds.WithLabelValues(from).Observe(time.Since(start).Seconds())
344}
345
346type streamHandler struct {
347 lg *zap.Logger
348 tr *Transport
349 peerGetter peerGetter
350 r Raft
351 id types.ID
352 cid types.ID
353}
354
355func newStreamHandler(t *Transport, pg peerGetter, r Raft, id, cid types.ID) http.Handler {
356 return &streamHandler{
357 lg: t.Logger,
358 tr: t,
359 peerGetter: pg,
360 r: r,
361 id: id,
362 cid: cid,
363 }
364}
365
366func (h *streamHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
367 if r.Method != "GET" {
368 w.Header().Set("Allow", "GET")
369 http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
370 return
371 }
372
373 w.Header().Set("X-Server-Version", version.Version)
374 w.Header().Set("X-Etcd-Cluster-ID", h.cid.String())
375
376 if err := checkClusterCompatibilityFromHeader(h.lg, h.tr.ID, r.Header, h.cid); err != nil {
377 http.Error(w, err.Error(), http.StatusPreconditionFailed)
378 return
379 }
380
381 var t streamType
382 switch path.Dir(r.URL.Path) {
383 case streamTypeMsgAppV2.endpoint():
384 t = streamTypeMsgAppV2
385 case streamTypeMessage.endpoint():
386 t = streamTypeMessage
387 default:
388 if h.lg != nil {
389 h.lg.Debug(
390 "ignored unexpected streaming request path",
391 zap.String("local-member-id", h.tr.ID.String()),
392 zap.String("remote-peer-id-stream-handler", h.id.String()),
393 zap.String("path", r.URL.Path),
394 )
395 } else {
396 plog.Debugf("ignored unexpected streaming request path %s", r.URL.Path)
397 }
398 http.Error(w, "invalid path", http.StatusNotFound)
399 return
400 }
401
402 fromStr := path.Base(r.URL.Path)
403 from, err := types.IDFromString(fromStr)
404 if err != nil {
405 if h.lg != nil {
406 h.lg.Warn(
407 "failed to parse path into ID",
408 zap.String("local-member-id", h.tr.ID.String()),
409 zap.String("remote-peer-id-stream-handler", h.id.String()),
410 zap.String("path", fromStr),
411 zap.Error(err),
412 )
413 } else {
414 plog.Errorf("failed to parse from %s into ID (%v)", fromStr, err)
415 }
416 http.Error(w, "invalid from", http.StatusNotFound)
417 return
418 }
419 if h.r.IsIDRemoved(uint64(from)) {
420 if h.lg != nil {
421 h.lg.Warn(
422 "rejected stream from remote peer because it was removed",
423 zap.String("local-member-id", h.tr.ID.String()),
424 zap.String("remote-peer-id-stream-handler", h.id.String()),
425 zap.String("remote-peer-id-from", from.String()),
426 )
427 } else {
428 plog.Warningf("rejected the stream from peer %s since it was removed", from)
429 }
430 http.Error(w, "removed member", http.StatusGone)
431 return
432 }
433 p := h.peerGetter.Get(from)
434 if p == nil {
435 // This may happen in following cases:
436 // 1. user starts a remote peer that belongs to a different cluster
437 // with the same cluster ID.
438 // 2. local etcd falls behind of the cluster, and cannot recognize
439 // the members that joined after its current progress.
440 if urls := r.Header.Get("X-PeerURLs"); urls != "" {
441 h.tr.AddRemote(from, strings.Split(urls, ","))
442 }
443 if h.lg != nil {
444 h.lg.Warn(
445 "failed to find remote peer in cluster",
446 zap.String("local-member-id", h.tr.ID.String()),
447 zap.String("remote-peer-id-stream-handler", h.id.String()),
448 zap.String("remote-peer-id-from", from.String()),
449 zap.String("cluster-id", h.cid.String()),
450 )
451 } else {
452 plog.Errorf("failed to find member %s in cluster %s", from, h.cid)
453 }
454 http.Error(w, "error sender not found", http.StatusNotFound)
455 return
456 }
457
458 wto := h.id.String()
459 if gto := r.Header.Get("X-Raft-To"); gto != wto {
460 if h.lg != nil {
461 h.lg.Warn(
462 "ignored streaming request; ID mismatch",
463 zap.String("local-member-id", h.tr.ID.String()),
464 zap.String("remote-peer-id-stream-handler", h.id.String()),
465 zap.String("remote-peer-id-header", gto),
466 zap.String("remote-peer-id-from", from.String()),
467 zap.String("cluster-id", h.cid.String()),
468 )
469 } else {
470 plog.Errorf("streaming request ignored (ID mismatch got %s want %s)", gto, wto)
471 }
472 http.Error(w, "to field mismatch", http.StatusPreconditionFailed)
473 return
474 }
475
476 w.WriteHeader(http.StatusOK)
477 w.(http.Flusher).Flush()
478
479 c := newCloseNotifier()
480 conn := &outgoingConn{
481 t: t,
482 Writer: w,
483 Flusher: w.(http.Flusher),
484 Closer: c,
485 localID: h.tr.ID,
486 peerID: h.id,
487 }
488 p.attachOutgoingConn(conn)
489 <-c.closeNotify()
490}
491
492// checkClusterCompatibilityFromHeader checks the cluster compatibility of
493// the local member from the given header.
494// It checks whether the version of local member is compatible with
495// the versions in the header, and whether the cluster ID of local member
496// matches the one in the header.
497func checkClusterCompatibilityFromHeader(lg *zap.Logger, localID types.ID, header http.Header, cid types.ID) error {
498 remoteName := header.Get("X-Server-From")
499
500 remoteServer := serverVersion(header)
501 remoteVs := ""
502 if remoteServer != nil {
503 remoteVs = remoteServer.String()
504 }
505
506 remoteMinClusterVer := minClusterVersion(header)
507 remoteMinClusterVs := ""
508 if remoteMinClusterVer != nil {
509 remoteMinClusterVs = remoteMinClusterVer.String()
510 }
511
512 localServer, localMinCluster, err := checkVersionCompatibility(remoteName, remoteServer, remoteMinClusterVer)
513
514 localVs := ""
515 if localServer != nil {
516 localVs = localServer.String()
517 }
518 localMinClusterVs := ""
519 if localMinCluster != nil {
520 localMinClusterVs = localMinCluster.String()
521 }
522
523 if err != nil {
524 if lg != nil {
525 lg.Warn(
526 "failed to check version compatibility",
527 zap.String("local-member-id", localID.String()),
528 zap.String("local-member-cluster-id", cid.String()),
529 zap.String("local-member-server-version", localVs),
530 zap.String("local-member-server-minimum-cluster-version", localMinClusterVs),
531 zap.String("remote-peer-server-name", remoteName),
532 zap.String("remote-peer-server-version", remoteVs),
533 zap.String("remote-peer-server-minimum-cluster-version", remoteMinClusterVs),
534 zap.Error(err),
535 )
536 } else {
537 plog.Errorf("request version incompatibility (%v)", err)
538 }
539 return errIncompatibleVersion
540 }
541 if gcid := header.Get("X-Etcd-Cluster-ID"); gcid != cid.String() {
542 if lg != nil {
543 lg.Warn(
544 "request cluster ID mismatch",
545 zap.String("local-member-id", localID.String()),
546 zap.String("local-member-cluster-id", cid.String()),
547 zap.String("local-member-server-version", localVs),
548 zap.String("local-member-server-minimum-cluster-version", localMinClusterVs),
549 zap.String("remote-peer-server-name", remoteName),
550 zap.String("remote-peer-server-version", remoteVs),
551 zap.String("remote-peer-server-minimum-cluster-version", remoteMinClusterVs),
552 zap.String("remote-peer-cluster-id", gcid),
553 )
554 } else {
555 plog.Errorf("request cluster ID mismatch (got %s want %s)", gcid, cid)
556 }
557 return errClusterIDMismatch
558 }
559 return nil
560}
561
// closeNotifier turns a Close call into a channel signal: the channel
// returned by closeNotify becomes readable once Close has been called.
type closeNotifier struct {
	done chan struct{}
}

// newCloseNotifier returns a closeNotifier whose channel is still open.
func newCloseNotifier() *closeNotifier {
	n := new(closeNotifier)
	n.done = make(chan struct{})
	return n
}

// Close closes the notification channel, releasing every receiver
// waiting on closeNotify. It always returns nil.
func (n *closeNotifier) Close() error {
	close(n.done)
	return nil
}

// closeNotify returns the receive-only channel that is closed by Close.
func (n *closeNotifier) closeNotify() <-chan struct{} {
	return n.done
}