YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/util/failure_detector.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
//
18
// The following only applies to changes made to this file as part of YugaByte development.
19
//
20
// Portions Copyright (c) YugaByte, Inc.
21
//
22
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
23
// in compliance with the License.  You may obtain a copy of the License at
24
//
25
// http://www.apache.org/licenses/LICENSE-2.0
26
//
27
// Unless required by applicable law or agreed to in writing, software distributed under the License
28
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
29
// or implied.  See the License for the specific language governing permissions and limitations
30
// under the License.
31
//
32
33
#ifndef YB_UTIL_FAILURE_DETECTOR_H_
34
#define YB_UTIL_FAILURE_DETECTOR_H_
35
36
#include <stddef.h>
37
38
#include <functional>
39
#include <string>
40
#include <tuple>
41
#include <unordered_map>
42
#include <utility>
43
44
#include <gflags/gflags_declare.h>
45
#include <glog/logging.h>
46
47
#include "yb/gutil/callback.h"
48
#include "yb/gutil/integral_types.h"
49
#include "yb/gutil/logging-inl.h"
50
#include "yb/gutil/macros.h"
51
#include "yb/gutil/ref_counted.h"
52
53
#include "yb/util/status_fwd.h"
54
#include "yb/util/countdown_latch.h"
55
#include "yb/util/locks.h"
56
#include "yb/util/monotime.h"
57
#include "yb/util/random.h"
58
59
namespace yb {
60
class MonoDelta;
61
class MonoTime;
62
class Status;
63
class Thread;
64
65
// A generic interface for failure detector implementations.
66
// A failure detector is responsible for deciding whether a certain server is dead or alive.
67
class FailureDetector : public RefCountedThreadSafe<FailureDetector> {
68
 public:
69
  // Liveness verdict for a tracked node.
  enum NodeStatus {
70
    DEAD,
71
    ALIVE
72
  };
73
  // Associates a string key (presumably a node name -- confirm at use sites) with a NodeStatus.
  typedef std::unordered_map<std::string, NodeStatus> StatusMap;
74
75
  // Signature of the callback that is passed to Track() and triggered when a failure
  // is detected.
  typedef Callback<void(const std::string& name,
76
                        const Status& status)> FailureDetectedCallback;
77
78
1
  virtual ~FailureDetector() {}
79
80
  // Registers a node with 'name' in the failure detector.
81
  //
82
  // If it returns Status::OK() the failure detector will from now on
83
  // expect messages from the machine with 'name' and will trigger
84
  // 'callback' if a failure is detected.
85
  //
86
  // Returns Status::AlreadyPresent() if a machine with 'name' is
87
  // already registered in this failure detector.
88
  // NOTE(review): the role of 'now' is not documented here; presumably it is the time at
  // which tracking starts -- confirm against the implementations.
  virtual CHECKED_STATUS Track(const std::string& name,
89
                       const MonoTime& now,
90
                       const FailureDetectedCallback& callback) = 0;
91
92
  // Stops tracking the node with 'name'.
93
  virtual CHECKED_STATUS UnTrack(const std::string& name) = 0;
94
95
  // Returns true iff the named entity is currently being tracked.
96
  virtual bool IsTracking(const std::string& name) = 0;
97
98
  // Records that a message from the machine with 'name' was received at 'now'.
99
  virtual CHECKED_STATUS MessageFrom(const std::string& name, const MonoTime& now) = 0;
100
101
  // Checks the failure status of each tracked node. If the failure criteria are
102
  // met, the failure callback is invoked.
103
  virtual void CheckForFailures(const MonoTime& now) = 0;
104
};
105
106
// A simple failure detector implementation that considers a node dead
107
// when it has not reported within a certain time interval.
108
class TimedFailureDetector : public FailureDetector {
109
 public:
110
  // Bookkeeping record for one monitorable entity: a name, a timestamp, a failure
  // callback and a NodeStatus.
111
  struct Node {
112
    std::string permanent_name;
113
    // NOTE(review): presumably the time of the most recent MessageFrom() -- confirm in the .cc.
    MonoTime last_heard_of;
114
    FailureDetectedCallback callback;
115
    NodeStatus status;
116
  };
117
118
  // NOTE(review): 'failure_period' is presumably the interval without messages after which
  // a node is considered dead -- confirm against the implementation.
  explicit TimedFailureDetector(MonoDelta failure_period);
119
  virtual ~TimedFailureDetector();
120
121
  virtual CHECKED_STATUS Track(const std::string& name,
122
                       const MonoTime& now,
123
                       const FailureDetectedCallback& callback) override;
124
125
  virtual CHECKED_STATUS UnTrack(const std::string& name) override;
126
127
  virtual bool IsTracking(const std::string& name) override;
128
129
  virtual CHECKED_STATUS MessageFrom(const std::string& name, const MonoTime& now) override;
130
131
  virtual void CheckForFailures(const MonoTime& now) override;
132
133
 private:
134
  // Maps a string key to a Node pointer.
  // NOTE(review): the values are raw Node*; who owns and frees them is not visible in this
  // header -- confirm in the .cc.
  typedef std::unordered_map<std::string, Node*> NodeMap;
135
136
  // Returns the NodeStatus for the entry with 'name', given the time 'now'.
137
  // Does not invoke the callback.
138
  // NOTE(review): the "Unlocked" suffix suggests the caller must already hold lock_ -- confirm.
  FailureDetector::NodeStatus GetNodeStatusUnlocked(const std::string& name,
139
                                                    const MonoTime& now);
140
141
  const MonoDelta failure_period_;
142
  // NOTE(review): presumably guards nodes_ -- confirm in the .cc.
  mutable simple_spinlock lock_;
143
  NodeMap nodes_;
144
145
  DISALLOW_COPY_AND_ASSIGN(TimedFailureDetector);
146
};
147
148
// A randomized failure monitor that wakes up at normally distributed intervals
149
// and runs CheckForFailures() on each failure detector it monitors.
150
//
151
// The wake up interval is defined by a normal distribution with the specified
152
// mean and standard deviation, in milliseconds, with minimum possible value
153
// pinned at kMinWakeUpTimeMillis.
154
//
155
// We use a random wake up interval to avoid thundering herd / lockstep problems
156
// when multiple nodes react to the failure of another node.
157
class RandomizedFailureMonitor {
158
 public:
159
  // The minimum time, in milliseconds, that the failure monitor will wait between wake-ups.
160
  static const int64_t kMinWakeUpTimeMillis;
161
162
  // 'period_mean_millis' and 'period_std_dev_millis' are the mean and standard deviation,
  // in milliseconds, of the wake-up interval.
  // NOTE(review): 'random_seed' presumably seeds random_ -- confirm in the .cc.
  RandomizedFailureMonitor(uint32_t random_seed,
163
                           int64_t period_mean_millis,
164
                           int64_t period_std_dev_millis);
165
  ~RandomizedFailureMonitor();
166
167
  // Starts the failure monitor.
168
  CHECKED_STATUS Start();
169
170
  // Stops the failure monitor.
171
  void Shutdown();
172
173
  // Adds a failure detector to be monitored under 'name'.
174
  CHECKED_STATUS MonitorFailureDetector(const std::string& name,
175
                                const scoped_refptr<FailureDetector>& fd);
176
177
  // Stops monitoring the failure detector with the specified name.
178
  CHECKED_STATUS UnmonitorFailureDetector(const std::string& name);
179
180
 private:
181
  // Maps a name to a ref-counted FailureDetector.
  typedef std::unordered_map<std::string, scoped_refptr<FailureDetector> > FDMap;
182
183
  // Runs the monitor thread.
184
  void RunThread();
185
186
  // Mean & std. deviation of the random period to sleep for between checking the
187
  // failure detectors.
188
  const int64_t period_mean_millis_;
189
  const int64_t period_stddev_millis_;
190
  ThreadSafeRandom random_;
191
192
  scoped_refptr<Thread> thread_;
193
  CountDownLatch run_latch_;
194
195
  // NOTE(review): presumably guards fds_ and shutdown_ -- confirm in the .cc.
  mutable simple_spinlock lock_;
196
  FDMap fds_;
197
  bool shutdown_; // Whether the failure monitor should shut down.
198
199
  DISALLOW_COPY_AND_ASSIGN(RandomizedFailureMonitor);
200
};
201
202
}  // namespace yb
203
204
#endif /* YB_UTIL_FAILURE_DETECTOR_H_ */