/Users/deen/code/yugabyte-db/src/yb/util/failure_detector.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // |
18 | | // The following only applies to changes made to this file as part of YugaByte development. |
19 | | // |
20 | | // Portions Copyright (c) YugaByte, Inc. |
21 | | // |
22 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
23 | | // in compliance with the License. You may obtain a copy of the License at |
24 | | // |
25 | | // http://www.apache.org/licenses/LICENSE-2.0 |
26 | | // |
27 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
28 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
29 | | // or implied. See the License for the specific language governing permissions and limitations |
30 | | // under the License. |
31 | | // |
32 | | |
33 | | #ifndef YB_UTIL_FAILURE_DETECTOR_H_ |
34 | | #define YB_UTIL_FAILURE_DETECTOR_H_ |
35 | | |
36 | | #include <stddef.h> |
37 | | |
38 | | #include <functional> |
39 | | #include <string> |
40 | | #include <tuple> |
41 | | #include <unordered_map> |
42 | | #include <utility> |
43 | | |
44 | | #include <gflags/gflags_declare.h> |
45 | | #include <glog/logging.h> |
46 | | |
47 | | #include "yb/gutil/callback.h" |
48 | | #include "yb/gutil/integral_types.h" |
49 | | #include "yb/gutil/logging-inl.h" |
50 | | #include "yb/gutil/macros.h" |
51 | | #include "yb/gutil/ref_counted.h" |
52 | | |
53 | | #include "yb/util/status_fwd.h" |
54 | | #include "yb/util/countdown_latch.h" |
55 | | #include "yb/util/locks.h" |
56 | | #include "yb/util/monotime.h" |
57 | | #include "yb/util/random.h" |
58 | | |
59 | | namespace yb { |
60 | | class MonoDelta; |
61 | | class MonoTime; |
62 | | class Status; |
63 | | class Thread; |
64 | | |
65 | | // A generic, reference-counted interface for failure detector implementations. |
66 | | // A failure detector is responsible for deciding whether a given server is dead or alive. |
67 | | class FailureDetector : public RefCountedThreadSafe<FailureDetector> { |
68 | | public: |
69 | | enum NodeStatus { |
70 | | DEAD, |
71 | | ALIVE |
72 | | }; |
73 | | typedef std::unordered_map<std::string, NodeStatus> StatusMap; |
74 | | |
75 | | typedef Callback<void(const std::string& name, |
76 | | const Status& status)> FailureDetectedCallback; |
77 | | |
78 | 1 | virtual ~FailureDetector() {} |
79 | | |
80 | | // Registers a node with 'name' in the failure detector, starting at time 'now'. |
81 | | // |
82 | | // If it returns Status::OK(), the failure detector will from now on |
83 | | // expect messages from the machine with 'name' and will trigger |
84 | | // 'callback' if a failure is detected. |
85 | | // |
86 | | // Returns Status::AlreadyPresent() if a machine with 'name' is |
87 | | // already registered in this failure detector. |
88 | | virtual CHECKED_STATUS Track(const std::string& name, |
89 | | const MonoTime& now, |
90 | | const FailureDetectedCallback& callback) = 0; |
91 | | |
92 | | // Stops tracking the node with 'name'. |
93 | | virtual CHECKED_STATUS UnTrack(const std::string& name) = 0; |
94 | | |
95 | | // Returns true iff the entity with 'name' is currently being tracked. |
96 | | virtual bool IsTracking(const std::string& name) = 0; |
97 | | |
98 | | // Records that a message from the machine with 'name' was received at 'now'. |
99 | | virtual CHECKED_STATUS MessageFrom(const std::string& name, const MonoTime& now) = 0; |
100 | | |
101 | | // Checks the failure status of each tracked node as of 'now'. If a node meets |
102 | | // the failure criteria, its failure callback is invoked. |
103 | | virtual void CheckForFailures(const MonoTime& now) = 0; |
104 | | }; |
105 | | |
106 | | // A simple failure detector implementation that considers a node dead |
107 | | // when it has not reported within a certain time interval. |
108 | | class TimedFailureDetector : public FailureDetector { |
109 | | public: |
110 | | // A monitored entity and its tracking state (name, last-heard-of time, callback, status). |
111 | | struct Node { |
112 | | std::string permanent_name; |
113 | | MonoTime last_heard_of; |
114 | | FailureDetectedCallback callback; |
115 | | NodeStatus status; |
116 | | }; |
117 | | |
118 | | explicit TimedFailureDetector(MonoDelta failure_period); |
119 | | virtual ~TimedFailureDetector(); |
120 | | |
121 | | virtual CHECKED_STATUS Track(const std::string& name, |
122 | | const MonoTime& now, |
123 | | const FailureDetectedCallback& callback) override; |
124 | | |
125 | | virtual CHECKED_STATUS UnTrack(const std::string& name) override; |
126 | | |
127 | | virtual bool IsTracking(const std::string& name) override; |
128 | | |
129 | | virtual CHECKED_STATUS MessageFrom(const std::string& name, const MonoTime& now) override; |
130 | | |
131 | | virtual void CheckForFailures(const MonoTime& now) override; |
132 | | |
133 | | private: |
134 | | typedef std::unordered_map<std::string, Node*> NodeMap; |
135 | | |
136 | | // Returns the status (DEAD or ALIVE) of the node with 'name' as of 'now'. |
137 | | // Does not invoke the callback. NOTE(review): 'Unlocked' suggests the caller holds lock_ - confirm. |
138 | | FailureDetector::NodeStatus GetNodeStatusUnlocked(const std::string& name, |
139 | | const MonoTime& now); |
140 | | |
141 | | const MonoDelta failure_period_; |
142 | | mutable simple_spinlock lock_; |
143 | | NodeMap nodes_; |
144 | | |
145 | | DISALLOW_COPY_AND_ASSIGN(TimedFailureDetector); |
146 | | }; |
147 | | |
148 | | // A randomized failure monitor that wakes up at normally-distributed intervals |
149 | | // and runs CheckForFailures() on each failure detector it monitors. |
150 | | // |
151 | | // The wake-up interval is drawn from a normal distribution with the specified |
152 | | // mean and standard deviation, in milliseconds, with the minimum possible value |
153 | | // pinned at kMinWakeUpTimeMillis. |
154 | | // |
155 | | // We use a random wake-up interval to avoid thundering herd / lockstep problems |
156 | | // when multiple nodes react to the failure of another node. |
157 | | class RandomizedFailureMonitor { |
158 | | public: |
159 | | // The minimum time, in milliseconds, the failure monitor will wait between wake-ups. |
160 | | static const int64_t kMinWakeUpTimeMillis; |
161 | | |
162 | | RandomizedFailureMonitor(uint32_t random_seed, |
163 | | int64_t period_mean_millis, |
164 | | int64_t period_std_dev_millis); |
165 | | ~RandomizedFailureMonitor(); |
166 | | |
167 | | // Starts the failure monitor. |
168 | | CHECKED_STATUS Start(); |
169 | | |
170 | | // Stops the failure monitor. |
171 | | void Shutdown(); |
172 | | |
173 | | // Adds the failure detector 'fd' to be monitored, registered under 'name'. |
174 | | CHECKED_STATUS MonitorFailureDetector(const std::string& name, |
175 | | const scoped_refptr<FailureDetector>& fd); |
176 | | |
177 | | // Stops monitoring the failure detector registered under 'name'. |
178 | | CHECKED_STATUS UnmonitorFailureDetector(const std::string& name); |
179 | | |
180 | | private: |
181 | | typedef std::unordered_map<std::string, scoped_refptr<FailureDetector> > FDMap; |
182 | | |
183 | | // Runs the monitor thread. |
184 | | void RunThread(); |
185 | | |
186 | | // Mean and standard deviation, in milliseconds, of the random period to sleep |
187 | | // for between checks of the failure detectors. |
188 | | const int64_t period_mean_millis_; |
189 | | const int64_t period_stddev_millis_; |
190 | | ThreadSafeRandom random_; |
191 | | |
192 | | scoped_refptr<Thread> thread_; |
193 | | CountDownLatch run_latch_; |
194 | | |
195 | | mutable simple_spinlock lock_; |
196 | | FDMap fds_; |
197 | | bool shutdown_; // Whether the failure monitor should shut down. |
198 | | |
199 | | DISALLOW_COPY_AND_ASSIGN(RandomizedFailureMonitor); |
200 | | }; |
201 | | |
202 | | } // namespace yb |
203 | | |
204 | | #endif /* YB_UTIL_FAILURE_DETECTOR_H_ */ |