/Users/deen/code/yugabyte-db/src/yb/tools/ysck.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // |
18 | | // The following only applies to changes made to this file as part of YugaByte development. |
19 | | // |
20 | | // Portions Copyright (c) YugaByte, Inc. |
21 | | // |
22 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
23 | | // in compliance with the License. You may obtain a copy of the License at |
24 | | // |
25 | | // http://www.apache.org/licenses/LICENSE-2.0 |
26 | | // |
27 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
28 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
29 | | // or implied. See the License for the specific language governing permissions and limitations |
30 | | // under the License. |
31 | | // |
32 | | |
33 | | #include "yb/tools/ysck.h" |
34 | | |
35 | | #include <mutex> |
36 | | #include <unordered_set> |
37 | | |
38 | | #include <glog/logging.h> |
39 | | |
40 | | #include "yb/gutil/bind.h" |
41 | | #include "yb/gutil/map-util.h" |
42 | | #include "yb/gutil/ref_counted.h" |
43 | | #include "yb/gutil/strings/join.h" |
44 | | #include "yb/gutil/strings/substitute.h" |
45 | | |
46 | | #include "yb/util/blocking_queue.h" |
47 | | #include "yb/util/countdown_latch.h" |
48 | | #include "yb/util/locks.h" |
49 | | #include "yb/util/monotime.h" |
50 | | |
51 | | namespace yb { |
52 | | namespace tools { |
53 | | |
54 | | using std::ostream; |
55 | | using std::shared_ptr; |
56 | | using std::string; |
57 | | using std::unordered_map; |
58 | | using strings::Substitute; |
59 | | |
60 | | DEFINE_int32(checksum_timeout_sec, 120, |
61 | | "Maximum total seconds to wait for a checksum scan to complete " |
62 | | "before timing out."); |
63 | | DEFINE_int32(checksum_scan_concurrency, 4, |
64 | | "Number of concurrent checksum scans to execute per tablet server."); |
65 | | |
66 | | ChecksumOptions::ChecksumOptions() |
67 | | : timeout(MonoDelta::FromSeconds(FLAGS_checksum_timeout_sec)), |
68 | 33 | scan_concurrency(FLAGS_checksum_scan_concurrency) {} |
69 | | |
70 | | ChecksumOptions::ChecksumOptions(MonoDelta timeout, int scan_concurrency) |
71 | | : timeout(std::move(timeout)), |
72 | 0 | scan_concurrency(scan_concurrency) {} |
73 | | |
74 | 0 | string YsckTable::ToString() const { |
75 | 0 | return Format( |
76 | 0 | "id: $0 name: $1 schema: $2 num_replicas: $3 table_type: $4", |
77 | 0 | id_, |
78 | 0 | name_, |
79 | 0 | schema_, |
80 | 0 | num_replicas_, |
81 | 0 | yb::TableType_Name(table_type_)); |
82 | 0 | } |
83 | | |
84 | 1.19k | YsckCluster::~YsckCluster() { |
85 | 1.19k | } |
86 | | |
87 | 1.18k | Status YsckCluster::FetchTableAndTabletInfo() { |
88 | 1.18k | RETURN_NOT_OK(master_->Connect()); |
89 | 1.15k | RETURN_NOT_OK(RetrieveTablesList()); |
90 | 997 | RETURN_NOT_OK(RetrieveTabletServers()); |
91 | 17.9k | for (const shared_ptr<YsckTable>& table : tables())997 { |
92 | 17.9k | RETURN_NOT_OK(RetrieveTabletsList(table)); |
93 | 17.9k | } |
94 | 991 | return Status::OK(); |
95 | 997 | } |
96 | | |
97 | | // Gets the list of tablet servers from the Master. |
98 | 997 | Status YsckCluster::RetrieveTabletServers() { |
99 | 997 | return master_->RetrieveTabletServers(&tablet_servers_); |
100 | 997 | } |
101 | | |
102 | | // Gets the list of tables from the Master. |
103 | 1.15k | Status YsckCluster::RetrieveTablesList() { |
104 | 1.15k | return master_->RetrieveTablesList(&tables_); |
105 | 1.15k | } |
106 | | |
107 | 17.9k | Status YsckCluster::RetrieveTabletsList(const shared_ptr<YsckTable>& table) { |
108 | 17.9k | return master_->RetrieveTabletsList(table); |
109 | 17.9k | } |
110 | | |
111 | 1.19k | Status Ysck::CheckMasterRunning() { |
112 | 1.19k | VLOG(1) << "Connecting to the Master"0 ; |
113 | 1.19k | Status s = cluster_->master()->Connect(); |
114 | 1.19k | if (s.ok()) { |
115 | 1.18k | LOG(INFO) << "Connected to the Master"; |
116 | 1.18k | } |
117 | 1.19k | return s; |
118 | 1.19k | } |
119 | | |
120 | 1.18k | Status Ysck::FetchTableAndTabletInfo() { |
121 | 1.18k | return cluster_->FetchTableAndTabletInfo(); |
122 | 1.18k | } |
123 | | |
124 | 991 | Status Ysck::CheckTabletServersRunning() { |
125 | 991 | VLOG(1) << "Getting the Tablet Servers list"0 ; |
126 | 991 | auto servers_count = cluster_->tablet_servers().size(); |
127 | 991 | VLOG(1) << Substitute("List of $0 Tablet Servers retrieved", servers_count)0 ; |
128 | | |
129 | 991 | if (servers_count == 0) { |
130 | 0 | return STATUS(NotFound, "No tablet servers found"); |
131 | 0 | } |
132 | | |
133 | 991 | size_t bad_servers = 0; |
134 | 991 | VLOG(1) << "Connecting to all the Tablet Servers"0 ; |
135 | 2.95k | for (const YsckMaster::TSMap::value_type& entry : cluster_->tablet_servers()) { |
136 | 2.95k | Status s = ConnectToTabletServer(entry.second); |
137 | 2.95k | if (!s.ok()) { |
138 | 653 | bad_servers++; |
139 | 653 | } |
140 | 2.95k | } |
141 | 991 | if (bad_servers == 0) { |
142 | 670 | LOG(INFO) << Substitute("Connected to all $0 Tablet Servers", servers_count); |
143 | 670 | return Status::OK(); |
144 | 670 | } else { |
145 | 321 | LOG(WARNING) << Substitute("Connected to $0 Tablet Servers, $1 weren't reachable", |
146 | 321 | servers_count - bad_servers, bad_servers); |
147 | 321 | return STATUS(NetworkError, "Not all Tablet Servers are reachable"); |
148 | 321 | } |
149 | 991 | } |
150 | | |
151 | 2.95k | Status Ysck::ConnectToTabletServer(const shared_ptr<YsckTabletServer>& ts) { |
152 | 2.95k | VLOG(1) << "Going to connect to Tablet Server: " << ts->uuid()0 ; |
153 | 2.95k | Status s = ts->Connect(); |
154 | 2.95k | if (s.ok()) { |
155 | 2.29k | VLOG(1) << "Connected to Tablet Server: " << ts->uuid()0 ; |
156 | 2.29k | } else { |
157 | 653 | LOG(WARNING) << Substitute("Unable to connect to Tablet Server $0 because $1", |
158 | 653 | ts->uuid(), s.ToString()); |
159 | 653 | } |
160 | 2.95k | return s; |
161 | 2.95k | } |
162 | | |
163 | 669 | Status Ysck::CheckTablesConsistency() { |
164 | 669 | VLOG(1) << "Getting the tables list"0 ; |
165 | 669 | auto tables_count = cluster_->tables().size(); |
166 | 669 | VLOG(1) << Substitute("List of $0 tables retrieved", tables_count)0 ; |
167 | | |
168 | 669 | if (tables_count == 0) { |
169 | 1 | LOG(INFO) << "The cluster doesn't have any tables"; |
170 | 1 | return Status::OK(); |
171 | 1 | } |
172 | | |
173 | 668 | VLOG(1) << "Verifying each table"0 ; |
174 | 668 | size_t bad_tables_count = 0; |
175 | 12.1k | for (const shared_ptr<YsckTable> &table : cluster_->tables()) { |
176 | 12.1k | if (!VerifyTable(table)) { |
177 | 8 | bad_tables_count++; |
178 | 8 | } |
179 | 12.1k | } |
180 | 668 | if (bad_tables_count == 0) { |
181 | 660 | LOG(INFO) << Substitute("The metadata for $0 tables is HEALTHY", tables_count); |
182 | 660 | return Status::OK(); |
183 | 660 | } else { |
184 | 8 | LOG(WARNING) << Substitute("$0 out of $1 tables are not in a healthy state", |
185 | 8 | bad_tables_count, tables_count); |
186 | 8 | return STATUS(Corruption, Substitute("$0 tables are bad", bad_tables_count)); |
187 | 8 | } |
188 | 668 | } |
189 | | |
190 | | // Class to act as a collector of scan results. |
191 | | // Provides thread-safe accessors to update and read a hash table of results. |
192 | | class ChecksumResultReporter : public RefCountedThreadSafe<ChecksumResultReporter> { |
193 | | public: |
194 | | typedef std::pair<Status, uint64_t> ResultPair; |
195 | | typedef std::vector<ResultPair> TableResults; |
196 | | typedef std::unordered_map<std::string, TableResults> ReplicaResultMap; |
197 | | typedef std::unordered_map<std::string, ReplicaResultMap> TabletResultMap; |
198 | | |
199 | | // Initialize reporter with the number of replicas being queried. |
200 | | explicit ChecksumResultReporter(int num_tablet_replicas) |
201 | 658 | : responses_(num_tablet_replicas) { |
202 | 658 | } |
203 | | |
204 | | // Write an entry to the result map indicating a response from the remote. |
205 | | void ReportResult(const std::string& tablet_id, |
206 | | const std::string& replica_uuid, |
207 | | const Status& status, |
208 | 4.37k | uint64_t checksum) { |
209 | 4.37k | std::lock_guard<simple_spinlock> guard(lock_); |
210 | 4.37k | unordered_map<string, TableResults>& replica_results = |
211 | 4.37k | LookupOrInsert(&checksums_, tablet_id, unordered_map<string, TableResults>()); |
212 | 4.37k | if (replica_results.find(replica_uuid) == replica_results.end()) { |
213 | 4.37k | TableResults table_results(1, ResultPair(status, checksum)); |
214 | 4.37k | replica_results[replica_uuid] = table_results; |
215 | 4.37k | } else { |
216 | 0 | replica_results[replica_uuid].push_back(ResultPair(status, checksum)); |
217 | 0 | } |
218 | 4.37k | responses_.CountDown(); |
219 | 4.37k | } |
220 | | |
221 | | // Blocks until either the number of results plus errors reported equals |
222 | | // num_tablet_replicas (from the constructor), or until the timeout expires, |
223 | | // whichever comes first. |
224 | | // Returns false if the timeout expired before all responses came in. |
225 | | // Otherwise, returns true. |
226 | 658 | bool WaitFor(const MonoDelta& timeout) const { return responses_.WaitFor(timeout); } |
227 | | |
228 | | // Returns true iff all replicas have reported in. |
229 | 0 | bool AllReported() const { return responses_.count() == 0; } |
230 | | |
231 | | // Get reported results. |
232 | 658 | TabletResultMap checksums() const { |
233 | 658 | std::lock_guard<simple_spinlock> guard(lock_); |
234 | 658 | return checksums_; |
235 | 658 | } |
236 | | |
237 | | private: |
238 | | friend class RefCountedThreadSafe<ChecksumResultReporter>; |
239 | 658 | ~ChecksumResultReporter() {} |
240 | | |
241 | | // Report either a success or error response. |
242 | | void HandleResponse(const std::string& tablet_id, const std::string& replica_uuid, |
243 | | const Status& status, uint64_t checksum); |
244 | | |
245 | | CountDownLatch responses_; |
246 | | mutable simple_spinlock lock_; // Protects 'checksums_'. |
247 | | // checksums_ is an unordered_map of { tablet_id : { replica_uuid : checksum } }. |
248 | | TabletResultMap checksums_; |
249 | | }; |
250 | | |
251 | | // Queue of tablet replicas for an individual tablet server. |
252 | | typedef shared_ptr<BlockingQueue<std::pair<Schema, std::string> > > TabletQueue; |
253 | | |
254 | | // A callback function which records the result of a tablet replica's checksum, |
255 | | // and then checks if the tablet server has any more tablets to checksum. If so, |
256 | | // a new async checksum scan is started. |
257 | | void TabletServerChecksumCallback( |
258 | | const scoped_refptr<ChecksumResultReporter>& reporter, |
259 | | const shared_ptr<YsckTabletServer>& tablet_server, |
260 | | const TabletQueue& queue, |
261 | | const TabletId& tablet_id, |
262 | | const ChecksumOptions& options, |
263 | | const Status& status, |
264 | 4.37k | uint64_t checksum) { |
265 | 4.37k | reporter->ReportResult(tablet_id, tablet_server->uuid(), status, checksum); |
266 | | |
267 | 4.37k | std::pair<Schema, TabletId> table_tablet; |
268 | 4.37k | if (queue->BlockingGet(&table_tablet)) { |
269 | 16 | const Schema& table_schema = table_tablet.first; |
270 | 16 | const TabletId& tablet_id = table_tablet.second; |
271 | 16 | ReportResultCallback callback = Bind(&TabletServerChecksumCallback, |
272 | 16 | reporter, |
273 | 16 | tablet_server, |
274 | 16 | queue, |
275 | 16 | tablet_id, |
276 | 16 | options); |
277 | 16 | tablet_server->RunTabletChecksumScanAsync(tablet_id, table_schema, options, callback); |
278 | 16 | } |
279 | 4.37k | } |
280 | | |
281 | | Status Ysck::ChecksumData(const vector<string>& tables, |
282 | | const vector<string>& tablets, |
283 | 658 | const ChecksumOptions& opts) { |
284 | 658 | const std::unordered_set<std::string> tables_filter(tables.begin(), tables.end()); |
285 | 658 | const std::unordered_set<std::string> tablets_filter(tablets.begin(), tablets.end()); |
286 | | |
287 | | // Copy options so that local modifications can be made and passed on. |
288 | 658 | ChecksumOptions options = opts; |
289 | | |
290 | 658 | using TabletTableMap = std::unordered_map< |
291 | 658 | std::shared_ptr<YsckTablet>, std::vector<std::shared_ptr<YsckTable>>>; |
292 | 658 | TabletTableMap tablet_table_map; |
293 | | |
294 | 658 | int num_tablet_replicas = 0; |
295 | 658 | bool there_are_non_system_tables = false; |
296 | 11.9k | for (const shared_ptr<YsckTable>& table : cluster_->tables()) { |
297 | 11.9k | if (table->name().is_system()) { |
298 | | // Skip the system namespace with virtual tables, since they are not assigned to tservers. |
299 | 11.3k | continue; |
300 | 11.3k | } |
301 | | |
302 | | // TODO: remove once we have is_system() implemented correctly for PostgreSQL tables. |
303 | 658 | if (table->table_type() == PGSQL_TABLE_TYPE) |
304 | 0 | continue; |
305 | | |
306 | 658 | there_are_non_system_tables = true; |
307 | 658 | VLOG(1) << "Table: " << table->name().ToString()0 ; |
308 | 658 | if (!tables_filter.empty() && !ContainsKey(tables_filter, table->name().table_name())0 ) continue0 ; |
309 | | // TODO: remove once we have scan implemented for Redis. |
310 | 658 | if (table->table_type() == REDIS_TABLE_TYPE) continue0 ; |
311 | 1.46k | for (const shared_ptr<YsckTablet>& tablet : table->tablets())658 { |
312 | 1.46k | VLOG(1) << "Tablet: " << tablet->id()0 ; |
313 | 1.46k | if (!tablets_filter.empty() && !ContainsKey(tablets_filter, tablet->id())0 ) continue0 ; |
314 | 1.46k | if (tablet_table_map.find(tablet) == tablet_table_map.end()) { |
315 | 1.46k | tablet_table_map[tablet] = std::vector<shared_ptr<YsckTable>>(1, table); |
316 | 1.46k | } else { |
317 | 0 | tablet_table_map[tablet].push_back(table); |
318 | 0 | } |
319 | 1.46k | num_tablet_replicas += tablet->replicas().size(); |
320 | 1.46k | } |
321 | 658 | } |
322 | | // Number of tablet replicas can be zero if there are no user tables available. |
323 | 658 | if (there_are_non_system_tables && num_tablet_replicas == 0) { |
324 | 0 | string msg = "No tablet replicas found."; |
325 | 0 | if (!tables.empty() || !tablets.empty()) { |
326 | 0 | msg += " Filter: "; |
327 | 0 | if (!tables.empty()) { |
328 | 0 | msg += "tables=" + JoinStrings(tables, ",") + "."; |
329 | 0 | } |
330 | 0 | if (!tablets.empty()) { |
331 | 0 | msg += "tablets=" + JoinStrings(tablets, ",") + "."; |
332 | 0 | } |
333 | 0 | } |
334 | 0 | return STATUS(NotFound, msg); |
335 | 0 | } |
336 | | |
337 | | // Map of tablet servers to tablet queue. |
338 | 658 | typedef unordered_map<shared_ptr<YsckTabletServer>, TabletQueue> TabletServerQueueMap; |
339 | | |
340 | 658 | TabletServerQueueMap tablet_server_queues; |
341 | 658 | scoped_refptr<ChecksumResultReporter> reporter(new ChecksumResultReporter(num_tablet_replicas)); |
342 | | |
343 | | // Create a queue of checksum callbacks grouped by the tablet server. |
344 | 1.46k | for (const TabletTableMap::value_type& entry : tablet_table_map) { |
345 | 1.46k | const shared_ptr<YsckTablet>& tablet = entry.first; |
346 | 1.46k | for (const shared_ptr<YsckTable>& table : entry.second) { |
347 | 4.37k | for (const shared_ptr<YsckTabletReplica>& replica : tablet->replicas()) { |
348 | 4.37k | const shared_ptr<YsckTabletServer>& ts = |
349 | 4.37k | FindOrDie(cluster_->tablet_servers(), replica->ts_uuid()); |
350 | | |
351 | 4.37k | const TabletQueue& queue = |
352 | 4.37k | LookupOrInsertNewSharedPtr(&tablet_server_queues, ts, num_tablet_replicas); |
353 | 4.37k | CHECK_EQ(QUEUE_SUCCESS, queue->Put(make_pair(table->schema(), tablet->id()))); |
354 | 4.37k | } |
355 | 1.46k | } |
356 | 1.46k | } |
357 | | |
358 | | // Kick off checksum scans in parallel. For each tablet server, we start |
359 | | // scan_concurrency scans. Each callback then initiates one additional |
360 | | // scan when it returns if the queue for that TS is not empty. |
361 | 1.96k | for (const TabletServerQueueMap::value_type& entry : tablet_server_queues) { |
362 | 1.96k | const shared_ptr<YsckTabletServer>& tablet_server = entry.first; |
363 | 1.96k | const TabletQueue& queue = entry.second; |
364 | 1.96k | queue->Shutdown(); // Ensures that BlockingGet() will not block. |
365 | 9.83k | for (int i = 0; i < options.scan_concurrency; i++7.86k ) { |
366 | 7.86k | std::pair<Schema, std::string> table_tablet; |
367 | 7.86k | if (queue->BlockingGet(&table_tablet)) { |
368 | 4.36k | const Schema& table_schema = table_tablet.first; |
369 | 4.36k | const std::string& tablet_id = table_tablet.second; |
370 | 4.36k | ReportResultCallback callback = Bind(&TabletServerChecksumCallback, |
371 | 4.36k | reporter, |
372 | 4.36k | tablet_server, |
373 | 4.36k | queue, |
374 | 4.36k | tablet_id, |
375 | 4.36k | options); |
376 | 4.36k | tablet_server->RunTabletChecksumScanAsync(tablet_id, table_schema, options, callback); |
377 | 4.36k | } |
378 | 7.86k | } |
379 | 1.96k | } |
380 | | |
381 | 658 | bool timed_out = false; |
382 | 658 | if (!reporter->WaitFor(options.timeout)) { |
383 | 0 | timed_out = true; |
384 | 0 | } |
385 | 658 | ChecksumResultReporter::TabletResultMap checksums = reporter->checksums(); |
386 | | |
387 | 658 | int num_errors = 0; |
388 | 658 | int num_mismatches = 0; |
389 | 658 | int num_results = 0; |
390 | 11.9k | for (const shared_ptr<YsckTable>& table : cluster_->tables()) { |
391 | 11.9k | bool printed_table_name = false; |
392 | 13.0k | for (const shared_ptr<YsckTablet>& tablet : table->tablets()) { |
393 | 13.0k | if (ContainsKey(checksums, tablet->id())) { |
394 | 1.46k | if (!printed_table_name) { |
395 | 658 | printed_table_name = true; |
396 | 658 | LOG(INFO) << "-----------------------"; |
397 | 658 | LOG(INFO) << table->name().ToString(); |
398 | 658 | LOG(INFO) << "-----------------------"; |
399 | 658 | } |
400 | 1.46k | bool seen_first_replica = false; |
401 | 1.46k | uint64_t first_checksum = 0; |
402 | | |
403 | 1.46k | for (const ChecksumResultReporter::ReplicaResultMap::value_type& r : |
404 | 4.37k | FindOrDie(checksums, tablet->id())) { |
405 | 4.37k | const string& replica_uuid = r.first; |
406 | | |
407 | 4.37k | shared_ptr<YsckTabletServer> ts = FindOrDie(cluster_->tablet_servers(), replica_uuid); |
408 | 4.37k | for (const ChecksumResultReporter::ResultPair& result : r.second) { |
409 | 4.37k | const Status &status = result.first; |
410 | 4.37k | uint64_t checksum = result.second; |
411 | 4.37k | string status_str = (status.ok()) ? Substitute("Checksum: $0", checksum)3.30k |
412 | 4.37k | : Substitute("Error: $0", status.ToString())1.07k ; |
413 | 4.37k | LOG(INFO) << Substitute("T $0 P $1 ($2): $3", |
414 | 4.37k | tablet->id(), ts->uuid(), ts->address(), status_str); |
415 | 4.37k | if (!status.ok()) { |
416 | 1.07k | num_errors++; |
417 | 3.30k | } else if (!seen_first_replica) { |
418 | 1.30k | seen_first_replica = true; |
419 | 1.30k | first_checksum = checksum; |
420 | 2.00k | } else if (checksum != first_checksum) { |
421 | 0 | num_mismatches++; |
422 | 0 | LOG(ERROR) << ">> Mismatch found in table " << table->name().ToString() |
423 | 0 | << " tablet " << tablet->id(); |
424 | 0 | } |
425 | 4.37k | } |
426 | 4.37k | num_results++; |
427 | 4.37k | } |
428 | 1.46k | } |
429 | 13.0k | } |
430 | 11.9k | if (printed_table_name) LOG(INFO) << ""658 ; |
431 | 11.9k | } |
432 | 658 | if (num_results != num_tablet_replicas) { |
433 | 0 | CHECK(timed_out) << Substitute("Unexpected error: only got $0 out of $1 replica results", |
434 | 0 | num_results, num_tablet_replicas); |
435 | 0 | return STATUS(TimedOut, Substitute("Checksum scan did not complete within the timeout of $0: " |
436 | 0 | "Received results for $1 out of $2 expected replicas", |
437 | 0 | options.timeout.ToString(), num_results, |
438 | 0 | num_tablet_replicas)); |
439 | 0 | } |
440 | 658 | if (num_mismatches != 0) { |
441 | 0 | return STATUS(Corruption, Substitute("$0 checksum mismatches were detected", num_mismatches)); |
442 | 0 | } |
443 | 658 | if (num_errors != 0) { |
444 | 630 | return STATUS(Aborted, Substitute("$0 errors were detected", num_errors)); |
445 | 630 | } |
446 | | |
447 | 28 | return Status::OK(); |
448 | 658 | } |
449 | | |
450 | 12.1k | bool Ysck::VerifyTable(const shared_ptr<YsckTable>& table) { |
451 | 12.1k | bool good_table = true; |
452 | 12.1k | vector<shared_ptr<YsckTablet> > tablets = table->tablets(); |
453 | 12.1k | auto tablets_count = tablets.size(); |
454 | 12.1k | if (tablets_count == 0) { |
455 | 0 | LOG(WARNING) << Substitute("Table $0 has 0 tablets", table->name().ToString()); |
456 | 0 | return false; |
457 | 0 | } |
458 | 12.1k | int table_num_replicas = table->num_replicas(); |
459 | 12.1k | VLOG(1) << Substitute("Verifying $0 tablets for table $1 configured with num_replicas = $2", |
460 | 0 | tablets_count, table->name().ToString(), table_num_replicas); |
461 | 12.1k | size_t bad_tablets_count = 0; |
462 | | // TODO check if the tablets are contiguous and in order. |
463 | 13.1k | for (const shared_ptr<YsckTablet> &tablet : tablets) { |
464 | 13.1k | if (!VerifyTablet(tablet, table_num_replicas)) { |
465 | 8 | bad_tablets_count++; |
466 | 8 | } |
467 | 13.1k | } |
468 | 12.1k | if (bad_tablets_count == 0) { |
469 | 12.0k | LOG(INFO) << Substitute("Table $0 is HEALTHY", table->name().ToString()); |
470 | 12.0k | } else { |
471 | 8 | LOG(WARNING) << Substitute("Table $0 has $1 bad tablets", |
472 | 8 | table->name().ToString(), bad_tablets_count); |
473 | 8 | good_table = false; |
474 | 8 | } |
475 | 12.1k | return good_table; |
476 | 12.1k | } |
477 | | |
478 | 13.1k | bool Ysck::VerifyTablet(const shared_ptr<YsckTablet>& tablet, size_t table_num_replicas) { |
479 | 13.1k | vector<shared_ptr<YsckTabletReplica> > replicas = tablet->replicas(); |
480 | 13.1k | bool good_tablet = true; |
481 | 13.1k | if (replicas.size() != table_num_replicas) { |
482 | 13.1k | LOG(WARNING) << Substitute("Tablet $0 has $1 instead of $2 replicas", |
483 | 13.1k | tablet->id(), replicas.size(), table_num_replicas); |
484 | | // We only fail the "goodness" check if the tablet is under-replicated. |
485 | 13.1k | if (replicas.size() < table_num_replicas) { |
486 | 1 | good_tablet = false; |
487 | 1 | } |
488 | 13.1k | } |
489 | 13.1k | int leaders_count = 0; |
490 | 13.1k | int followers_count = 0; |
491 | 16.9k | for (const shared_ptr<YsckTabletReplica>& replica : replicas) { |
492 | 16.9k | if (replica->is_leader()) { |
493 | 13.1k | VLOG(1) << Substitute("Replica at $0 is a LEADER", replica->ts_uuid())0 ; |
494 | 13.1k | leaders_count++; |
495 | 13.1k | } else if (3.73k replica->is_follower()3.73k ) { |
496 | 3.73k | VLOG(1) << Substitute("Replica at $0 is a FOLLOWER", replica->ts_uuid())0 ; |
497 | 3.73k | followers_count++; |
498 | 3.73k | } |
499 | 16.9k | } |
500 | 13.1k | if (leaders_count == 0) { |
501 | 8 | LOG(WARNING) << Format("Tablet $0 doesn't have a leader, replicas: $1", tablet->id(), replicas); |
502 | 8 | good_tablet = false; |
503 | 8 | } |
504 | 13.1k | VLOG(1) << Substitute("Tablet $0 has $1 leader and $2 followers", |
505 | 0 | tablet->id(), leaders_count, followers_count); |
506 | 13.1k | return good_tablet; |
507 | 13.1k | } |
508 | | |
509 | 0 | Status Ysck::CheckAssignments() { |
510 | | // TODO |
511 | 0 | return STATUS(NotSupported, "CheckAssignments hasn't been implemented"); |
512 | 0 | } |
513 | | |
514 | 9 | std::string YsckTabletReplica::ToString() const { |
515 | 9 | return YB_CLASS_TO_STRING(is_leader, is_follower, ts_uuid); |
516 | 9 | } |
517 | | |
518 | | } // namespace tools |
519 | | } // namespace yb |