/Users/deen/code/yugabyte-db/src/yb/master/catalog_manager_util.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) YugaByte, Inc. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
4 | | // in compliance with the License. You may obtain a copy of the License at |
5 | | // |
6 | | // http://www.apache.org/licenses/LICENSE-2.0 |
7 | | // |
8 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
9 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
10 | | // or implied. See the License for the specific language governing permissions and limitations |
11 | | // under the License. |
12 | | // |
13 | | |
14 | | #include "yb/master/catalog_manager_util.h" |
15 | | |
16 | | #include "yb/master/catalog_entity_info.h" |
17 | | |
18 | | #include "yb/util/flag_tags.h" |
19 | | #include "yb/util/math_util.h" |
20 | | #include "yb/util/string_util.h" |
21 | | |
22 | | DEFINE_double(balancer_load_max_standard_deviation, 2.0, |
23 | | "The standard deviation among the tserver load, above which that distribution " |
24 | | "is considered not balanced."); |
25 | | TAG_FLAG(balancer_load_max_standard_deviation, advanced); |
26 | | |
27 | | namespace yb { |
28 | | namespace master { |
29 | | |
30 | | using strings::Substitute; |
31 | | |
32 | 7 | Status CatalogManagerUtil::IsLoadBalanced(const master::TSDescriptorVector& ts_descs) { |
33 | 7 | ZoneToDescMap zone_to_ts; |
34 | 7 | RETURN_NOT_OK(GetPerZoneTSDesc(ts_descs, &zone_to_ts)); |
35 | | |
36 | 12 | for (const auto& zone : zone_to_ts)7 { |
37 | 12 | if (zone.second.size() <= 1) { |
38 | 5 | continue; |
39 | 5 | } |
40 | | |
41 | | // Map from placement uuid to tserver load vector. |
42 | 7 | std::map<string, vector<double>> load; |
43 | 23 | for (const auto &ts_desc : zone.second) { |
44 | 23 | (load[ts_desc->placement_uuid()]).push_back(ts_desc->num_live_replicas()); |
45 | 23 | } |
46 | | |
47 | 9 | for (const auto& entry : load) { |
48 | 9 | double std_dev = yb::standard_deviation(entry.second); |
49 | 9 | LOG(INFO) << "Load standard deviation is " << std_dev << " for " |
50 | 9 | << entry.second.size() << " tservers in placement " << zone.first |
51 | 9 | << " for placement uuid " << entry.first; |
52 | | |
53 | 9 | if (std_dev >= FLAGS_balancer_load_max_standard_deviation) { |
54 | 2 | return STATUS(IllegalState, Substitute("Load not balanced: deviation=$0 in $1 for " |
55 | 2 | "placement uuid $2.", |
56 | 2 | std_dev, zone.first, entry.first)); |
57 | 2 | } |
58 | 9 | } |
59 | 7 | } |
60 | 5 | return Status::OK(); |
61 | 7 | } |
62 | | |
63 | | Status CatalogManagerUtil::AreLeadersOnPreferredOnly( |
64 | | const TSDescriptorVector& ts_descs, |
65 | | const ReplicationInfoPB& replication_info, |
66 | 156 | const vector<scoped_refptr<TableInfo>>& tables) { |
67 | 156 | if (PREDICT_FALSE(ts_descs.empty())) { |
68 | 0 | return Status::OK(); |
69 | 0 | } |
70 | | |
71 | | // Variables for checking transaction leader spread. |
72 | 156 | auto num_servers = ts_descs.size(); |
73 | 156 | std::map<std::string, int> txn_map; |
74 | 156 | int num_txn_tablets = 0; |
75 | 156 | int max_txn_leaders_per_node = 0; |
76 | 156 | int min_txn_leaders_per_node = 0; |
77 | | |
78 | 156 | if (!FLAGS_transaction_tables_use_preferred_zones) { |
79 | 156 | CalculateTxnLeaderMap(&txn_map, &num_txn_tablets, tables); |
80 | 156 | max_txn_leaders_per_node = num_txn_tablets / num_servers; |
81 | 156 | min_txn_leaders_per_node = max_txn_leaders_per_node; |
82 | 156 | if (num_txn_tablets % num_servers) { |
83 | 0 | ++max_txn_leaders_per_node; |
84 | 0 | } |
85 | 156 | } |
86 | | |
87 | | // If transaction_tables_use_preferred_zones = true, don't check for transaction leader spread. |
88 | | // This results in txn_map being empty, num_txn_tablets = 0, max_txn_leaders_per_node = 0, and |
89 | | // system_tablets_leaders = 0. |
90 | | // Thus all comparisons for transaction leader spread will be ignored (all 0 < 0, etc). |
91 | | |
92 | 340 | for (const auto& ts_desc : ts_descs) { |
93 | 340 | auto tserver = txn_map.find(ts_desc->permanent_uuid()); |
94 | 340 | int system_tablets_leaders = 0; |
95 | 340 | if (!(tserver == txn_map.end())) { |
96 | 4 | system_tablets_leaders = tserver->second; |
97 | 4 | } |
98 | | |
99 | | // If enabled, check if transaction tablet leaders are evenly spread. |
100 | 340 | if (system_tablets_leaders > max_txn_leaders_per_node) { |
101 | 0 | return STATUS( |
102 | 0 | IllegalState, |
103 | 0 | Substitute("Too many txn status leaders found on tserver $0. Found $1, Expected $2.", |
104 | 0 | ts_desc->permanent_uuid(), |
105 | 0 | system_tablets_leaders, |
106 | 0 | max_txn_leaders_per_node)); |
107 | 0 | } |
108 | 340 | if (system_tablets_leaders < min_txn_leaders_per_node) { |
109 | 0 | return STATUS( |
110 | 0 | IllegalState, |
111 | 0 | Substitute("Tserver $0 expected to have at least $1 txn status leader(s), but has $2.", |
112 | 0 | ts_desc->permanent_uuid(), |
113 | 0 | min_txn_leaders_per_node, |
114 | 0 | system_tablets_leaders)); |
115 | 0 | } |
116 | | |
117 | | // Check that leaders are on preferred ts only. |
118 | | // If transaction tables follow preferred nodes, then we verify that there are 0 leaders. |
119 | | // Otherwise, we need to check that there are 0 non-txn leaders on the ts. |
120 | 340 | if (!ts_desc->IsAcceptingLeaderLoad(replication_info) && |
121 | 340 | ts_desc->leader_count() > system_tablets_leaders212 ) { |
122 | | // This is a ts that shouldn't have leader load (asides from txn leaders) but does. |
123 | 134 | return STATUS( |
124 | 134 | IllegalState, |
125 | 134 | Substitute("Expected no leader load on tserver $0, found $1.", |
126 | 134 | ts_desc->permanent_uuid(), ts_desc->leader_count() - system_tablets_leaders)); |
127 | 134 | } |
128 | 340 | } |
129 | 22 | return Status::OK(); |
130 | 156 | } |
131 | | |
132 | | void CatalogManagerUtil::CalculateTxnLeaderMap(std::map<std::string, int>* txn_map, |
133 | | int* num_txn_tablets, |
134 | 156 | vector<scoped_refptr<TableInfo>> tables) { |
135 | 3.20k | for (const auto& table : tables) { |
136 | 3.20k | bool is_txn_table = table->GetTableType() == TRANSACTION_STATUS_TABLE_TYPE; |
137 | 3.20k | if (!is_txn_table) { |
138 | 3.20k | continue; |
139 | 3.20k | } |
140 | 1 | TabletInfos tablets = table->GetTablets(); |
141 | 1 | (*num_txn_tablets) += tablets.size(); |
142 | 24 | for (const auto& tablet : tablets) { |
143 | 24 | auto replication_locations = tablet->GetReplicaLocations(); |
144 | 150 | for (const auto& replica : *replication_locations) { |
145 | 150 | if (replica.second.role == PeerRole::LEADER) { |
146 | 24 | (*txn_map)[replica.first]++; |
147 | 24 | } |
148 | 150 | } |
149 | 24 | } |
150 | 1 | } |
151 | 156 | } |
152 | | |
153 | | Status CatalogManagerUtil::GetPerZoneTSDesc(const TSDescriptorVector& ts_descs, |
154 | 11 | ZoneToDescMap* zone_to_ts) { |
155 | 11 | if (zone_to_ts == nullptr) { |
156 | 0 | return STATUS(InvalidArgument, "Need a non-null zone to tsdesc map that will be filled in."); |
157 | 0 | } |
158 | 11 | zone_to_ts->clear(); |
159 | 45 | for (const auto& ts_desc : ts_descs) { |
160 | 45 | string placement_id = ts_desc->placement_id(); |
161 | 45 | auto iter = zone_to_ts->find(placement_id); |
162 | 45 | if (iter == zone_to_ts->end()) { |
163 | 20 | (*zone_to_ts)[placement_id] = {ts_desc}; |
164 | 25 | } else { |
165 | 25 | iter->second.push_back(ts_desc); |
166 | 25 | } |
167 | 45 | } |
168 | 11 | return Status::OK(); |
169 | 11 | } |
170 | | |
171 | 723 | bool CatalogManagerUtil::IsCloudInfoEqual(const CloudInfoPB& lhs, const CloudInfoPB& rhs) { |
172 | 723 | return (lhs.placement_cloud() == rhs.placement_cloud() && |
173 | 723 | lhs.placement_region() == rhs.placement_region()342 && |
174 | 723 | lhs.placement_zone() == rhs.placement_zone()338 ); |
175 | 723 | } |
176 | | |
177 | | bool CatalogManagerUtil::DoesPlacementInfoContainCloudInfo(const PlacementInfoPB& placement_info, |
178 | 3.58k | const CloudInfoPB& cloud_info) { |
179 | 3.58k | for (const auto& placement_block : placement_info.placement_blocks()) { |
180 | 195 | if (IsCloudInfoEqual(placement_block.cloud_info(), cloud_info)) { |
181 | 171 | return true; |
182 | 171 | } |
183 | 195 | } |
184 | 3.41k | return false; |
185 | 3.58k | } |
186 | | |
187 | | bool CatalogManagerUtil::DoesPlacementInfoSpanMultipleRegions( |
188 | 3.75k | const PlacementInfoPB& placement_info) { |
189 | 3.75k | int num_blocks = placement_info.placement_blocks_size(); |
190 | 3.75k | if (num_blocks < 2) { |
191 | 3.56k | return false; |
192 | 3.56k | } |
193 | 192 | const auto& first_block = placement_info.placement_blocks(0).cloud_info(); |
194 | 192 | for (int i = 1; i < num_blocks; ++i0 ) { |
195 | 9 | const auto& cur_block = placement_info.placement_blocks(i).cloud_info(); |
196 | 9 | if (first_block.placement_cloud() != cur_block.placement_cloud() || |
197 | 9 | first_block.placement_region() != cur_block.placement_region()0 ) { |
198 | 9 | return true; |
199 | 9 | } |
200 | 9 | } |
201 | 183 | return false; |
202 | 192 | } |
203 | | |
204 | | Result<std::string> CatalogManagerUtil::GetPlacementUuidFromRaftPeer( |
205 | 38 | const ReplicationInfoPB& replication_info, const consensus::RaftPeerPB& peer) { |
206 | 38 | switch (peer.member_type()) { |
207 | 1 | case consensus::PeerMemberType::PRE_VOTER: |
208 | 33 | case consensus::PeerMemberType::VOTER: { |
209 | | // This peer is a live replica. |
210 | 33 | return replication_info.live_replicas().placement_uuid(); |
211 | 1 | } |
212 | 2 | case consensus::PeerMemberType::PRE_OBSERVER: |
213 | 5 | case consensus::PeerMemberType::OBSERVER: { |
214 | | // This peer is a read replica. |
215 | 5 | std::vector<std::string> placement_uuid_matches; |
216 | 9 | for (const auto& placement_info : replication_info.read_replicas()) { |
217 | 9 | if (CatalogManagerUtil::DoesPlacementInfoContainCloudInfo( |
218 | 9 | placement_info, peer.cloud_info())) { |
219 | 5 | placement_uuid_matches.push_back(placement_info.placement_uuid()); |
220 | 5 | } |
221 | 9 | } |
222 | | |
223 | 5 | if (placement_uuid_matches.size() != 1) { |
224 | 2 | return STATUS(IllegalState, Format("Expect 1 placement match for peer $0, found $1: $2", |
225 | 2 | peer.ShortDebugString(), placement_uuid_matches.size(), |
226 | 2 | VectorToString(placement_uuid_matches))); |
227 | 2 | } |
228 | | |
229 | 3 | return placement_uuid_matches.front(); |
230 | 5 | } |
231 | 0 | case consensus::PeerMemberType::UNKNOWN_MEMBER_TYPE: { |
232 | 0 | return STATUS(IllegalState, Format("Member type unknown for peer $0", |
233 | 5 | peer.ShortDebugString())); |
234 | 5 | } |
235 | 0 | default: |
236 | 0 | return STATUS(IllegalState, "Unhandled raft state for peer $0", peer.ShortDebugString()); |
237 | 38 | } |
238 | 38 | } |
239 | | |
240 | | CHECKED_STATUS CatalogManagerUtil::CheckIfCanDeleteSingleTablet( |
241 | 62 | const scoped_refptr<TabletInfo>& tablet) { |
242 | 62 | static const auto stringify_partition_key = [](const Slice& key) { |
243 | 56 | return key.empty() ? "{empty}"8 : key.ToDebugString()48 ; |
244 | 56 | }; |
245 | 62 | const auto& tablet_id = tablet->tablet_id(); |
246 | | |
247 | 62 | const auto tablet_lock = tablet->LockForRead(); |
248 | 62 | const auto tablet_pb = tablet_lock.data().pb; |
249 | 62 | if (tablet_pb.state() == SysTabletsEntryPB::DELETED) { |
250 | 12 | return STATUS_FORMAT(NotFound, "Tablet $0 has been already deleted", tablet_id); |
251 | 12 | } |
252 | 50 | const auto partition = tablet_pb.partition(); |
253 | | |
254 | 50 | TabletInfos tablets_in_range; |
255 | 50 | VLOG(3) << "Tablet " << tablet_id << " " << AsString(partition)0 ; |
256 | 50 | tablet->table()->GetTabletsInRange( |
257 | 50 | partition.partition_key_start(), partition.partition_key_end(), &tablets_in_range); |
258 | | |
259 | 50 | std::string partition_key = partition.partition_key_start(); |
260 | 115 | for (const auto& inner_tablet : tablets_in_range) { |
261 | 115 | if (inner_tablet->tablet_id() == tablet_id) { |
262 | 16 | continue; |
263 | 16 | } |
264 | 99 | PartitionPB inner_partition; |
265 | 99 | SysTabletsEntryPB::State inner_tablet_state; |
266 | 99 | { |
267 | 99 | const auto inner_tablet_lock = inner_tablet->LockForRead(); |
268 | 99 | const auto& pb = inner_tablet_lock.data().pb; |
269 | 99 | inner_partition = pb.partition(); |
270 | 99 | inner_tablet_state = pb.state(); |
271 | 99 | } |
272 | 99 | VLOG(3) << "Inner tablet " << inner_tablet->tablet_id() |
273 | 0 | << " partition: " << AsString(inner_partition) |
274 | 0 | << " state: " << SysTabletsEntryPB_State_Name(inner_tablet_state); |
275 | 99 | if (inner_tablet_state != SysTabletsEntryPB::RUNNING) { |
276 | 20 | continue; |
277 | 20 | } |
278 | 79 | if (partition_key != inner_partition.partition_key_start()) { |
279 | 23 | return STATUS_FORMAT( |
280 | 23 | IllegalState, |
281 | 23 | "Can't delete tablet $0 not covered by child tablets. Partition gap: $1 ... $2", |
282 | 23 | tablet_id, |
283 | 23 | stringify_partition_key(partition_key), |
284 | 23 | stringify_partition_key(inner_partition.partition_key_start())); |
285 | 23 | } |
286 | 56 | partition_key = inner_partition.partition_key_end(); |
287 | 56 | if (!partition.partition_key_end().empty() && partition_key >= partition.partition_key_end()42 ) { |
288 | 13 | break; |
289 | 13 | } |
290 | 56 | } |
291 | 27 | if (partition_key != partition.partition_key_end()) { |
292 | 5 | return STATUS_FORMAT( |
293 | 5 | IllegalState, |
294 | 5 | "Can't delete tablet $0 not covered by child tablets. Partition gap: $1 ... $2", |
295 | 5 | tablet_id, |
296 | 5 | stringify_partition_key(partition_key), |
297 | 5 | stringify_partition_key(partition.partition_key_end())); |
298 | 5 | } |
299 | 22 | return Status::OK(); |
300 | 27 | } |
301 | | |
302 | | CatalogManagerUtil::CloudInfoSimilarity CatalogManagerUtil::ComputeCloudInfoSimilarity( |
303 | 4.97M | const CloudInfoPB& ci1, const CloudInfoPB& ci2) { |
304 | 4.97M | if (ci1.has_placement_cloud() && |
305 | 4.97M | ci2.has_placement_cloud() && |
306 | 4.97M | ci1.placement_cloud() != ci2.placement_cloud()) { |
307 | 124k | return NO_MATCH; |
308 | 124k | } |
309 | | |
310 | 4.84M | if (ci1.has_placement_region() && |
311 | 4.84M | ci2.has_placement_region() && |
312 | 4.84M | ci1.placement_region() != ci2.placement_region()) { |
313 | 855k | return CLOUD_MATCH; |
314 | 855k | } |
315 | | |
316 | 3.99M | if (ci1.has_placement_zone() && |
317 | 3.99M | ci2.has_placement_zone()3.77M && |
318 | 3.99M | ci1.placement_zone() != ci2.placement_zone()3.77M ) { |
319 | 2.24M | return REGION_MATCH; |
320 | 2.24M | } |
321 | 1.74M | return ZONE_MATCH; |
322 | 3.99M | } |
323 | | |
324 | 4.96M | bool CatalogManagerUtil::IsCloudInfoPrefix(const CloudInfoPB& ci1, const CloudInfoPB& ci2) { |
325 | 4.96M | return ComputeCloudInfoSimilarity(ci1, ci2) == ZONE_MATCH; |
326 | 4.96M | } |
327 | | |
328 | 134 | CHECKED_STATUS CatalogManagerUtil::IsPlacementInfoValid(const PlacementInfoPB& placement_info) { |
329 | | // Check for duplicates. |
330 | 134 | std::unordered_set<string> cloud_info_string; |
331 | | |
332 | 218 | for (int i = 0; i < placement_info.placement_blocks_size(); i++84 ) { |
333 | 84 | if (!placement_info.placement_blocks(i).has_cloud_info()) { |
334 | 1 | continue; |
335 | 1 | } |
336 | | |
337 | 83 | const CloudInfoPB& ci = placement_info.placement_blocks(i).cloud_info(); |
338 | 83 | string ci_string = TSDescriptor::generate_placement_id(ci); |
339 | | |
340 | 83 | if (!cloud_info_string.count(ci_string)) { |
341 | 83 | cloud_info_string.insert(ci_string); |
342 | 83 | } else { |
343 | 0 | return STATUS(IllegalState, |
344 | 0 | Substitute("Placement information specified should not contain duplicates." |
345 | 0 | "Given placement block: $0 isn't a prefix", ci.ShortDebugString())); |
346 | 0 | } |
347 | 83 | } |
348 | | |
349 | | // Validate the placement blocks to be prefixes. |
350 | 218 | for (int i = 0; 134 i < placement_info.placement_blocks_size(); i++84 ) { |
351 | 84 | if (!placement_info.placement_blocks(i).has_cloud_info()) { |
352 | 1 | continue; |
353 | 1 | } |
354 | | |
355 | 83 | const CloudInfoPB& pb = placement_info.placement_blocks(i).cloud_info(); |
356 | | |
357 | | // Four cases for pb to be a prefix. |
358 | 83 | bool contains_cloud = pb.has_placement_cloud(); |
359 | 83 | bool contains_region = pb.has_placement_region(); |
360 | 83 | bool contains_zone = pb.has_placement_zone(); |
361 | | // *.*.* |
362 | 83 | bool star_star_star = !contains_cloud && !contains_region1 && !contains_zone1 ; |
363 | | // C.*.* |
364 | 83 | bool c_star_star = contains_cloud && !contains_region82 && !contains_zone1 ; |
365 | | // C.R.* |
366 | 83 | bool c_r_star = contains_cloud && contains_region82 && !contains_zone81 ; |
367 | | // C.R.Z |
368 | 83 | bool c_r_z = contains_cloud && contains_region82 && contains_zone81 ; |
369 | | |
370 | 83 | if (!star_star_star && !c_star_star82 && !c_r_star81 && !c_r_z80 ) { |
371 | 0 | return STATUS(IllegalState, |
372 | 0 | Substitute("Placement information specified should be prefixes." |
373 | 0 | "Given placement block: $0 isn't a prefix", pb.ShortDebugString())); |
374 | 0 | } |
375 | 83 | } |
376 | | |
377 | | // No two prefixes should overlap. |
378 | 218 | for (int i = 0; 134 i < placement_info.placement_blocks_size(); i++84 ) { |
379 | 312 | for (int j = 0; j < placement_info.placement_blocks_size(); j++228 ) { |
380 | 228 | if (i == j) { |
381 | 84 | continue; |
382 | 144 | } else { |
383 | 144 | if (!placement_info.placement_blocks(i).has_cloud_info() || |
384 | 144 | !placement_info.placement_blocks(j).has_cloud_info()) { |
385 | 0 | continue; |
386 | 0 | } |
387 | | |
388 | 144 | const CloudInfoPB& pb1 = placement_info.placement_blocks(i).cloud_info(); |
389 | 144 | const CloudInfoPB& pb2 = placement_info.placement_blocks(j).cloud_info(); |
390 | | // pb1 shouldn't be prefix of pb2. |
391 | 144 | if (CatalogManagerUtil::IsCloudInfoPrefix(pb1, pb2)) { |
392 | 0 | return STATUS(IllegalState, |
393 | 0 | Substitute("Placement information specified should not overlap. $0 and" |
394 | 0 | " $1 overlap. For instance, c1.r1.z1,c1.r1 is invalid while " |
395 | 0 | "c1.r1.z1,c1.r1.z2 is valid. Also note that c1.r1,c1.r1 is valid.", |
396 | 0 | pb1.ShortDebugString(), pb2.ShortDebugString())); |
397 | 0 | } |
398 | 144 | } |
399 | 228 | } |
400 | 84 | } |
401 | 134 | return Status::OK(); |
402 | 134 | } |
403 | | |
404 | 499k | bool CMPerTableLoadState::CompareLoads(const TabletServerId &ts1, const TabletServerId &ts2) { |
405 | 499k | if (per_ts_load_[ts1] != per_ts_load_[ts2]) { |
406 | 194k | return per_ts_load_[ts1] < per_ts_load_[ts2]; |
407 | 194k | } |
408 | 304k | if (global_load_state_->GetGlobalLoad(ts1) == global_load_state_->GetGlobalLoad(ts2)) { |
409 | 299k | return ts1 < ts2; |
410 | 299k | } |
411 | 4.53k | return global_load_state_->GetGlobalLoad(ts1) < global_load_state_->GetGlobalLoad(ts2); |
412 | 304k | } |
413 | | |
414 | 161k | void CMPerTableLoadState::SortLoad() { |
415 | 161k | Comparator comp(this); |
416 | 161k | std::sort(sorted_load_.begin(), sorted_load_.end(), comp); |
417 | 161k | } |
418 | | |
419 | | } // namespace master |
420 | | } // namespace yb |