/Users/deen/code/yugabyte-db/src/yb/master/catalog_manager.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // |
18 | | // The following only applies to changes made to this file as part of YugaByte development. |
19 | | // |
20 | | // Portions Copyright (c) YugaByte, Inc. |
21 | | // |
22 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
23 | | // in compliance with the License. You may obtain a copy of the License at |
24 | | // |
25 | | // http://www.apache.org/licenses/LICENSE-2.0 |
26 | | // |
27 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
28 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
29 | | // or implied. See the License for the specific language governing permissions and limitations |
30 | | // under the License. |
31 | | // |
32 | | // ================================================================================================ |
33 | | // |
34 | | // The catalog manager handles the current list of tables |
35 | | // and tablets in the cluster, as well as their current locations. |
36 | | // Since most operations in the master go through these data |
37 | | // structures, locking is carefully managed here to prevent unnecessary |
38 | | // contention and deadlocks: |
39 | | // |
40 | | // - each structure has an internal spinlock used for operations that |
41 | | // are purely in-memory (eg the current status of replicas) |
42 | | // - data that is persisted on disk is stored in separate PersistentTable(t)Info |
43 | | // structs. These are managed using copy-on-write so that writers may block |
44 | | // writing them back to disk while not impacting concurrent readers. |
45 | | // |
46 | | // Usage rules: |
47 | | // - You may obtain READ locks in any order. READ locks should never block, |
48 | | // since they only conflict with COMMIT which is a purely in-memory operation. |
49 | | // Thus they are deadlock-free. |
50 | | // - If you need a WRITE lock on both a table and one or more of its tablets, |
51 | | // acquire the lock on the table first. This strict ordering prevents deadlocks. |
52 | | // |
53 | | // ================================================================================================ |
54 | | |
55 | | #include "yb/master/catalog_manager.h" |
56 | | |
57 | | #include <stdlib.h> |
58 | | |
59 | | #include <algorithm> |
60 | | #include <atomic> |
61 | | #include <bitset> |
62 | | #include <functional> |
63 | | #include <memory> |
64 | | #include <mutex> |
65 | | #include <set> |
66 | | #include <string> |
67 | | #include <unordered_map> |
68 | | #include <vector> |
69 | | |
70 | | #include <boost/optional.hpp> |
71 | | #include <glog/logging.h> |
72 | | |
73 | | #include "yb/client/client-internal.h" |
74 | | #include "yb/client/client.h" |
75 | | #include "yb/client/schema.h" |
76 | | #include "yb/client/universe_key_client.h" |
77 | | |
78 | | #include "yb/common/common.pb.h" |
79 | | #include "yb/common/common_flags.h" |
80 | | #include "yb/common/constants.h" |
81 | | #include "yb/common/key_encoder.h" |
82 | | #include "yb/common/partial_row.h" |
83 | | #include "yb/common/partition.h" |
84 | | #include "yb/common/ql_type.h" |
85 | | #include "yb/common/roles_permissions.h" |
86 | | #include "yb/common/schema.h" |
87 | | #include "yb/common/wire_protocol.h" |
88 | | |
89 | | #include "yb/consensus/consensus.h" |
90 | | #include "yb/consensus/consensus.pb.h" |
91 | | #include "yb/consensus/consensus_util.h" |
92 | | #include "yb/consensus/metadata.pb.h" |
93 | | #include "yb/consensus/opid_util.h" |
94 | | #include "yb/consensus/quorum_util.h" |
95 | | |
96 | | #include "yb/docdb/doc_key.h" |
97 | | |
98 | | #include "yb/gutil/atomicops.h" |
99 | | #include "yb/gutil/bind.h" |
100 | | #include "yb/gutil/casts.h" |
101 | | #include "yb/gutil/map-util.h" |
102 | | #include "yb/gutil/mathlimits.h" |
103 | | #include "yb/gutil/stl_util.h" |
104 | | #include "yb/gutil/strings/escaping.h" |
105 | | #include "yb/gutil/strings/join.h" |
106 | | #include "yb/gutil/strings/substitute.h" |
107 | | #include "yb/gutil/sysinfo.h" |
108 | | #include "yb/gutil/walltime.h" |
109 | | |
110 | | #include "yb/master/master_fwd.h" |
111 | | #include "yb/master/async_rpc_tasks.h" |
112 | | #include "yb/master/backfill_index.h" |
113 | | #include "yb/master/catalog_entity_info.h" |
114 | | #include "yb/master/catalog_loaders.h" |
115 | | #include "yb/master/catalog_manager-internal.h" |
116 | | #include "yb/master/catalog_manager_bg_tasks.h" |
117 | | #include "yb/master/catalog_manager_util.h" |
118 | | #include "yb/master/cluster_balance.h" |
119 | | #include "yb/master/encryption_manager.h" |
120 | | #include "yb/master/master.h" |
121 | | #include "yb/master/master_admin.pb.h" |
122 | | #include "yb/master/master_client.pb.h" |
123 | | #include "yb/master/master_cluster.proxy.h" |
124 | | #include "yb/master/master_dcl.pb.h" |
125 | | #include "yb/master/master_ddl.pb.h" |
126 | | #include "yb/master/master_encryption.pb.h" |
127 | | #include "yb/master/master_error.h" |
128 | | #include "yb/master/master_heartbeat.pb.h" |
129 | | #include "yb/master/master_replication.pb.h" |
130 | | #include "yb/master/master_util.h" |
131 | | #include "yb/master/permissions_manager.h" |
132 | | #include "yb/master/scoped_leader_shared_lock-internal.h" |
133 | | #include "yb/master/sys_catalog.h" |
134 | | #include "yb/master/sys_catalog_constants.h" |
135 | | #include "yb/master/ts_descriptor.h" |
136 | | #include "yb/master/yql_aggregates_vtable.h" |
137 | | #include "yb/master/yql_auth_resource_role_permissions_index.h" |
138 | | #include "yb/master/yql_auth_role_permissions_vtable.h" |
139 | | #include "yb/master/yql_auth_roles_vtable.h" |
140 | | #include "yb/master/yql_columns_vtable.h" |
141 | | #include "yb/master/yql_empty_vtable.h" |
142 | | #include "yb/master/yql_functions_vtable.h" |
143 | | #include "yb/master/yql_indexes_vtable.h" |
144 | | #include "yb/master/yql_keyspaces_vtable.h" |
145 | | #include "yb/master/yql_local_vtable.h" |
146 | | #include "yb/master/yql_partitions_vtable.h" |
147 | | #include "yb/master/yql_peers_vtable.h" |
148 | | #include "yb/master/yql_size_estimates_vtable.h" |
149 | | #include "yb/master/yql_tables_vtable.h" |
150 | | #include "yb/master/yql_triggers_vtable.h" |
151 | | #include "yb/master/yql_types_vtable.h" |
152 | | #include "yb/master/yql_views_vtable.h" |
153 | | #include "yb/master/ysql_transaction_ddl.h" |
154 | | |
155 | | #include "yb/rpc/messenger.h" |
156 | | #include "yb/rpc/rpc_controller.h" |
157 | | |
158 | | #include "yb/tablet/operations/change_metadata_operation.h" |
159 | | #include "yb/tablet/tablet.h" |
160 | | #include "yb/tablet/tablet_metadata.h" |
161 | | #include "yb/tablet/tablet_peer.h" |
162 | | #include "yb/tablet/tablet_retention_policy.h" |
163 | | |
164 | | #include "yb/tserver/remote_bootstrap_client.h" |
165 | | #include "yb/tserver/ts_tablet_manager.h" |
166 | | #include "yb/tserver/tserver_error.h" |
167 | | |
168 | | #include "yb/util/atomic.h" |
169 | | #include "yb/util/countdown_latch.h" |
170 | | #include "yb/util/debug-util.h" |
171 | | #include "yb/util/debug/trace_event.h" |
172 | | #include "yb/util/flag_tags.h" |
173 | | #include "yb/util/format.h" |
174 | | #include "yb/util/hash_util.h" |
175 | | #include "yb/util/locks.h" |
176 | | #include "yb/util/math_util.h" |
177 | | #include "yb/util/metrics.h" |
178 | | #include "yb/util/monotime.h" |
179 | | #include "yb/util/net/net_util.h" |
180 | | #include "yb/util/oid_generator.h" |
181 | | #include "yb/util/random_util.h" |
182 | | #include "yb/util/rw_mutex.h" |
183 | | #include "yb/util/semaphore.h" |
184 | | #include "yb/util/shared_lock.h" |
185 | | #include "yb/util/size_literals.h" |
186 | | #include "yb/util/status.h" |
187 | | #include "yb/util/status_format.h" |
188 | | #include "yb/util/status_log.h" |
189 | | #include "yb/util/stopwatch.h" |
190 | | #include "yb/util/string_util.h" |
191 | | #include "yb/util/sync_point.h" |
192 | | #include "yb/util/thread.h" |
193 | | #include "yb/util/threadpool.h" |
194 | | #include "yb/util/trace.h" |
195 | | #include "yb/util/tsan_util.h" |
196 | | #include "yb/util/uuid.h" |
197 | | |
198 | | #include "yb/yql/pgwrapper/pg_wrapper.h" |
199 | | #include "yb/yql/redis/redisserver/redis_constants.h" |
200 | | |
201 | | using namespace std::literals; |
202 | | using namespace yb::size_literals; |
203 | | |
204 | | DEFINE_int32(master_ts_rpc_timeout_ms, 30 * 1000, // 30 sec |
205 | | "Timeout used for the Master->TS async rpc calls."); |
206 | | TAG_FLAG(master_ts_rpc_timeout_ms, advanced); |
207 | | |
208 | | DEFINE_int32(tablet_creation_timeout_ms, 30 * 1000, // 30 sec |
209 | | "Timeout used by the master when attempting to create tablet " |
210 | | "replicas during table creation."); |
211 | | TAG_FLAG(tablet_creation_timeout_ms, advanced); |
212 | | |
213 | | DEFINE_test_flag(bool, disable_tablet_deletion, false, |
214 | | "Whether catalog manager should disable tablet deletion."); |
215 | | |
216 | | DEFINE_bool(catalog_manager_wait_for_new_tablets_to_elect_leader, true, |
217 | | "Whether the catalog manager should wait for a newly created tablet to " |
218 | | "elect a leader before considering it successfully created. " |
219 | | "This is disabled in some tests where we explicitly manage leader " |
220 | | "election."); |
221 | | TAG_FLAG(catalog_manager_wait_for_new_tablets_to_elect_leader, hidden); |
222 | | |
223 | | DEFINE_int32(catalog_manager_inject_latency_in_delete_table_ms, 0, |
224 | | "Number of milliseconds that the master will sleep in DeleteTable."); |
225 | | TAG_FLAG(catalog_manager_inject_latency_in_delete_table_ms, hidden); |
226 | | |
227 | | DECLARE_int32(catalog_manager_bg_task_wait_ms); |
228 | | |
229 | | DEFINE_int32(replication_factor, 3, |
230 | | "Default number of replicas for tables that do not have the num_replicas set."); |
231 | | TAG_FLAG(replication_factor, advanced); |
232 | | |
233 | | DEFINE_int32(max_create_tablets_per_ts, 50, |
234 | | "The number of tablets per TS that can be requested for a new table."); |
235 | | TAG_FLAG(max_create_tablets_per_ts, advanced); |
236 | | |
237 | | DEFINE_int32(catalog_manager_report_batch_size, 1, |
238 | | "The max number of tablets evaluated in the heartbeat as a single SysCatalog update."); |
239 | | TAG_FLAG(catalog_manager_report_batch_size, advanced); |
240 | | |
241 | | DEFINE_int32(master_failover_catchup_timeout_ms, 30 * 1000 * yb::kTimeMultiplier, // 30 sec |
242 | | "Amount of time to give a newly-elected leader master to load" |
243 | | " the previous master's metadata and become active. If this time" |
244 | | " is exceeded, the node crashes."); |
245 | | TAG_FLAG(master_failover_catchup_timeout_ms, advanced); |
246 | | TAG_FLAG(master_failover_catchup_timeout_ms, experimental); |
247 | | |
248 | | DEFINE_bool(master_tombstone_evicted_tablet_replicas, true, |
249 | | "Whether the Master should tombstone (delete) tablet replicas that " |
250 | | "are no longer part of the latest reported raft config."); |
251 | | TAG_FLAG(master_tombstone_evicted_tablet_replicas, hidden); |
252 | | DECLARE_bool(master_ignore_deleted_on_load); |
253 | | |
254 | | // Temporary. Can be removed after long-run testing. |
255 | | DEFINE_bool(master_ignore_stale_cstate, true, |
256 | | "Whether Master processes the raft config when the version is lower."); |
257 | | TAG_FLAG(master_ignore_stale_cstate, hidden); |
258 | | |
259 | | DEFINE_bool(catalog_manager_check_ts_count_for_create_table, true, |
260 | | "Whether the master should ensure that there are enough live tablet " |
261 | | "servers to satisfy the provided replication count before allowing " |
262 | | "a table to be created."); |
263 | | TAG_FLAG(catalog_manager_check_ts_count_for_create_table, hidden); |
264 | | |
265 | | DEFINE_test_flag(bool, catalog_manager_check_yql_partitions_exist_for_is_create_table_done, true, |
266 | | "Whether the master should ensure that all of a table's tablets are " |
267 | | "in the YQL system.partitions vtable during the IsCreateTableDone check."); |
268 | | |
269 | | METRIC_DEFINE_gauge_uint32(cluster, num_tablet_servers_live, |
270 | | "Number of live tservers in the cluster", yb::MetricUnit::kUnits, |
271 | | "The number of tablet servers that have responded or done a heartbeat " |
272 | | "in the time interval defined by the gflag " |
273 | | "FLAGS_tserver_unresponsive_timeout_ms."); |
274 | | |
275 | | METRIC_DEFINE_gauge_uint32(cluster, num_tablet_servers_dead, |
276 | | "Number of dead tservers in the cluster", yb::MetricUnit::kUnits, |
277 | | "The number of tablet servers that have not responded or done a " |
278 | | "heartbeat in the time interval defined by the gflag " |
279 | | "FLAGS_tserver_unresponsive_timeout_ms."); |
280 | | |
281 | | DEFINE_test_flag(uint64, inject_latency_during_remote_bootstrap_secs, 0, |
282 | | "Number of seconds to sleep during a remote bootstrap."); |
283 | | |
284 | | DEFINE_test_flag(uint64, inject_latency_during_tablet_report_ms, 0, |
285 | | "Number of milliseconds to sleep during the processing of a tablet batch."); |
286 | | |
287 | | DEFINE_test_flag(bool, catalog_manager_simulate_system_table_create_failure, false, |
288 | | "This is only used in tests to simulate a failure where the table information is " |
289 | | "persisted in syscatalog, but the tablet information is not yet persisted and " |
290 | | "there is a failure."); |
291 | | |
292 | | DEFINE_string(cluster_uuid, "", "Cluster UUID to be used by this cluster"); |
293 | | TAG_FLAG(cluster_uuid, hidden); |
294 | | |
295 | | DECLARE_int32(yb_num_shards_per_tserver); |
296 | | |
297 | | DEFINE_int32(transaction_table_num_tablets, 0, |
298 | | "Number of tablets to use when creating the transaction status table." |
299 | | "0 to use transaction_table_num_tablets_per_tserver."); |
300 | | |
301 | | DEFINE_int32(transaction_table_num_tablets_per_tserver, kAutoDetectNumShardsPerTServer, |
302 | | "The default number of tablets per tablet server for transaction status table. If the value is " |
303 | | "-1, the system automatically determines an appropriate value based on number of CPU cores."); |
304 | | |
305 | | DEFINE_bool(auto_create_local_transaction_tables, true, |
306 | | "Whether or not to create local transaction status tables automatically on table " |
307 | | "creation with a tablespace with placement specified."); |
308 | | |
309 | | DEFINE_test_flag(bool, name_transaction_tables_with_tablespace_id, false, |
310 | | "This is only used in tests to make associating automatically created transaction " |
311 | | "tables with their tablespaces easier, and causes transaction tables created " |
312 | | "automatically for tablespaces to include the tablespace oid in their names."); |
313 | | |
314 | | DEFINE_bool(master_enable_metrics_snapshotter, false, "Should metrics snapshotter be enabled"); |
315 | | |
316 | | DEFINE_int32(metrics_snapshots_table_num_tablets, 0, |
317 | | "Number of tablets to use when creating the metrics snapshots table." |
318 | | "0 to use the same default num tablets as for regular tables."); |
319 | | |
320 | | DEFINE_bool(disable_index_backfill, false, |
321 | | "A kill switch to disable multi-stage backfill for YCQL indexes."); |
322 | | TAG_FLAG(disable_index_backfill, runtime); |
323 | | TAG_FLAG(disable_index_backfill, hidden); |
324 | | |
325 | | DEFINE_bool(disable_index_backfill_for_non_txn_tables, true, |
326 | | "A kill switch to disable multi-stage backfill for user enforced YCQL indexes. " |
327 | | "Note that enabling this feature may cause the create index flow to be slow. " |
328 | | "This is needed to ensure the safety of the index backfill process. See also " |
329 | | "index_backfill_upperbound_for_user_enforced_txn_duration_ms"); |
330 | | TAG_FLAG(disable_index_backfill_for_non_txn_tables, runtime); |
331 | | TAG_FLAG(disable_index_backfill_for_non_txn_tables, hidden); |
332 | | |
333 | | DEFINE_bool(enable_transactional_ddl_gc, true, |
334 | | "A kill switch for transactional DDL GC. Temporary safety measure."); |
335 | | TAG_FLAG(enable_transactional_ddl_gc, runtime); |
336 | | TAG_FLAG(enable_transactional_ddl_gc, hidden); |
337 | | |
338 | | DEFINE_bool( |
339 | | hide_pg_catalog_table_creation_logs, false, |
340 | | "Whether to hide detailed log messages for PostgreSQL catalog table creation. " |
341 | | "This cuts down test logs significantly."); |
342 | | TAG_FLAG(hide_pg_catalog_table_creation_logs, hidden); |
343 | | |
344 | | DEFINE_test_flag(int32, simulate_slow_table_create_secs, 0, |
345 | | "Simulates a slow table creation by sleeping after the table has been added to memory."); |
346 | | |
347 | | DEFINE_test_flag(int32, simulate_slow_system_tablet_bootstrap_secs, 0, |
348 | | "Simulates a slow tablet bootstrap by adding a sleep before system tablet init."); |
349 | | |
350 | | DEFINE_test_flag(bool, return_error_if_namespace_not_found, false, |
351 | | "Return an error from ListTables if a namespace id is not found in the map"); |
352 | | |
353 | | DEFINE_test_flag(bool, hang_on_namespace_transition, false, |
354 | | "Used in tests to simulate a lapse between issuing a namespace op and final processing."); |
355 | | |
356 | | DEFINE_test_flag(bool, simulate_crash_after_table_marked_deleting, false, |
357 | | "Crash yb-master after table's state is set to DELETING. This skips tablets deletion."); |
358 | | |
359 | | DEFINE_bool(master_drop_table_after_task_response, true, |
360 | | "Mark a table as DELETED as soon as we get all the responses from all the TS."); |
361 | | TAG_FLAG(master_drop_table_after_task_response, advanced); |
362 | | TAG_FLAG(master_drop_table_after_task_response, runtime); |
363 | | |
364 | | DEFINE_test_flag(bool, tablegroup_master_only, false, |
365 | | "This is only for MasterTest to be able to test tablegroups without the" |
366 | | " transaction status table being created."); |
367 | | |
368 | | DEFINE_bool(enable_register_ts_from_raft, true, "Whether to register a tserver from the consensus " |
369 | | "information of a reported tablet."); |
370 | | |
371 | | DECLARE_int32(tserver_unresponsive_timeout_ms); |
372 | | |
373 | | DEFINE_bool(use_create_table_leader_hint, true, |
374 | | "Whether the Master should hint which replica for each tablet should " |
375 | | "be leader initially on tablet creation."); |
376 | | TAG_FLAG(use_create_table_leader_hint, runtime); |
377 | | |
378 | | DEFINE_test_flag(bool, create_table_leader_hint_min_lexicographic, false, |
379 | | "Whether the Master should hint replica with smallest lexicographic rank for each " |
380 | | "tablet as leader initially on tablet creation."); |
381 | | |
382 | | DEFINE_double(heartbeat_safe_deadline_ratio, .20, |
383 | | "When the heartbeat deadline has this percentage of time remaining, " |
384 | | "the master should halt tablet report processing so it can respond in time."); |
385 | | DECLARE_int32(heartbeat_rpc_timeout_ms); |
386 | | DECLARE_CAPABILITY(TabletReportLimit); |
387 | | |
388 | | DEFINE_int32(partitions_vtable_cache_refresh_secs, 0, |
389 | | "Amount of time to wait before refreshing the system.partitions cached vtable. " |
390 | | "If generate_partitions_vtable_on_changes is set, then this background task will " |
391 | | "update the cache using the internal map, but won't do any generating of the vtable."); |
392 | | |
393 | | DEFINE_int32(txn_table_wait_min_ts_count, 1, |
394 | | "Minimum Number of TS to wait for before creating the transaction status table." |
395 | | " Default value is 1. We wait for atleast --replication_factor if this value" |
396 | | " is smaller than that"); |
397 | | TAG_FLAG(txn_table_wait_min_ts_count, advanced); |
398 | | |
399 | | DEFINE_bool(enable_ysql_tablespaces_for_placement, true, |
400 | | "If set, tablespaces will be used for placement of YSQL tables."); |
401 | | TAG_FLAG(enable_ysql_tablespaces_for_placement, runtime); |
402 | | |
403 | | DEFINE_int32(ysql_tablespace_info_refresh_secs, 30, |
404 | | "Frequency at which the table to tablespace information will be updated in master " |
405 | | "from pg catalog tables. A value of -1 disables the refresh task."); |
406 | | TAG_FLAG(ysql_tablespace_info_refresh_secs, runtime); |
407 | | |
408 | | DEFINE_int64(tablet_split_size_threshold_bytes, 0, |
409 | | "DEPRECATED -- Threshold on tablet size after which tablet should be split. Automated " |
410 | | "splitting is disabled if this value is set to 0."); |
411 | | TAG_FLAG(tablet_split_size_threshold_bytes, hidden); |
412 | | |
413 | | DEFINE_int64(tablet_split_low_phase_shard_count_per_node, 8, |
414 | | "The per-node tablet count until which a table is splitting at the phase 1 threshold, " |
415 | | "as defined by tablet_split_low_phase_size_threshold_bytes."); |
416 | | DEFINE_int64(tablet_split_high_phase_shard_count_per_node, 24, |
417 | | "The per-node tablet count until which a table is splitting at the phase 2 threshold, " |
418 | | "as defined by tablet_split_high_phase_size_threshold_bytes."); |
419 | | |
420 | | DEFINE_int64(tablet_split_low_phase_size_threshold_bytes, 512_MB, |
421 | | "The tablet size threshold at which to split tablets in phase 1. " |
422 | | "See tablet_split_low_phase_shard_count_per_node."); |
423 | | DEFINE_int64(tablet_split_high_phase_size_threshold_bytes, 10_GB, |
424 | | "The tablet size threshold at which to split tablets in phase 2. " |
425 | | "See tablet_split_high_phase_shard_count_per_node."); |
426 | | DEFINE_int64(tablet_force_split_threshold_bytes, 100_GB, |
427 | | "The tablet size threshold at which to split tablets regardless of how many tablets " |
428 | | "exist in the table already. This should be configured to prevent runaway whale " |
429 | | "tablets from forming in your cluster even if both automatic splitting phases have " |
430 | | "been finished."); |
431 | | |
432 | | DEFINE_test_flag(bool, crash_server_on_sys_catalog_leader_affinity_move, false, |
433 | | "When set, crash the master process if it performs a sys catalog leader affinity " |
434 | | "move."); |
435 | | DEFINE_int32(blacklist_progress_initial_delay_secs, yb::master::kDelayAfterFailoverSecs, |
436 | | "When a master leader failsover, the time until which the progress of load movement " |
437 | | "off the blacklisted tservers is reported as 0. This initial delay " |
438 | | "gives sufficient time for heartbeats so that we don't report" |
439 | | " a premature incorrect completion."); |
440 | | TAG_FLAG(blacklist_progress_initial_delay_secs, runtime); |
441 | | |
442 | | DEFINE_test_flag(bool, validate_all_tablet_candidates, false, |
443 | | "When set to true, consider any tablet a valid candidate for splitting. " |
444 | | "Specifically this flag ensures that ValidateSplitCandidateTable and " |
445 | | "ValidateSplitCandidateTablet always return OK and all tablets are considered " |
446 | | "valid candidates for splitting."); |
447 | | |
448 | | DEFINE_test_flag(bool, skip_placement_validation_createtable_api, false, |
449 | | "When set, it skips checking that all the tablets of a table have enough tservers" |
450 | | " conforming to the table placement policy during CreateTable API call."); |
451 | | TAG_FLAG(TEST_skip_placement_validation_createtable_api, runtime); |
452 | | |
453 | | DEFINE_test_flag(int32, slowdown_alter_table_rpcs_ms, 0, |
454 | | "Slows down the alter table rpc's send and response handler so that the TServer " |
455 | | "has a heartbeat delay and triggers tablet leader change."); |
456 | | |
457 | | DEFINE_test_flag(bool, reject_delete_not_serving_tablet_rpc, false, |
458 | | "Whether to reject DeleteNotServingTablet RPC."); |
459 | | |
460 | | DEFINE_test_flag(double, crash_after_creating_single_split_tablet, 0.0, |
461 | | "Crash inside CatalogManager::RegisterNewTabletForSplit after calling Upsert"); |
462 | | |
463 | | DEFINE_bool(enable_delete_truncate_xcluster_replicated_table, false, |
464 | | "When set, enables deleting/truncating tables currently in xCluster replication"); |
465 | | TAG_FLAG(enable_delete_truncate_xcluster_replicated_table, runtime); |
466 | | |
467 | | DEFINE_test_flag(bool, sequential_colocation_ids, false, |
468 | | "When set, colocation IDs will be assigned sequentially (starting from 20001) " |
469 | | "rather than at random. This is especially useful for making pg_regress " |
470 | | "tests output consistent and predictable."); |
471 | | |
472 | | namespace yb { |
473 | | namespace master { |
474 | | |
475 | | using std::atomic; |
476 | | using std::shared_ptr; |
477 | | using std::string; |
478 | | using std::unique_ptr; |
479 | | using std::vector; |
480 | | |
481 | | using namespace std::placeholders; |
482 | | |
483 | | using base::subtle::NoBarrier_Load; |
484 | | using base::subtle::NoBarrier_CompareAndSwap; |
485 | | using consensus::kMinimumTerm; |
486 | | using consensus::CONSENSUS_CONFIG_COMMITTED; |
487 | | using consensus::CONSENSUS_CONFIG_ACTIVE; |
488 | | using consensus::COMMITTED_OPID; |
489 | | using consensus::Consensus; |
490 | | using consensus::ConsensusMetadata; |
491 | | using consensus::ConsensusServiceProxy; |
492 | | using consensus::ConsensusStatePB; |
493 | | using consensus::GetConsensusRole; |
494 | | using consensus::PeerMemberType; |
495 | | using consensus::RaftPeerPB; |
496 | | using consensus::StartRemoteBootstrapRequestPB; |
497 | | using rpc::RpcContext; |
498 | | using server::MonitoredTask; |
499 | | using strings::Substitute; |
500 | | using tablet::TABLET_DATA_COPYING; |
501 | | using tablet::TABLET_DATA_DELETED; |
502 | | using tablet::TABLET_DATA_READY; |
503 | | using tablet::TABLET_DATA_TOMBSTONED; |
504 | | using tablet::TabletDataState; |
505 | | using tablet::RaftGroupMetadata; |
506 | | using tablet::RaftGroupMetadataPtr; |
507 | | using tablet::TabletPeer; |
508 | | using tablet::RaftGroupStatePB; |
509 | | using tablet::TabletStatusListener; |
510 | | using tablet::TabletStatusPB; |
511 | | using tserver::HandleReplacingStaleTablet; |
512 | | using tserver::TabletServerErrorPB; |
513 | | using yb::pgwrapper::PgWrapper; |
514 | | using yb::server::MasterAddressesToString; |
515 | | |
516 | | using yb::client::YBClient; |
517 | | using yb::client::YBClientBuilder; |
518 | | using yb::client::YBColumnSchema; |
519 | | using yb::client::YBSchema; |
520 | | using yb::client::YBSchemaBuilder; |
521 | | using yb::client::YBTable; |
522 | | using yb::client::YBTableName; |
523 | | |
524 | | namespace { |
525 | | |
526 | | // Macros to access index information in CATALOG. |
527 | | // |
528 | | // NOTES from file master.proto for SysTablesEntryPB. |
529 | | // - For index table: [to be deprecated and replaced by "index_info"] |
530 | | // optional bytes indexed_table_id = 13; // Indexed table id of this index. |
531 | | // optional bool is_local_index = 14 [ default = false ]; // Whether this is a local index. |
532 | | // optional bool is_unique_index = 15 [ default = false ]; // Whether this is a unique index. |
// - During the transition period we have to consider both sets of fields; the following macros
//   and helper functions avoid duplicating the protobuf version check throughout our code.
535 | | |
536 | 47.3k | const std::string& GetIndexedTableId(const SysTablesEntryPB& pb) { |
537 | 47.3k | return pb.has_index_info() ? pb.index_info().indexed_table_id() : pb.indexed_table_id()0 ; |
538 | 47.3k | } |
539 | | |
// Evaluates to the "is local index" property of the table protobuf `tabpb`: taken from
// `index_info` when that sub-message is present, otherwise from the top-level
// `is_local_index` field. The argument is parenthesised so that expressions such as
// `*ptr` expand correctly; note that it is evaluated more than once.
#define PROTO_GET_IS_LOCAL(tabpb) \
    ((tabpb).has_index_info() ? (tabpb).index_info().is_local() \
                              : (tabpb).is_local_index())
543 | | |
// Evaluates to the "is unique index" property of the table protobuf `tabpb`: taken from
// `index_info` when that sub-message is present, otherwise from the top-level
// `is_unique_index` field. The argument is parenthesised so that expressions such as
// `*ptr` expand correctly; note that it is evaluated more than once.
#define PROTO_GET_IS_UNIQUE(tabpb) \
    ((tabpb).has_index_info() ? (tabpb).index_info().is_unique() \
                              : (tabpb).is_unique_index())
547 | | |
// Returns true when the protobuf `pb` describes an index: either it carries an
// `index_info` sub-message, or its top-level `indexed_table_id` field is non-empty.
//
// PB may be any message type exposing has_index_info() and indexed_table_id().
template <class PB>
bool IsIndex(const PB& pb) {
  return pb.has_index_info() || !pb.indexed_table_id().empty();
}
|
552 | | |
553 | 13.9k | bool IsTable(const SysTablesEntryPB& pb) { |
554 | 13.9k | return !IsIndex(pb); |
555 | 13.9k | } |
556 | | |
// Pointer flavour of the index check: true when the protobuf pointed to by `tabpb` has an
// `index_info` sub-message or a non-empty top-level `indexed_table_id`. The argument is
// parenthesised so that expressions such as `&pb` expand correctly; it is evaluated
// more than once.
#define PROTO_PTR_IS_INDEX(tabpb) \
    ((tabpb)->has_index_info() || !(tabpb)->indexed_table_id().empty())
559 | | |
// Pointer flavour of the table check: true when the protobuf pointed to by `tabpb` has
// neither an `index_info` sub-message nor a non-empty top-level `indexed_table_id`.
// The argument is parenthesised so that expressions such as `&pb` expand correctly;
// it is evaluated more than once.
#define PROTO_PTR_IS_TABLE(tabpb) \
    (!(tabpb)->has_index_info() && (tabpb)->indexed_table_id().empty())
562 | | |
// Compiled out: a sketch of the simplified definitions to switch to once the deprecated
// top-level index fields are gone. NOTE(review): PROTO_IS_INDEX / PROTO_IS_TABLE are not
// defined as macros anywhere in the visible code (IsIndex() / IsTable() functions are used
// instead), so this list needs reconciling before it is ever enabled.
#if (0)
// Once the deprecated fields are obsolete, the above macros should be defined as the following.
#define GetIndexedTableId(tabpb) (tabpb.index_info().indexed_table_id())
#define PROTO_GET_IS_LOCAL(tabpb) (tabpb.index_info().is_local())
#define PROTO_GET_IS_UNIQUE(tabpb) (tabpb.index_info().is_unique())
#define PROTO_IS_INDEX(tabpb) (tabpb.has_index_info())
#define PROTO_IS_TABLE(tabpb) (!tabpb.has_index_info())
#define PROTO_PTR_IS_INDEX(tabpb) (tabpb->has_index_info())
#define PROTO_PTR_IS_TABLE(tabpb) (!tabpb->has_index_info())

#endif
574 | | |
575 | | class IndexInfoBuilder { |
576 | | public: |
577 | 18 | explicit IndexInfoBuilder(IndexInfoPB* index_info) : index_info_(*index_info) { |
578 | 18 | DVLOG(3) << " After " << __PRETTY_FUNCTION__ << " index_info_ is " << yb::ToString(index_info_)0 ; |
579 | 18 | } |
580 | | |
581 | 18 | void ApplyProperties(const TableId& indexed_table_id, bool is_local, bool is_unique) { |
582 | 18 | index_info_.set_indexed_table_id(indexed_table_id); |
583 | 18 | index_info_.set_version(0); |
584 | 18 | index_info_.set_is_local(is_local); |
585 | 18 | index_info_.set_is_unique(is_unique); |
586 | 18 | DVLOG(3) << " After " << __PRETTY_FUNCTION__ << " index_info_ is " << yb::ToString(index_info_)0 ; |
587 | 18 | } |
588 | | |
589 | 18 | CHECKED_STATUS ApplyColumnMapping(const Schema& indexed_schema, const Schema& index_schema) { |
590 | 72 | for (size_t i = 0; i < index_schema.num_columns(); i++54 ) { |
591 | 54 | const auto& col_name = index_schema.column(i).name(); |
592 | 54 | const auto indexed_col_idx = indexed_schema.find_column(col_name); |
593 | 54 | if (PREDICT_FALSE(indexed_col_idx == Schema::kColumnNotFound)) { |
594 | 0 | return STATUS(NotFound, "The indexed table column does not exist", col_name); |
595 | 0 | } |
596 | 54 | auto* col = index_info_.add_columns(); |
597 | 54 | col->set_column_id(index_schema.column_id(i)); |
598 | 54 | col->set_indexed_column_id(indexed_schema.column_id(indexed_col_idx)); |
599 | 54 | } |
600 | 18 | index_info_.set_hash_column_count(narrow_cast<uint32_t>(index_schema.num_hash_key_columns())); |
601 | 18 | index_info_.set_range_column_count(narrow_cast<uint32_t>(index_schema.num_range_key_columns())); |
602 | | |
603 | 36 | for (size_t i = 0; i < indexed_schema.num_hash_key_columns(); i++18 ) { |
604 | 18 | index_info_.add_indexed_hash_column_ids(indexed_schema.column_id(i)); |
605 | 18 | } |
606 | 18 | for (size_t i = indexed_schema.num_hash_key_columns(); i < indexed_schema.num_key_columns(); |
607 | 18 | i++0 ) { |
608 | 0 | index_info_.add_indexed_range_column_ids(indexed_schema.column_id(i)); |
609 | 0 | } |
610 | 18 | DVLOG(3) << " After " << __PRETTY_FUNCTION__ << " index_info_ is " << yb::ToString(index_info_)0 ; |
611 | 18 | return Status::OK(); |
612 | 18 | } |
613 | | |
614 | | private: |
615 | | IndexInfoPB& index_info_; |
616 | | }; |
617 | | |
618 | | template<class Lock> |
619 | 469k | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock) { |
620 | | // This covers both in progress and fully deleted objects. |
621 | 469k | if (lock->started_deleting()) { |
622 | 136 | return STATUS_EC_FORMAT( |
623 | 136 | NotFound, MasterError(MasterErrorPB::OBJECT_NOT_FOUND), |
624 | 136 | "The object '$0.$1' does not exist", lock->namespace_id(), lock->name()); |
625 | 136 | } |
626 | 469k | if (!lock->visible_to_client()) { |
627 | 1 | return STATUS_EC_FORMAT( |
628 | 1 | ServiceUnavailable, MasterError(MasterErrorPB::OBJECT_NOT_FOUND), |
629 | 1 | "The object '$0.$1' is not running", lock->namespace_id(), lock->name()); |
630 | 1 | } |
631 | 469k | return Status::OK(); |
632 | 469k | } catalog_manager.cc:yb::Status yb::master::(anonymous namespace)::CheckIfTableDeletedOrNotVisibleToClient<yb::CowWriteLock<yb::master::PersistentTableInfo> >(yb::CowWriteLock<yb::master::PersistentTableInfo> const&) Line | Count | Source | 619 | 7.09k | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock) { | 620 | | // This covers both in progress and fully deleted objects. | 621 | 7.09k | if (lock->started_deleting()) { | 622 | 0 | return STATUS_EC_FORMAT( | 623 | 0 | NotFound, MasterError(MasterErrorPB::OBJECT_NOT_FOUND), | 624 | 0 | "The object '$0.$1' does not exist", lock->namespace_id(), lock->name()); | 625 | 0 | } | 626 | 7.09k | if (!lock->visible_to_client()) { | 627 | 0 | return STATUS_EC_FORMAT( | 628 | 0 | ServiceUnavailable, MasterError(MasterErrorPB::OBJECT_NOT_FOUND), | 629 | 0 | "The object '$0.$1' is not running", lock->namespace_id(), lock->name()); | 630 | 0 | } | 631 | 7.09k | return Status::OK(); | 632 | 7.09k | } |
catalog_manager.cc:yb::Status yb::master::(anonymous namespace)::CheckIfTableDeletedOrNotVisibleToClient<yb::CowReadLock<yb::master::PersistentTableInfo> >(yb::CowReadLock<yb::master::PersistentTableInfo> const&) Line | Count | Source | 619 | 462k | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock) { | 620 | | // This covers both in progress and fully deleted objects. | 621 | 462k | if (lock->started_deleting()) { | 622 | 136 | return STATUS_EC_FORMAT( | 623 | 136 | NotFound, MasterError(MasterErrorPB::OBJECT_NOT_FOUND), | 624 | 136 | "The object '$0.$1' does not exist", lock->namespace_id(), lock->name()); | 625 | 136 | } | 626 | 462k | if (!lock->visible_to_client()) { | 627 | 1 | return STATUS_EC_FORMAT( | 628 | 1 | ServiceUnavailable, MasterError(MasterErrorPB::OBJECT_NOT_FOUND), | 629 | 1 | "The object '$0.$1' is not running", lock->namespace_id(), lock->name()); | 630 | 1 | } | 631 | 462k | return Status::OK(); | 632 | 462k | } |
|
633 | | |
634 | | template<class Lock, class RespClass> |
635 | 429k | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock, RespClass* resp) { |
636 | 429k | auto status = CheckIfTableDeletedOrNotVisibleToClient(lock); |
637 | 429k | if (!status.ok()) { |
638 | 52 | return SetupError(resp->mutable_error(), status); |
639 | 52 | } |
640 | 429k | return Status::OK(); |
641 | 429k | } catalog_manager.cc:yb::Status yb::master::(anonymous namespace)::CheckIfTableDeletedOrNotVisibleToClient<yb::CowWriteLock<yb::master::PersistentTableInfo>, yb::master::CreateTableResponsePB>(yb::CowWriteLock<yb::master::PersistentTableInfo> const&, yb::master::CreateTableResponsePB*) Line | Count | Source | 635 | 1.19k | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock, RespClass* resp) { | 636 | 1.19k | auto status = CheckIfTableDeletedOrNotVisibleToClient(lock); | 637 | 1.19k | if (!status.ok()) { | 638 | 0 | return SetupError(resp->mutable_error(), status); | 639 | 0 | } | 640 | 1.19k | return Status::OK(); | 641 | 1.19k | } |
catalog_manager.cc:yb::Status yb::master::(anonymous namespace)::CheckIfTableDeletedOrNotVisibleToClient<yb::CowReadLock<yb::master::PersistentTableInfo>, yb::master::CreateTableResponsePB>(yb::CowReadLock<yb::master::PersistentTableInfo> const&, yb::master::CreateTableResponsePB*) Line | Count | Source | 635 | 1.20k | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock, RespClass* resp) { | 636 | 1.20k | auto status = CheckIfTableDeletedOrNotVisibleToClient(lock); | 637 | 1.20k | if (!status.ok()) { | 638 | 1 | return SetupError(resp->mutable_error(), status); | 639 | 1 | } | 640 | 1.20k | return Status::OK(); | 641 | 1.20k | } |
catalog_manager.cc:yb::Status yb::master::(anonymous namespace)::CheckIfTableDeletedOrNotVisibleToClient<yb::CowReadLock<yb::master::PersistentTableInfo>, yb::master::GetTransactionStatusTabletsResponsePB>(yb::CowReadLock<yb::master::PersistentTableInfo> const&, yb::master::GetTransactionStatusTabletsResponsePB*) Line | Count | Source | 635 | 3.31k | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock, RespClass* resp) { | 636 | 3.31k | auto status = CheckIfTableDeletedOrNotVisibleToClient(lock); | 637 | 3.31k | if (!status.ok()) { | 638 | 0 | return SetupError(resp->mutable_error(), status); | 639 | 0 | } | 640 | 3.31k | return Status::OK(); | 641 | 3.31k | } |
catalog_manager.cc:yb::Status yb::master::(anonymous namespace)::CheckIfTableDeletedOrNotVisibleToClient<yb::CowReadLock<yb::master::PersistentTableInfo>, yb::master::TruncateTableResponsePB>(yb::CowReadLock<yb::master::PersistentTableInfo> const&, yb::master::TruncateTableResponsePB*) Line | Count | Source | 635 | 7.17k | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock, RespClass* resp) { | 636 | 7.17k | auto status = CheckIfTableDeletedOrNotVisibleToClient(lock); | 637 | 7.17k | if (!status.ok()) { | 638 | 0 | return SetupError(resp->mutable_error(), status); | 639 | 0 | } | 640 | 7.17k | return Status::OK(); | 641 | 7.17k | } |
catalog_manager.cc:yb::Status yb::master::(anonymous namespace)::CheckIfTableDeletedOrNotVisibleToClient<yb::CowReadLock<yb::master::PersistentTableInfo>, yb::master::IsTruncateTableDoneResponsePB>(yb::CowReadLock<yb::master::PersistentTableInfo> const&, yb::master::IsTruncateTableDoneResponsePB*) Line | Count | Source | 635 | 10.5k | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock, RespClass* resp) { | 636 | 10.5k | auto status = CheckIfTableDeletedOrNotVisibleToClient(lock); | 637 | 10.5k | if (!status.ok()) { | 638 | 0 | return SetupError(resp->mutable_error(), status); | 639 | 0 | } | 640 | 10.5k | return Status::OK(); | 641 | 10.5k | } |
catalog_manager.cc:yb::Status yb::master::(anonymous namespace)::CheckIfTableDeletedOrNotVisibleToClient<yb::CowWriteLock<yb::master::PersistentTableInfo>, yb::master::AlterTableResponsePB>(yb::CowWriteLock<yb::master::PersistentTableInfo> const&, yb::master::AlterTableResponsePB*) Line | Count | Source | 635 | 5.89k | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock, RespClass* resp) { | 636 | 5.89k | auto status = CheckIfTableDeletedOrNotVisibleToClient(lock); | 637 | 5.89k | if (!status.ok()) { | 638 | 0 | return SetupError(resp->mutable_error(), status); | 639 | 0 | } | 640 | 5.89k | return Status::OK(); | 641 | 5.89k | } |
catalog_manager.cc:yb::Status yb::master::(anonymous namespace)::CheckIfTableDeletedOrNotVisibleToClient<yb::CowReadLock<yb::master::PersistentTableInfo>, yb::master::IsAlterTableDoneResponsePB>(yb::CowReadLock<yb::master::PersistentTableInfo> const&, yb::master::IsAlterTableDoneResponsePB*) Line | Count | Source | 635 | 1.39k | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock, RespClass* resp) { | 636 | 1.39k | auto status = CheckIfTableDeletedOrNotVisibleToClient(lock); | 637 | 1.39k | if (!status.ok()) { | 638 | 0 | return SetupError(resp->mutable_error(), status); | 639 | 0 | } | 640 | 1.39k | return Status::OK(); | 641 | 1.39k | } |
catalog_manager.cc:yb::Status yb::master::(anonymous namespace)::CheckIfTableDeletedOrNotVisibleToClient<yb::CowReadLock<yb::master::PersistentTableInfo>, yb::master::GetTableSchemaResponsePB>(yb::CowReadLock<yb::master::PersistentTableInfo> const&, yb::master::GetTableSchemaResponsePB*) Line | Count | Source | 635 | 170k | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock, RespClass* resp) { | 636 | 170k | auto status = CheckIfTableDeletedOrNotVisibleToClient(lock); | 637 | 170k | if (!status.ok()) { | 638 | 21 | return SetupError(resp->mutable_error(), status); | 639 | 21 | } | 640 | 169k | return Status::OK(); | 641 | 170k | } |
Unexecuted instantiation: catalog_manager.cc:yb::Status yb::master::(anonymous namespace)::CheckIfTableDeletedOrNotVisibleToClient<yb::CowReadLock<yb::master::PersistentTableInfo>, yb::master::GetColocatedTabletSchemaResponsePB>(yb::CowReadLock<yb::master::PersistentTableInfo> const&, yb::master::GetColocatedTabletSchemaResponsePB*) catalog_manager.cc:yb::Status yb::master::(anonymous namespace)::CheckIfTableDeletedOrNotVisibleToClient<yb::CowReadLock<yb::master::PersistentTableInfo>, yb::master::GetTableLocationsResponsePB>(yb::CowReadLock<yb::master::PersistentTableInfo> const&, yb::master::GetTableLocationsResponsePB*) Line | Count | Source | 635 | 228k | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock, RespClass* resp) { | 636 | 228k | auto status = CheckIfTableDeletedOrNotVisibleToClient(lock); | 637 | 228k | if (!status.ok()) { | 638 | 30 | return SetupError(resp->mutable_error(), status); | 639 | 30 | } | 640 | 228k | return Status::OK(); | 641 | 228k | } |
|
642 | | |
643 | 10.6k | #define VERIFY_NAMESPACE_FOUND(expr, resp) RESULT_CHECKER_HELPER( \ |
644 | 10.0k | expr, \ |
645 | 10.0k | if (!__result.ok()) { \ |
646 | 10.0k | return SetupError((resp)->mutable_error(), __result.status()); \ |
647 | 10.0k | }); |
648 | | |
649 | 2 | MasterErrorPB_Code NamespaceMasterError(SysNamespaceEntryPB_State state) { |
650 | 2 | switch (state) { |
651 | 2 | case SysNamespaceEntryPB::PREPARING: FALLTHROUGH_INTENDED; |
652 | 2 | case SysNamespaceEntryPB::DELETING: |
653 | 2 | return MasterErrorPB::IN_TRANSITION_CAN_RETRY; |
654 | 0 | case SysNamespaceEntryPB::DELETED: FALLTHROUGH_INTENDED; |
655 | 0 | case SysNamespaceEntryPB::FAILED: FALLTHROUGH_INTENDED; |
656 | 0 | case SysNamespaceEntryPB::RUNNING: |
657 | 0 | return MasterErrorPB::INTERNAL_ERROR; |
658 | 0 | default: |
659 | 0 | FATAL_INVALID_ENUM_VALUE(SysNamespaceEntryPB_State, state); |
660 | 2 | } |
661 | 2 | } |
662 | | |
663 | 299k | size_t GetNameMapperIndex(YQLDatabase db_type) { |
664 | 299k | switch (db_type) { |
665 | 0 | case YQL_DATABASE_UNKNOWN: break; |
666 | 286k | case YQL_DATABASE_CQL: return 1; |
667 | 5.00k | case YQL_DATABASE_PGSQL: return 2; |
668 | 8.09k | case YQL_DATABASE_REDIS: return 3; |
669 | 299k | } |
670 | 0 | CHECK(false) << "Unexpected db type " << db_type; |
671 | 0 | return 0; |
672 | 299k | } |
673 | | |
674 | 9.23k | bool IsIndexBackfillEnabled(TableType table_type, bool is_transactional) { |
675 | | // Fetch the runtime flag to prevent any issues from the updates to flag while processing. |
676 | 9.23k | const bool disabled = |
677 | 9.23k | (table_type == PGSQL_TABLE_TYPE |
678 | 9.23k | ? GetAtomicFlag(&FLAGS_ysql_disable_index_backfill)5.58k |
679 | 9.23k | : GetAtomicFlag(&FLAGS_disable_index_backfill)3.65k || |
680 | 3.65k | (!is_transactional && GetAtomicFlag(&FLAGS_disable_index_backfill_for_non_txn_tables)3.04k )); |
681 | 9.23k | return !disabled; |
682 | 9.23k | } |
683 | | |
684 | | constexpr auto kDefaultYQLPartitionsRefreshBgTaskSleep = 10s; |
685 | | |
686 | | void FillRetainedBySnapshotSchedules( |
687 | | const SnapshotSchedulesToObjectIdsMap& schedules_to_tables_map, |
688 | | const TableId& table_id, |
689 | 5.91k | RepeatedBytes* retained_by_snapshot_schedules) { |
690 | 5.91k | for (const auto& entry : schedules_to_tables_map) { |
691 | 4 | if (std::binary_search(entry.second.begin(), entry.second.end(), table_id)) { |
692 | 4 | retained_by_snapshot_schedules->Add()->assign( |
693 | 4 | entry.first.AsSlice().cdata(), entry.first.size()); |
694 | 4 | } |
695 | 4 | } |
696 | 5.91k | } |
697 | | |
698 | 7.97k | int GetTransactionTableNumShardsPerTServer() { |
699 | 7.97k | int value = 8; |
700 | 7.97k | if (IsTsan()) { |
701 | 0 | value = 2; |
702 | 7.97k | } else if (base::NumCPUs() <= 2) { |
703 | 0 | value = 4; |
704 | 0 | } |
705 | 7.97k | return value; |
706 | 7.97k | } |
707 | | |
708 | 8.07k | void InitMasterFlags() { |
709 | 8.07k | yb::InitCommonFlags(); |
710 | 8.07k | if (GetAtomicFlag(&FLAGS_transaction_table_num_tablets_per_tserver) == |
711 | 8.07k | kAutoDetectNumShardsPerTServer) { |
712 | 7.97k | const auto value = GetTransactionTableNumShardsPerTServer(); |
713 | 7.97k | VLOG(1) << "Auto setting FLAGS_transaction_table_num_tablets_per_tserver to " << value0 ; |
714 | 7.97k | SetAtomicFlag(value, &FLAGS_transaction_table_num_tablets_per_tserver); |
715 | 7.97k | } |
716 | 8.07k | } |
717 | | |
718 | 13.6k | Result<bool> DoesTableExist(const Result<TableInfoPtr>& result) { |
719 | 13.6k | if (result.ok()) { |
720 | 12.4k | return true; |
721 | 12.4k | } |
722 | 1.20k | if (result.status().IsNotFound() |
723 | 1.20k | && MasterError(result.status()) == MasterErrorPB::OBJECT_NOT_FOUND) { |
724 | 1.20k | return false; |
725 | 1.20k | } |
726 | 0 | return result.status(); |
727 | 1.20k | } |
728 | | |
729 | | } // anonymous namespace |
730 | | |
731 | | //////////////////////////////////////////////////////////// |
732 | | // CatalogManager |
733 | | //////////////////////////////////////////////////////////// |
734 | | |
735 | | CatalogManager::NamespaceInfoMap& CatalogManager::NamespaceNameMapper::operator[]( |
736 | 19.5k | YQLDatabase db_type) { |
737 | 19.5k | return typed_maps_[GetNameMapperIndex(db_type)]; |
738 | 19.5k | } |
739 | | |
740 | | const CatalogManager::NamespaceInfoMap& CatalogManager::NamespaceNameMapper::operator[]( |
741 | 279k | YQLDatabase db_type) const { |
742 | 279k | return typed_maps_[GetNameMapperIndex(db_type)]; |
743 | 279k | } |
744 | | |
745 | 3.75k | void CatalogManager::NamespaceNameMapper::clear() { |
746 | 15.0k | for (auto& m : typed_maps_) { |
747 | 15.0k | m.clear(); |
748 | 15.0k | } |
749 | 3.75k | } |
750 | | |
751 | | CatalogManager::CatalogManager(Master* master) |
752 | | : master_(master), |
753 | | tablet_exists_(false), |
754 | | state_(kConstructed), |
755 | | leader_ready_term_(-1), |
756 | | leader_lock_(RWMutex::Priority::PREFER_WRITING), |
757 | | load_balance_policy_(std::make_unique<ClusterLoadBalancer>(this)), |
758 | | permissions_manager_(std::make_unique<PermissionsManager>(this)), |
759 | | tasks_tracker_(new TasksTracker(IsUserInitiated::kFalse)), |
760 | | jobs_tracker_(new TasksTracker(IsUserInitiated::kTrue)), |
761 | | encryption_manager_(new EncryptionManager()), |
762 | | tablespace_manager_(std::make_shared<YsqlTablespaceManager>(nullptr, nullptr)), |
763 | | tablespace_bg_task_running_(false), |
764 | 8.07k | tablet_split_manager_(this, this, this) { |
765 | 8.07k | InitMasterFlags(); |
766 | 8.07k | CHECK_OK(ThreadPoolBuilder("leader-initialization") |
767 | 8.07k | .set_max_threads(1) |
768 | 8.07k | .Build(&leader_initialization_pool_)); |
769 | 8.07k | CHECK_OK(ThreadPoolBuilder("CatalogManagerBGTasks").Build(&background_tasks_thread_pool_)); |
770 | 8.07k | CHECK_OK(ThreadPoolBuilder("async-tasks").Build(&async_task_pool_)); |
771 | | |
772 | 8.07k | if (master_) { |
773 | 8.07k | sys_catalog_.reset(new SysCatalogTable( |
774 | 8.07k | master_, master_->metric_registry(), |
775 | 8.07k | Bind(&CatalogManager::ElectedAsLeaderCb, Unretained(this)))); |
776 | 8.07k | } |
777 | 8.07k | } |
778 | | |
779 | 92 | CatalogManager::~CatalogManager() { |
780 | 92 | if (StartShutdown()) { |
781 | 0 | CompleteShutdown(); |
782 | 0 | } |
783 | 92 | } |
784 | | |
785 | 8.03k | Status CatalogManager::Init() { |
786 | 8.03k | { |
787 | 8.03k | std::lock_guard<simple_spinlock> l(state_lock_); |
788 | 8.03k | CHECK_EQ(kConstructed, state_); |
789 | 8.03k | state_ = kStarting; |
790 | 8.03k | } |
791 | | |
792 | 8.03k | if (master_) { |
793 | 8.03k | ysql_transaction_ = std::make_unique<YsqlTransactionDdl>( |
794 | 8.03k | sys_catalog_.get(), master_->async_client_initializer().get_client_future(), |
795 | 8.03k | background_tasks_thread_pool_.get()); |
796 | 8.03k | } |
797 | | |
798 | | // Initialize the metrics emitted by the catalog manager. |
799 | 8.03k | metric_num_tablet_servers_live_ = |
800 | 8.03k | METRIC_num_tablet_servers_live.Instantiate(master_->metric_entity_cluster(), 0); |
801 | | |
802 | 8.03k | metric_num_tablet_servers_dead_ = |
803 | 8.03k | METRIC_num_tablet_servers_dead.Instantiate(master_->metric_entity_cluster(), 0); |
804 | | |
805 | 8.03k | RETURN_NOT_OK_PREPEND(InitSysCatalogAsync(), |
806 | 8.03k | "Failed to initialize sys tables async"); |
807 | | |
808 | 8.02k | if (PREDICT_FALSE(FLAGS_TEST_simulate_slow_system_tablet_bootstrap_secs > 0)) { |
809 | 9 | LOG_WITH_PREFIX(INFO) << "Simulating slow system tablet bootstrap"; |
810 | 9 | SleepFor(MonoDelta::FromSeconds(FLAGS_TEST_simulate_slow_system_tablet_bootstrap_secs)); |
811 | 9 | } |
812 | | |
813 | | // WaitUntilRunning() must run outside of the lock as to prevent |
814 | | // deadlock. This is safe as WaitUntilRunning waits for another |
815 | | // thread to finish its work and doesn't itself depend on any state |
816 | | // within CatalogManager. Need not start sys catalog or background tasks |
817 | | // when we are started in shell mode. |
818 | 8.02k | if (!master_->opts().IsShellMode()) { |
819 | 7.88k | RETURN_NOT_OK_PREPEND(sys_catalog_->WaitUntilRunning(), |
820 | 7.88k | "Failed waiting for the catalog tablet to run"); |
821 | 7.88k | std::vector<consensus::RaftPeerPB> masters_raft; |
822 | 7.88k | RETURN_NOT_OK(master_->ListRaftConfigMasters(&masters_raft)); |
823 | 7.88k | std::vector<HostPort> hps; |
824 | 20.6k | for (const auto& peer : masters_raft) { |
825 | 20.6k | if (NodeInstance().permanent_uuid() == peer.permanent_uuid()) { |
826 | 7.88k | continue; |
827 | 7.88k | } |
828 | 12.8k | HostPort hp = HostPortFromPB(DesiredHostPort(peer, master_->MakeCloudInfoPB())); |
829 | 12.8k | hps.push_back(hp); |
830 | 12.8k | } |
831 | 7.88k | universe_key_client_ = std::make_unique<client::UniverseKeyClient>( |
832 | 12.7k | hps, &master_->proxy_cache(), [&] (const encryption::UniverseKeysPB& universe_keys) { |
833 | 12.7k | encryption_manager_->PopulateUniverseKeys(universe_keys); |
834 | 12.7k | }); |
835 | 7.88k | universe_key_client_->GetUniverseKeyRegistryAsync(); |
836 | 7.88k | RETURN_NOT_OK(EnableBgTasks()); |
837 | 7.88k | } |
838 | | |
839 | | // Cache the server registration even for shell mode masters. See |
840 | | // https://github.com/yugabyte/yugabyte-db/issues/8065. |
841 | 8.02k | RETURN_NOT_OK(GetRegistration(&server_registration_)); |
842 | | |
843 | 8.02k | { |
844 | 8.02k | std::lock_guard<simple_spinlock> l(state_lock_); |
845 | 8.02k | CHECK_EQ(kStarting, state_); |
846 | 8.02k | state_ = kRunning; |
847 | 8.02k | } |
848 | | |
849 | 8.02k | Started(); |
850 | | |
851 | 8.02k | return Status::OK(); |
852 | 8.02k | } |
853 | | |
854 | | Status CatalogManager::ChangeEncryptionInfo(const ChangeEncryptionInfoRequestPB* req, |
855 | 0 | ChangeEncryptionInfoResponsePB* resp) { |
856 | 0 | return STATUS(InvalidCommand, "Command only supported in enterprise build."); |
857 | 0 | } |
858 | | |
859 | 3.02k | Status CatalogManager::ElectedAsLeaderCb() { |
860 | 3.02k | time_elected_leader_.store(MonoTime::Now()); |
861 | 3.02k | return leader_initialization_pool_->SubmitClosure( |
862 | 3.02k | Bind(&CatalogManager::LoadSysCatalogDataTask, Unretained(this))); |
863 | 3.02k | } |
864 | | |
865 | 3.02k | Status CatalogManager::WaitUntilCaughtUpAsLeader(const MonoDelta& timeout) { |
866 | 3.02k | string uuid = master_->fs_manager()->uuid(); |
867 | 3.02k | Consensus* consensus = tablet_peer()->consensus(); |
868 | 3.02k | ConsensusStatePB cstate = consensus->ConsensusState(CONSENSUS_CONFIG_ACTIVE); |
869 | 3.02k | if (!cstate.has_leader_uuid() || cstate.leader_uuid() != uuid) { |
870 | 6 | return STATUS_SUBSTITUTE(IllegalState, |
871 | 6 | "Node $0 not leader. Consensus state: $1", uuid, cstate.ShortDebugString()); |
872 | 6 | } |
873 | | |
874 | | // Wait for all transactions to be committed. |
875 | 3.02k | const CoarseTimePoint deadline = CoarseMonoClock::now() + timeout; |
876 | 3.02k | { |
877 | 3.02k | tablet::HistoryCutoffPropagationDisabler disabler(tablet_peer()->tablet()->RetentionPolicy()); |
878 | 3.02k | RETURN_NOT_OK(tablet_peer()->operation_tracker()->WaitForAllToFinish(timeout)); |
879 | 3.02k | } |
880 | | |
881 | 3.02k | RETURN_NOT_OK(tablet_peer()->consensus()->WaitForLeaderLeaseImprecise(deadline)); |
882 | 3.01k | return Status::OK(); |
883 | 3.02k | } |
884 | | |
885 | 3.02k | void CatalogManager::LoadSysCatalogDataTask() { |
886 | 3.02k | auto consensus = tablet_peer()->shared_consensus(); |
887 | 3.02k | const int64_t term = consensus->ConsensusState(CONSENSUS_CONFIG_ACTIVE).current_term(); |
888 | 3.02k | Status s = WaitUntilCaughtUpAsLeader( |
889 | 3.02k | MonoDelta::FromMilliseconds(FLAGS_master_failover_catchup_timeout_ms)); |
890 | | |
891 | 3.02k | int64_t term_after_wait = consensus->ConsensusState(CONSENSUS_CONFIG_ACTIVE).current_term(); |
892 | 3.02k | if (term_after_wait != term) { |
893 | | // If we got elected leader again while waiting to catch up then we will get another callback to |
894 | | // update state from sys_catalog, so bail now. |
895 | | // |
896 | | // If we failed when waiting, i.e. could not acquire a leader lease, this could be due to us |
897 | | // becoming a follower. If we're not partitioned away, we'll know about a new term soon. |
898 | 5 | LOG_WITH_PREFIX(INFO) |
899 | 5 | << "Term change from " << term << " to " << term_after_wait |
900 | 5 | << " while waiting for master leader catchup. Not loading sys catalog metadata. " |
901 | 5 | << "Status of waiting: " << s; |
902 | 5 | return; |
903 | 5 | } |
904 | | |
905 | 3.02k | if (!s.ok()) { |
906 | | // This could happen e.g. if we are a partitioned-away leader that failed to acquire a leader |
907 | | // lease. |
908 | | // |
909 | | // TODO: handle this cleanly by transitioning to a follower without crashing. |
910 | 6 | LOG_WITH_PREFIX(WARNING) << "Failed waiting for node to catch up after master election: " << s; |
911 | | |
912 | 6 | if (s.IsTimedOut()) { |
913 | 0 | LOG_WITH_PREFIX(FATAL) << "Shutting down due to unavailability of other masters after" |
914 | 0 | << " election. TODO: Abdicate instead."; |
915 | 0 | } |
916 | 6 | return; |
917 | 6 | } |
918 | | |
919 | 3.01k | LOG_WITH_PREFIX(INFO) << "Loading table and tablet metadata into memory for term " << term; |
920 | 3.01k | LOG_SLOW_EXECUTION(WARNING, 1000, LogPrefix() + "Loading metadata into memory") { |
921 | 3.01k | Status status = VisitSysCatalog(term); |
922 | 3.01k | if (!status.ok()) { |
923 | 4 | { |
924 | 4 | std::lock_guard<simple_spinlock> l(state_lock_); |
925 | 4 | if (state_ == kClosing) { |
926 | 0 | LOG_WITH_PREFIX(INFO) |
927 | 0 | << "Error loading sys catalog; because shutdown is in progress. term " << term |
928 | 0 | << " status : " << status; |
929 | 0 | return; |
930 | 0 | } |
931 | 4 | } |
932 | 4 | auto new_term = consensus->ConsensusState(CONSENSUS_CONFIG_ACTIVE).current_term(); |
933 | 4 | if (new_term != term) { |
934 | 0 | LOG_WITH_PREFIX(INFO) |
935 | 0 | << "Error loading sys catalog; but that's OK as term was changed from " << term |
936 | 0 | << " to " << new_term << ": " << status; |
937 | 0 | return; |
938 | 0 | } |
939 | 4 | LOG_WITH_PREFIX(FATAL) << "Failed to load sys catalog: " << status; |
940 | 4 | } |
941 | 3.01k | } |
942 | | |
943 | 3.01k | { |
944 | 3.01k | std::lock_guard<simple_spinlock> l(state_lock_); |
945 | 3.01k | leader_ready_term_ = term; |
946 | 3.01k | LOG_WITH_PREFIX(INFO) << "Completed load of sys catalog in term " << term; |
947 | 3.01k | } |
948 | 3.01k | SysCatalogLoaded(term); |
949 | | // Once we have loaded the SysCatalog, reset and regenerate the yql partitions table in order to |
950 | | // regenerate entries for previous tables. |
951 | 3.01k | GetYqlPartitionsVtable().ResetAndRegenerateCache(); |
952 | 3.01k | } |
953 | | |
954 | 1.01k | CHECKED_STATUS CatalogManager::WaitForWorkerPoolTests(const MonoDelta& timeout) const { |
955 | 1.01k | if (!async_task_pool_->WaitFor(timeout)) { |
956 | 0 | return STATUS(TimedOut, "Worker Pool hasn't finished processing tasks"); |
957 | 0 | } |
958 | 1.01k | return Status::OK(); |
959 | 1.01k | } |
960 | | |
961 | 3.01k | Status CatalogManager::VisitSysCatalog(int64_t term) { |
962 | | // Block new catalog operations, and wait for existing operations to finish. |
963 | 3.01k | LOG_WITH_PREFIX_AND_FUNC(INFO) |
964 | 3.01k | << "Wait on leader_lock_ for any existing operations to finish. Term: " << term; |
965 | 3.01k | auto start = std::chrono::steady_clock::now(); |
966 | 3.01k | std::lock_guard<RWMutex> leader_lock_guard(leader_lock_); |
967 | 3.01k | auto finish = std::chrono::steady_clock::now(); |
968 | | |
969 | 3.01k | static const auto kLongLockAcquisitionLimit = RegularBuildVsSanitizers(100ms, 750ms); |
970 | 3.01k | if (finish > start + kLongLockAcquisitionLimit) { |
971 | 0 | LOG_WITH_PREFIX(WARNING) << "Long wait on leader_lock_: " << yb::ToString(finish - start); |
972 | 0 | } |
973 | | |
974 | 3.01k | LOG_WITH_PREFIX(INFO) |
975 | 3.01k | << __func__ << ": Acquire catalog manager lock_ before loading sys catalog."; |
976 | 3.01k | LockGuard lock(mutex_); |
977 | 3.01k | VLOG_WITH_FUNC0 (3) << "Acquired the catalog manager lock"0 ; |
978 | | |
979 | | // Abort any outstanding tasks. All TableInfos are orphaned below, so |
980 | | // it's important to end their tasks now; otherwise Shutdown() will |
981 | | // destroy master state used by these tasks. |
982 | 3.01k | std::vector<scoped_refptr<TableInfo>> tables; |
983 | 3.01k | AppendValuesFromMap(*table_ids_map_, &tables); |
984 | 3.01k | AbortAndWaitForAllTasks(tables); |
985 | | |
986 | | // Clear internal maps and run data loaders. |
987 | 3.01k | RETURN_NOT_OK(RunLoaders(term)); |
988 | | |
989 | | // Prepare various default system configurations. |
990 | 3.01k | RETURN_NOT_OK(PrepareDefaultSysConfig(term)); |
991 | | |
992 | 3.01k | if ((FLAGS_use_initial_sys_catalog_snapshot || FLAGS_enable_ysql3.01k ) && |
993 | 3.01k | !FLAGS_initial_sys_catalog_snapshot_path.empty()981 && |
994 | 3.01k | !FLAGS_create_initial_sys_catalog_snapshot767 ) { |
995 | 765 | if (!namespace_ids_map_.empty() || !system_tablets_.empty()746 ) { |
996 | 19 | LOG_WITH_PREFIX(INFO) |
997 | 19 | << "This is an existing cluster, not initializing from a sys catalog snapshot."; |
998 | 746 | } else { |
999 | 746 | Result<bool> dir_exists = |
1000 | 746 | Env::Default()->DoesDirectoryExist(FLAGS_initial_sys_catalog_snapshot_path); |
1001 | 746 | if (dir_exists.ok() && *dir_exists) { |
1002 | 746 | bool initdb_was_already_done = false; |
1003 | 746 | { |
1004 | 746 | auto l = ysql_catalog_config_->LockForRead(); |
1005 | 746 | initdb_was_already_done = l->pb.ysql_catalog_config().initdb_done(); |
1006 | 746 | } |
1007 | 746 | if (initdb_was_already_done) { |
1008 | 0 | LOG_WITH_PREFIX(INFO) |
1009 | 0 | << "initdb has been run before, no need to restore sys catalog from " |
1010 | 0 | << "the initial snapshot"; |
1011 | 746 | } else { |
1012 | 746 | LOG_WITH_PREFIX(INFO) << "Restoring snapshot in sys catalog"; |
1013 | 746 | Status restore_status = RestoreInitialSysCatalogSnapshot( |
1014 | 746 | FLAGS_initial_sys_catalog_snapshot_path, |
1015 | 746 | sys_catalog_->tablet_peer().get(), |
1016 | 746 | term); |
1017 | 746 | if (!restore_status.ok()) { |
1018 | 0 | LOG_WITH_PREFIX(ERROR) << "Failed restoring snapshot in sys catalog"; |
1019 | 0 | return restore_status; |
1020 | 0 | } |
1021 | | |
1022 | 746 | LOG_WITH_PREFIX(INFO) << "Re-initializing cluster config"; |
1023 | 746 | { |
1024 | 746 | std::lock_guard<decltype(config_mutex_)> lock(config_mutex_); |
1025 | 746 | cluster_config_.reset(); |
1026 | 746 | } |
1027 | 746 | RETURN_NOT_OK(PrepareDefaultClusterConfig(term)); |
1028 | | |
1029 | 746 | LOG_WITH_PREFIX(INFO) << "Restoring snapshot completed, considering initdb finished"; |
1030 | 746 | RETURN_NOT_OK(InitDbFinished(Status::OK(), term)); |
1031 | 746 | RETURN_NOT_OK(RunLoaders(term)); |
1032 | 746 | } |
1033 | 746 | } else { |
1034 | 0 | LOG_WITH_PREFIX(WARNING) |
1035 | 0 | << "Initial sys catalog snapshot directory does not exist: " |
1036 | 0 | << FLAGS_initial_sys_catalog_snapshot_path |
1037 | 0 | << (dir_exists.ok() ? "" : ", status: " + dir_exists.status().ToString()); |
1038 | 0 | } |
1039 | 746 | } |
1040 | 765 | } |
1041 | | |
1042 | | // Create the system namespaces (created only if they don't already exist). |
1043 | 3.01k | RETURN_NOT_OK(PrepareDefaultNamespaces(term)); |
1044 | | |
1045 | | // Create the system tables (created only if they don't already exist). |
1046 | 3.01k | RETURN_NOT_OK(PrepareSystemTables(term)); |
1047 | | |
1048 | | // Create the default cassandra (created only if they don't already exist). |
1049 | 3.01k | RETURN_NOT_OK(permissions_manager_->PrepareDefaultRoles(term)); |
1050 | | |
1051 | | // If this is the first time we start up, we have no config information as default. We write an |
1052 | | // empty version 0. |
1053 | 3.00k | RETURN_NOT_OK(PrepareDefaultClusterConfig(term)); |
1054 | | |
1055 | 3.00k | permissions_manager_->BuildRecursiveRoles(); |
1056 | | |
1057 | 3.00k | if (FLAGS_enable_ysql) { |
1058 | | // Number of TS to wait for before creating the txn table. |
1059 | 977 | auto wait_ts_count = std::max(FLAGS_txn_table_wait_min_ts_count, FLAGS_replication_factor); |
1060 | | |
1061 | 977 | LOG_WITH_PREFIX(INFO) |
1062 | 977 | << "YSQL is enabled, will create the transaction status table when " |
1063 | 977 | << wait_ts_count << " tablet servers are online"; |
1064 | 977 | master_->ts_manager()->SetTSCountCallback(wait_ts_count, [this, wait_ts_count] { |
1065 | 901 | LOG_WITH_PREFIX(INFO) |
1066 | 901 | << wait_ts_count |
1067 | 901 | << " tablet servers registered, creating the transaction status table"; |
1068 | | // Retry table creation until it succeedes. It might fail initially because placement UUID |
1069 | | // of live replicas is set through an RPC from YugaWare, and we won't be able to calculate |
1070 | | // the number of primary (non-read-replica) tablet servers until that happens. |
1071 | 923 | while (true) { |
1072 | 903 | const auto s = CreateGlobalTransactionStatusTableIfNeeded(/* rpc */ nullptr); |
1073 | 903 | if (s.ok()) { |
1074 | 881 | break; |
1075 | 881 | } |
1076 | 22 | LOG_WITH_PREFIX(WARNING) << "Failed creating transaction status table, waiting: " << s; |
1077 | 22 | if (s.IsShutdownInProgress()) { |
1078 | 0 | return; |
1079 | 0 | } |
1080 | 22 | auto role = Role(); |
1081 | 22 | if (role != PeerRole::LEADER) { |
1082 | 0 | LOG_WITH_PREFIX(WARNING) |
1083 | 0 | << "Cancel creating transaction because of role: " << PeerRole_Name(role); |
1084 | 0 | return; |
1085 | 0 | } |
1086 | 22 | SleepFor(MonoDelta::FromSeconds(1)); |
1087 | 22 | } |
1088 | 901 | LOG_WITH_PREFIX(INFO) << "Finished creating transaction status table asynchronously"; |
1089 | 901 | }); |
1090 | 977 | } |
1091 | | |
1092 | 3.00k | if (!StartRunningInitDbIfNeeded(term)) { |
1093 | | // If we are not running initdb, this is an existing cluster, and we need to check whether we |
1094 | | // need to do a one-time migration to make YSQL system catalog tables transactional. |
1095 | 3.00k | RETURN_NOT_OK(MakeYsqlSysCatalogTablesTransactional( |
1096 | 3.00k | table_ids_map_.CheckOut().get_ptr(), sys_catalog_.get(), ysql_catalog_config_.get(), term)); |
1097 | 3.00k | } |
1098 | | |
1099 | 3.00k | return Status::OK(); |
1100 | 3.00k | } |
1101 | | |
// Runs one sys-catalog loading pass for a single entity kind.
//
// Template parameter Loader: visitor type constructed from (this, term) and handed to
//   sys_catalog_->Visit().
// title: human-readable name of what is being loaded; used only in the log line and in the
//   error prefix.
// term: passed through to the Loader constructor.
// Returns Status::OK() when the visit succeeds; otherwise the Visit() error with
// "Failed while visiting <title> in sys catalog" prepended.
1102 | | template <class Loader> |
1103 | 30.0k | Status CatalogManager::Load(const std::string& title, const int64_t term) { |
1104 | 30.0k | LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory."; |
// The loader only needs to live for the duration of the Visit() call below.
1105 | 30.0k | std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term); |
1106 | 30.0k | RETURN_NOT_OK_PREPEND( |
1107 | 30.0k | sys_catalog_->Visit(loader.get()), |
1108 | 30.0k | "Failed while visiting " + title + " in sys catalog"); |
1109 | 30.0k | return Status::OK(); |
// NOTE(review): the row below holds the closing brace of Load() and then what appears to be the
// start of the coverage tool's per-instantiation breakdown (RoleLoader); it is listing data,
// not additional source.
1110 | 30.0k | } yb::Status yb::master::CatalogManager::Load<yb::master::RoleLoader>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1103 | 3.75k | Status CatalogManager::Load(const std::string& title, const int64_t term) { | 1104 | 3.75k | LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory."; | 1105 | 3.75k | std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term); | 1106 | 3.75k | RETURN_NOT_OK_PREPEND( | 1107 | 3.75k | sys_catalog_->Visit(loader.get()), | 1108 | 3.75k | "Failed while visiting " + title + " in sys catalog"); | 1109 | 3.75k | return Status::OK(); | 1110 | 3.75k | } |
yb::Status yb::master::CatalogManager::Load<yb::master::SysConfigLoader>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1103 | 3.75k | Status CatalogManager::Load(const std::string& title, const int64_t term) { | 1104 | 3.75k | LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory."; | 1105 | 3.75k | std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term); | 1106 | 3.75k | RETURN_NOT_OK_PREPEND( | 1107 | 3.75k | sys_catalog_->Visit(loader.get()), | 1108 | 3.75k | "Failed while visiting " + title + " in sys catalog"); | 1109 | 3.75k | return Status::OK(); | 1110 | 3.75k | } |
yb::Status yb::master::CatalogManager::Load<yb::master::TableLoader>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1103 | 3.75k | Status CatalogManager::Load(const std::string& title, const int64_t term) { | 1104 | 3.75k | LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory."; | 1105 | 3.75k | std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term); | 1106 | 3.75k | RETURN_NOT_OK_PREPEND( | 1107 | 3.75k | sys_catalog_->Visit(loader.get()), | 1108 | 3.75k | "Failed while visiting " + title + " in sys catalog"); | 1109 | 3.75k | return Status::OK(); | 1110 | 3.75k | } |
yb::Status yb::master::CatalogManager::Load<yb::master::TabletLoader>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1103 | 3.75k | Status CatalogManager::Load(const std::string& title, const int64_t term) { | 1104 | 3.75k | LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory."; | 1105 | 3.75k | std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term); | 1106 | 3.75k | RETURN_NOT_OK_PREPEND( | 1107 | 3.75k | sys_catalog_->Visit(loader.get()), | 1108 | 3.75k | "Failed while visiting " + title + " in sys catalog"); | 1109 | 3.75k | return Status::OK(); | 1110 | 3.75k | } |
yb::Status yb::master::CatalogManager::Load<yb::master::NamespaceLoader>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1103 | 3.75k | Status CatalogManager::Load(const std::string& title, const int64_t term) { | 1104 | 3.75k | LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory."; | 1105 | 3.75k | std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term); | 1106 | 3.75k | RETURN_NOT_OK_PREPEND( | 1107 | 3.75k | sys_catalog_->Visit(loader.get()), | 1108 | 3.75k | "Failed while visiting " + title + " in sys catalog"); | 1109 | 3.75k | return Status::OK(); | 1110 | 3.75k | } |
yb::Status yb::master::CatalogManager::Load<yb::master::UDTypeLoader>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1103 | 3.75k | Status CatalogManager::Load(const std::string& title, const int64_t term) { | 1104 | 3.75k | LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory."; | 1105 | 3.75k | std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term); | 1106 | 3.75k | RETURN_NOT_OK_PREPEND( | 1107 | 3.75k | sys_catalog_->Visit(loader.get()), | 1108 | 3.75k | "Failed while visiting " + title + " in sys catalog"); | 1109 | 3.75k | return Status::OK(); | 1110 | 3.75k | } |
yb::Status yb::master::CatalogManager::Load<yb::master::ClusterConfigLoader>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1103 | 3.75k | Status CatalogManager::Load(const std::string& title, const int64_t term) { | 1104 | 3.75k | LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory."; | 1105 | 3.75k | std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term); | 1106 | 3.75k | RETURN_NOT_OK_PREPEND( | 1107 | 3.75k | sys_catalog_->Visit(loader.get()), | 1108 | 3.75k | "Failed while visiting " + title + " in sys catalog"); | 1109 | 3.75k | return Status::OK(); | 1110 | 3.75k | } |
yb::Status yb::master::CatalogManager::Load<yb::master::RedisConfigLoader>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1103 | 3.75k | Status CatalogManager::Load(const std::string& title, const int64_t term) { | 1104 | 3.75k | LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory."; | 1105 | 3.75k | std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term); | 1106 | 3.75k | RETURN_NOT_OK_PREPEND( | 1107 | 3.75k | sys_catalog_->Visit(loader.get()), | 1108 | 3.75k | "Failed while visiting " + title + " in sys catalog"); | 1109 | 3.75k | return Status::OK(); | 1110 | 3.75k | } |
|
1111 | | |
// Drops the in-memory catalog state and rebuilds it from the sys catalog.
//
// Steps, in order:
//   1. Clear the table, tablet, namespace, user-defined type and Redis config maps and reset
//      the cluster / YSQL catalog / transaction tables config pointers.
//   2. Reset the task and job trackers and mark every known tablet server descriptor as not
//      having a tablet report.
//   3. Run the loaders (roles, sys config, tables, tablets, namespaces, user-defined types,
//      cluster configuration, Redis config).
//   4. If no transaction tables config exists after loading, initialize one.
//
// term: forwarded to every loader and to InitializeTransactionTablesConfig().
// Returns the first non-OK status from any loader or from the initialization step, otherwise
// Status::OK().
// NOTE(review): no catalog-wide lock is taken inside this function; presumably the caller
// already holds it - confirm against the call site.
1112 | 3.75k | Status CatalogManager::RunLoaders(int64_t term) { |
1113 | | // Clear the table and tablet state. |
1114 | 3.75k | table_names_map_.clear(); |
1115 | 3.75k | transaction_table_ids_set_.clear(); |
1116 | 3.75k | auto table_ids_map_checkout = table_ids_map_.CheckOut(); |
1117 | 3.75k | table_ids_map_checkout->clear(); |
1118 | | |
1119 | 3.75k | auto tablet_map_checkout = tablet_map_.CheckOut(); |
1120 | 3.75k | tablet_map_checkout->clear(); |
1121 | | |
1122 | | // Clear the namespace mappings. |
1123 | 3.75k | namespace_ids_map_.clear(); |
1124 | 3.75k | namespace_names_mapper_.clear(); |
1125 | | |
1126 | | // Clear the type mappings. |
1127 | 3.75k | udtype_ids_map_.clear(); |
1128 | 3.75k | udtype_names_map_.clear(); |
1129 | | |
1130 | | // Clear the current cluster config. |
// config_mutex_ is held only for the reset itself (scoped block).
1131 | 3.75k | { |
1132 | 3.75k | std::lock_guard<decltype(config_mutex_)> lock(config_mutex_); |
1133 | 3.75k | cluster_config_.reset(); |
1134 | 3.75k | } |
1135 | | |
1136 | | // Clear redis config mapping. |
1137 | 3.75k | redis_config_map_.clear(); |
1138 | | |
1139 | | // Clear ysql catalog config. |
1140 | 3.75k | ysql_catalog_config_.reset(); |
1141 | | |
1142 | | // Clear transaction tables config. |
1143 | 3.75k | transaction_tables_config_.reset(); |
1144 | | |
1145 | | // Clear recent tasks. |
1146 | 3.75k | tasks_tracker_->Reset(); |
1147 | | |
1148 | | // Clear recent jobs. |
1149 | 3.75k | jobs_tracker_->Reset(); |
1150 | | |
// Flag every registered tablet server descriptor as having no tablet report.
1151 | 3.75k | std::vector<std::shared_ptr<TSDescriptor>> descs; |
1152 | 3.75k | master_->ts_manager()->GetAllDescriptors(&descs); |
1153 | 3.75k | for (const auto& ts_desc : descs) { |
1154 | 52 | ts_desc->set_has_tablet_report(false); |
1155 | 52 | } |
1156 | | |
// Roles and sys config are loaded while holding the permissions manager mutex; the remaining
// loaders below run after it has been released.
1157 | 3.75k | { |
1158 | 3.75k | LockGuard lock(permissions_manager()->mutex()); |
1159 | | |
1160 | | // Clear the roles mapping. |
1161 | 3.75k | permissions_manager()->ClearRolesUnlocked(); |
1162 | 3.75k | RETURN_NOT_OK(Load<RoleLoader>("roles", term)); |
1163 | 3.75k | RETURN_NOT_OK(Load<SysConfigLoader>("sys config", term)); |
1164 | 3.75k | } |
1165 | | // Clear the hidden tablets vector. |
1166 | 3.75k | hidden_tablets_.clear(); |
1167 | | |
1168 | 3.75k | RETURN_NOT_OK(Load<TableLoader>("tables", term)); |
1169 | 3.75k | RETURN_NOT_OK(Load<TabletLoader>("tablets", term)); |
1170 | 3.75k | RETURN_NOT_OK(Load<NamespaceLoader>("namespaces", term)); |
1171 | 3.75k | RETURN_NOT_OK(Load<UDTypeLoader>("user-defined types", term)); |
1172 | 3.75k | RETURN_NOT_OK(Load<ClusterConfigLoader>("cluster configuration", term)); |
1173 | 3.75k | RETURN_NOT_OK(Load<RedisConfigLoader>("Redis config", term)); |
1174 | | |
// transaction_tables_config_ was reset above; if it is still null after all loaders ran
// (presumably one of them sets it when a persisted entry exists - verify), create it now.
1175 | 3.75k | if (!transaction_tables_config_) { |
1176 | 2.91k | RETURN_NOT_OK(InitializeTransactionTablesConfig(term)); |
1177 | 2.91k | } |
1178 | | |
1179 | 3.75k | return Status::OK(); |
1180 | 3.75k | } |
1181 | | |
// Verifies that the resource named in a grant/revoke permission request exists.
//
// Only TABLE and KEYSPACE resource types are checked; any other type returns Status::OK()
// without a lookup.
//   - KEYSPACE: the namespace must be found, otherwise resp->error is filled with
//     NAMESPACE_NOT_FOUND.
//   - TABLE: the namespace must be found and (namespace id, resource_name) must be present in
//     table_names_map_, otherwise resp->error is filled with OBJECT_NOT_FOUND.
//
// req: request carrying resource_type(), namespace_() and resource_name().
// resp: response whose error field is populated through SetupError() on failure.
// Returns whatever SetupError() returns on a missing resource, otherwise Status::OK().
1182 | | Status CatalogManager::CheckResource( |
1183 | | const GrantRevokePermissionRequestPB* req, |
1184 | 721 | GrantRevokePermissionResponsePB* resp) { |
1185 | 721 | scoped_refptr<TableInfo> table; |
1186 | | |
1187 | | // Checking if resources exist. |
// NOTE(review): the "522" fused onto ResourceType::KEYSPACE two rows below looks like a region
// hit count injected by the coverage listing, not part of the source expression (the same
// enumerator appears without it further down).
1188 | 721 | if (req->resource_type() == ResourceType::TABLE || |
1189 | 721 | req->resource_type() == ResourceType::KEYSPACE522 ) { |
1190 | | // We can't match Apache Cassandra's error because when a namespace is not provided, the error |
1191 | | // is detected by the semantic analysis in PTQualifiedName::AnalyzeName. |
1192 | 435 | DCHECK(req->has_namespace_()); |
1193 | 435 | const auto& namespace_info = req->namespace_(); |
1194 | 435 | auto ns = FindNamespace(namespace_info); |
1195 | | |
1196 | 435 | if (req->resource_type() == ResourceType::KEYSPACE) { |
1197 | 236 | if (!ns.ok()) { |
1198 | | // Matches Apache Cassandra's error. |
1199 | 0 | Status s = STATUS_SUBSTITUTE( |
1200 | 0 | NotFound, "Resource <keyspace $0> doesn't exist", namespace_info.name()); |
1201 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s); |
1202 | 0 | } |
1203 | 236 | } else { |
// TABLE case: a failed namespace lookup leaves `table` null, so it is reported through the
// same OBJECT_NOT_FOUND path as a missing table.
1204 | 199 | if (ns.ok()) { |
// The shared lock on mutex_ covers only the table_names_map_ lookup.
1205 | 199 | CatalogManager::SharedLock l(mutex_); |
1206 | 199 | table = FindPtrOrNull(table_names_map_, {(**ns).id(), req->resource_name()}); |
1207 | 199 | } |
1208 | 199 | if (table == nullptr) { |
1209 | | // Matches Apache Cassandra's error. |
1210 | 0 | Status s = STATUS_SUBSTITUTE( |
1211 | 0 | NotFound, "Resource <object '$0.$1'> doesn't exist", |
1212 | 0 | namespace_info.name(), req->resource_name()); |
1213 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); |
1214 | 0 | } |
1215 | 199 | } |
1216 | 435 | } |
1217 | 721 | return Status::OK(); |
1218 | 721 | } |
1219 | | |
// Creates and persists a default cluster config if none is currently loaded.
//
// Holds config_mutex_ for the whole call. When cluster_config_ is already set, the function
// logs and returns Status::OK() without touching it. Otherwise it builds a version-0
// SysClusterConfigEntryPB, assigns a cluster UUID (from --cluster_uuid when that flag is
// non-empty, else randomly generated), upserts it into the sys catalog and commits the
// in-memory copy.
//
// term: passed to sys_catalog_->Upsert().
// Returns a non-OK status if the flag value does not parse as a UUID or if the upsert fails.
1220 | 3.74k | Status CatalogManager::PrepareDefaultClusterConfig(int64_t term) { |
1221 | 3.74k | std::lock_guard<decltype(config_mutex_)> lock(config_mutex_); |
1222 | 3.74k | if (cluster_config_) { |
1223 | 845 | LOG_WITH_PREFIX(INFO) |
1224 | 845 | << "Cluster configuration has already been set up, skipping re-initialization."; |
1225 | 845 | return Status::OK(); |
1226 | 845 | } |
1227 | | |
1228 | | // Create default. |
1229 | 2.90k | SysClusterConfigEntryPB config; |
1230 | 2.90k | config.set_version(0); |
1231 | | |
// cluster_uuid_source is used only to make the log line below say where the UUID came from.
1232 | 2.90k | std::string cluster_uuid_source; |
1233 | 2.90k | if (!FLAGS_cluster_uuid.empty()) { |
// Uuid::FromString is called only for validation here; the flag string itself is stored.
1234 | 1 | RETURN_NOT_OK(Uuid::FromString(FLAGS_cluster_uuid)); |
1235 | 0 | config.set_cluster_uuid(FLAGS_cluster_uuid); |
1236 | 0 | cluster_uuid_source = "from the --cluster_uuid flag"; |
1237 | 2.90k | } else { |
1238 | 2.90k | auto uuid = Uuid::Generate(); |
1239 | 2.90k | config.set_cluster_uuid(uuid.ToString()); |
1240 | 2.90k | cluster_uuid_source = "(randomly generated)"; |
1241 | 2.90k | } |
1242 | 2.90k | LOG_WITH_PREFIX(INFO) |
1243 | 2.90k | << "Setting cluster UUID to " << config.cluster_uuid() << " " << cluster_uuid_source; |
1244 | | |
1245 | | // Create in memory object. |
1246 | 2.90k | cluster_config_ = std::make_shared<ClusterConfigInfo>(); |
1247 | | |
1248 | | // Prepare write. |
1249 | 2.90k | auto l = cluster_config_->LockForWrite(); |
1250 | 2.90k | l.mutable_data()->pb = std::move(config); |
1251 | | |
1252 | | // Write to sys_catalog and in memory. |
// NOTE(review): if the upsert fails, the function returns before l.Commit() while
// cluster_config_ has already been assigned above - confirm that callers tolerate this.
1253 | 2.90k | RETURN_NOT_OK(sys_catalog_->Upsert(term, cluster_config_.get())); |
1254 | 2.90k | l.Commit(); |
1255 | | |
1256 | 2.90k | return Status::OK(); |
1257 | 2.90k | } |
1258 | | |
// Returns the known addresses of the peers in the current consensus config.
//
// The result has one entry per peer that has at least one address. Each entry is a
// comma-separated list of that peer's private addresses followed by its broadcast addresses,
// each rendered through HostPort::FromPB(...).ToString().
// If GetCurrentConfig() fails, a warning is logged and an empty vector is returned.
1259 | 29.4k | std::vector<std::string> CatalogManager::GetMasterAddresses() { |
1260 | 29.4k | std::vector<std::string> result; |
1261 | 29.4k | consensus::ConsensusStatePB state; |
1262 | 29.4k | auto status = GetCurrentConfig(&state); |
1263 | 29.4k | if (!status.ok()) { |
1264 | 17.6k | LOG(WARNING) << "Failed to get current config: " << status; |
1265 | 17.6k | return result; |
1266 | 17.6k | } |
// NOTE(review): the "11.7k" between "))" and "{" on the next row looks like a region hit count
// injected by the coverage listing, not part of the source statement.
1267 | 32.5k | for (const auto& peer : state.config().peers())11.7k { |
1268 | 32.5k | std::vector<std::string> peer_addresses; |
// Private addresses are appended first, then broadcast addresses.
1269 | 65.0k | for (const auto& list : {peer.last_known_private_addr(), peer.last_known_broadcast_addr()}) { |
1270 | 65.0k | for (const auto& entry : list) { |
1271 | 32.8k | peer_addresses.push_back(HostPort::FromPB(entry).ToString()); |
1272 | 32.8k | } |
1273 | 65.0k | } |
// Peers with no known address contribute nothing to the result.
1274 | 32.5k | if (!peer_addresses.empty()) { |
1275 | 32.5k | result.push_back(JoinStrings(peer_addresses, ",")); |
1276 | 32.5k | } |
1277 | 32.5k | } |
1278 | 11.7k | return result; |
1279 | 29.4k | } |
1280 | | |
// Ensures the default sys config entries exist.
//
// 1. Prepares the default security config via the permissions manager, under its mutex.
// 2. If no YSQL catalog config is loaded, creates a version-0 entry, upserts it into the sys
//    catalog and commits the in-memory copy.
// 3. If no transaction tables config is loaded, initializes one.
//
// term: forwarded to the permissions manager, sys_catalog_->Upsert() and
//   InitializeTransactionTablesConfig().
// Returns the first non-OK status encountered, otherwise Status::OK().
1281 | 3.01k | Status CatalogManager::PrepareDefaultSysConfig(int64_t term) { |
// The permissions manager mutex is held only for the security config step.
1282 | 3.01k | { |
1283 | 3.01k | LockGuard lock(permissions_manager()->mutex()); |
1284 | 3.01k | RETURN_NOT_OK(permissions_manager()->PrepareDefaultSecurityConfigUnlocked(term)); |
1285 | 3.01k | } |
1286 | | |
1287 | 3.01k | if (!ysql_catalog_config_) { |
1288 | 2.90k | SysYSQLCatalogConfigEntryPB ysql_catalog_config; |
1289 | 2.90k | ysql_catalog_config.set_version(0); |
1290 | | |
1291 | | // Create in memory objects. |
1292 | 2.90k | ysql_catalog_config_ = new SysConfigInfo(kYsqlCatalogConfigType); |
1293 | | |
1294 | | // Prepare write. |
1295 | 2.90k | auto l = ysql_catalog_config_->LockForWrite(); |
1296 | 2.90k | *l.mutable_data()->pb.mutable_ysql_catalog_config() = std::move(ysql_catalog_config); |
1297 | | |
1298 | | // Write to sys_catalog and in memory. |
1299 | 2.90k | RETURN_NOT_OK(sys_catalog_->Upsert(term, ysql_catalog_config_)); |
1300 | 2.90k | l.Commit(); |
1301 | 2.90k | } |
1302 | | |
// Created here only if it is still missing at this point.
1303 | 3.01k | if (!transaction_tables_config_) { |
1304 | 0 | RETURN_NOT_OK(InitializeTransactionTablesConfig(term)); |
1305 | 0 | } |
1306 | | |
1307 | 3.01k | return Status::OK(); |
1308 | 3.01k | } |
1309 | | |
// Starts initdb on a background task when ShouldAutoRunInitDb() says it is needed.
//
// term: captured by the background task and passed to InitDbFinished().
// Returns false when initdb does not need to run (nothing is started); returns true after the
// asynchronous task has been launched and stored in initdb_future_. The task's own Status is
// only available through that future.
1310 | 3.00k | bool CatalogManager::StartRunningInitDbIfNeeded(int64_t term) { |
1311 | 3.00k | if (!ShouldAutoRunInitDb(ysql_catalog_config_.get(), pg_proc_exists_)) { |
1312 | 3.00k | return false; |
1313 | 3.00k | } |
1314 | | |
1315 | 2 | string master_addresses_str = MasterAddressesToString( |
1316 | 2 | *master_->opts().GetMasterAddresses()); |
1317 | | |
// The lambda captures the address string and term by value, and `this` by pointer.
// NOTE(review): this relies on the CatalogManager outliving the task - confirm shutdown waits
// on initdb_future_.
1318 | 2 | initdb_future_ = std::async(std::launch::async, [this, master_addresses_str, term] { |
1319 | 2 | if (FLAGS_create_initial_sys_catalog_snapshot) { |
1320 | 2 | initial_snapshot_writer_.emplace(); |
1321 | 2 | } |
1322 | | |
1323 | 2 | Status status = PgWrapper::InitDbForYSQL( |
1324 | 2 | master_addresses_str, "/tmp", master_->GetSharedMemoryFd()); |
1325 | | |
// The snapshot is written only when the flag is set and initdb itself succeeded; a snapshot
// failure replaces the (OK) initdb status.
1326 | 2 | if (FLAGS_create_initial_sys_catalog_snapshot && status.ok()) { |
1327 | 2 | Status write_snapshot_status = initial_snapshot_writer_->WriteSnapshot( |
1328 | 2 | sys_catalog_->tablet_peer()->tablet(), |
1329 | 2 | FLAGS_initial_sys_catalog_snapshot_path); |
1330 | 2 | if (!write_snapshot_status.ok()) { |
1331 | 0 | status = write_snapshot_status; |
1332 | 0 | } |
1333 | 2 | } |
// InitDbFinished() is called with the outcome whether or not the steps above succeeded. Its own
// failure is reported as the task result only if nothing failed earlier, so the first error wins.
1334 | 2 | Status finish_status = InitDbFinished(status, term); |
1335 | 2 | if (!finish_status.ok()) { |
1336 | 0 | if (status.ok()) { |
1337 | 0 | status = finish_status; |
1338 | 0 | } |
1339 | 0 | LOG_WITH_PREFIX(WARNING) |
1340 | 0 | << "Failed to set initdb as finished in sys catalog: " << finish_status; |
1341 | 0 | } |
1342 | 2 | return status; |
1343 | 2 | }); |
1344 | 2 | return true; |
1345 | 3.00k | } |
1346 | | |
// Prepares the three built-in CQL namespaces identified by the kSystem*, kSystemSchema* and
// kSystemAuth* name/id constants, by calling PrepareNamespace() for each with
// YQL_DATABASE_CQL.
//
// term: forwarded to PrepareNamespace().
// Returns the first non-OK status from PrepareNamespace(), otherwise Status::OK().
1347 | 3.01k | Status CatalogManager::PrepareDefaultNamespaces(int64_t term) { |
1348 | 3.01k | RETURN_NOT_OK(PrepareNamespace( |
1349 | 3.01k | YQL_DATABASE_CQL, kSystemNamespaceName, kSystemNamespaceId, term)); |
1350 | 3.01k | RETURN_NOT_OK(PrepareNamespace( |
1351 | 3.01k | YQL_DATABASE_CQL, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term)); |
1352 | 3.01k | RETURN_NOT_OK(PrepareNamespace( |
1353 | 3.01k | YQL_DATABASE_CQL, kSystemAuthNamespaceName, kSystemAuthNamespaceId, term)); |
1354 | 3.01k | return Status::OK(); |
1355 | 3.01k | } |
1356 | | |
// Prepares the sys catalog table and every YQL virtual system table.
//
// After PrepareSysCatalogTable(), one PrepareSystemTableTemplate<VTable>() call is made per
// virtual table, grouped by the system, system schema and system auth namespaces. The number
// of entries in system_tablets_ is then checked against kNumSystemTables (DFATAL log on
// mismatch), and the system.partitions vtable is cached in system_partitions_tablet_.
//
// term: forwarded to every Prepare* call.
// Returns the first non-OK status encountered, otherwise Status::OK().
1357 | 3.00k | Status CatalogManager::PrepareSystemTables(int64_t term) { |
1358 | | // Prepare sys catalog table. |
1359 | 3.00k | RETURN_NOT_OK(PrepareSysCatalogTable(term)); |
1360 | | |
1361 | | // Create the required system tables here. |
// The extra parentheses around each call keep the commas of the template call inside a single
// macro argument.
1362 | 3.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<PeersVTable>( |
1363 | 3.00k | kSystemPeersTableName, kSystemNamespaceName, kSystemNamespaceId, term))); |
1364 | 3.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<LocalVTable>( |
1365 | 3.00k | kSystemLocalTableName, kSystemNamespaceName, kSystemNamespaceId, term))); |
1366 | 3.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLKeyspacesVTable>( |
1367 | 3.00k | kSystemSchemaKeyspacesTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, |
1368 | 3.00k | term))); |
1369 | 3.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLTablesVTable>( |
1370 | 3.00k | kSystemSchemaTablesTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term))); |
1371 | 3.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLColumnsVTable>( |
1372 | 3.00k | kSystemSchemaColumnsTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term))); |
1373 | 3.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLSizeEstimatesVTable>( |
1374 | 3.00k | kSystemSizeEstimatesTableName, kSystemNamespaceName, kSystemNamespaceId, term))); |
1375 | | |
1376 | | // Empty tables. |
1377 | 3.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLAggregatesVTable>( |
1378 | 3.00k | kSystemSchemaAggregatesTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, |
1379 | 3.00k | term))); |
1380 | 3.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLFunctionsVTable>( |
1381 | 3.00k | kSystemSchemaFunctionsTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, |
1382 | 3.00k | term))); |
1383 | 3.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLIndexesVTable>( |
1384 | 3.00k | kSystemSchemaIndexesTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term))); |
1385 | 3.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLTriggersVTable>( |
1386 | 3.00k | kSystemSchemaTriggersTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term))); |
1387 | 3.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLViewsVTable>( |
1388 | 3.00k | kSystemSchemaViewsTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term))); |
1389 | 3.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<QLTypesVTable>( |
1390 | 3.00k | kSystemSchemaTypesTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term))); |
1391 | 3.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLPartitionsVTable>( |
1392 | 3.00k | kSystemPartitionsTableName, kSystemNamespaceName, kSystemNamespaceId, term))); |
1393 | | |
1394 | | // System auth tables. |
1395 | 3.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLAuthRolesVTable>( |
1396 | 3.00k | kSystemAuthRolesTableName, kSystemAuthNamespaceName, kSystemAuthNamespaceId, term))); |
1397 | 3.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLAuthRolePermissionsVTable>( |
1398 | 3.00k | kSystemAuthRolePermissionsTableName, kSystemAuthNamespaceName, kSystemAuthNamespaceId, |
1399 | 3.00k | term))); |
1400 | 3.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLAuthResourceRolePermissionsIndexVTable>( |
1401 | 3.00k | kSystemAuthResourceRolePermissionsIndexTableName, kSystemAuthNamespaceName, |
1402 | 3.00k | kSystemAuthNamespaceId, term))); |
1403 | | |
1404 | | // Ensure kNumSystemTables is in-sync with the system tables created. |
// A mismatch is only logged at DFATAL severity; the function still continues afterwards.
1405 | 3.00k | LOG_IF(DFATAL, system_tablets_.size() != kNumSystemTables) |
1406 | 1 | << "kNumSystemTables is " << kNumSystemTables << " but " << system_tablets_.size() |
1407 | 1 | << " tables were created"; |
1408 | | |
1409 | | // Cache the system.partitions tablet so we can access it in RebuildYQLSystemPartitions. |
1410 | 3.00k | RETURN_NOT_OK(GetYQLPartitionsVTable(&system_partitions_tablet_)); |
1411 | | |
1412 | 3.00k | return Status::OK(); |
1413 | 3.00k | } |
1414 | | |
// Ensures the sys catalog table and its tablet are present in the in-memory maps and persisted
// in the sys catalog, then registers the sys catalog tablet in system_tablets_.
//
// Table: when kSysCatalogTableId is missing from table_ids_map_, a TableInfo is created
//   (state RUNNING, namespace kSystemSchemaNamespaceId, YQL table type, version 0, schema taken
//   from sys_catalog_), inserted into table_ids_map_ and table_names_map_, marked as a system
//   table, upserted and committed.
// Tablet: when kSysCatalogTabletId is missing from tablet_map_, a TabletInfo is created (state
//   RUNNING, a single partition derived from the table's partition schema), attached to the
//   table, inserted into tablet_map_, upserted and committed.
//
// term: passed to sys_catalog_->Upsert().
// Returns a non-OK status if partition schema parsing, partition creation or an upsert fails.
1415 | 3.00k | Status CatalogManager::PrepareSysCatalogTable(int64_t term) { |
1416 | | // Prepare sys catalog table info. |
1417 | 3.00k | auto sys_catalog_table_iter = table_ids_map_->find(kSysCatalogTableId); |
1418 | 3.00k | if (sys_catalog_table_iter == table_ids_map_->end()) { |
1419 | 2.16k | scoped_refptr<TableInfo> table = NewTableInfo(kSysCatalogTableId); |
1420 | 2.16k | table->mutable_metadata()->StartMutation(); |
1421 | 2.16k | SysTablesEntryPB& metadata = table->mutable_metadata()->mutable_dirty()->pb; |
1422 | 2.16k | metadata.set_state(SysTablesEntryPB::RUNNING); |
1423 | 2.16k | metadata.set_namespace_id(kSystemSchemaNamespaceId); |
1424 | 2.16k | metadata.set_name(kSysCatalogTableName); |
1425 | 2.16k | metadata.set_table_type(TableType::YQL_TABLE_TYPE); |
1426 | 2.16k | SchemaToPB(*sys_catalog_->schema_, metadata.mutable_schema()); |
1427 | 2.16k | metadata.set_version(0); |
1428 | | |
// Mutation goes through CheckOut(); the iterator is re-pointed at the newly inserted entry so
// the tablet section below can use it.
// NOTE(review): this assumes the iterator obtained from the checkout stays valid for the later
// dereference - confirm against the map wrapper's semantics.
1429 | 2.16k | auto table_ids_map_checkout = table_ids_map_.CheckOut(); |
1430 | 2.16k | sys_catalog_table_iter = table_ids_map_checkout->emplace(table->id(), table).first; |
1431 | 2.16k | table_names_map_[{kSystemSchemaNamespaceId, kSysCatalogTableName}] = table; |
1432 | 2.16k | table->set_is_system(); |
1433 | | |
1434 | 2.16k | RETURN_NOT_OK(sys_catalog_->Upsert(term, table)); |
1435 | 2.16k | table->mutable_metadata()->CommitMutation(); |
1436 | 2.16k | } |
1437 | | |
1438 | | // Prepare sys catalog tablet info. |
1439 | 3.00k | if (tablet_map_->count(kSysCatalogTabletId) == 0) { |
1440 | 2.16k | scoped_refptr<TableInfo> table = sys_catalog_table_iter->second; |
1441 | 2.16k | scoped_refptr<TabletInfo> tablet(new TabletInfo(table, kSysCatalogTabletId)); |
1442 | 2.16k | tablet->mutable_metadata()->StartMutation(); |
1443 | 2.16k | SysTabletsEntryPB& metadata = tablet->mutable_metadata()->mutable_dirty()->pb; |
1444 | 2.16k | metadata.set_state(SysTabletsEntryPB::RUNNING); |
1445 | | |
// The table's partition schema is read under a read lock, and exactly one partition is created
// from it.
1446 | 2.16k | auto l = table->LockForRead(); |
1447 | 2.16k | PartitionSchema partition_schema; |
1448 | 2.16k | RETURN_NOT_OK(PartitionSchema::FromPB(l->pb.partition_schema(), |
1449 | 2.16k | *sys_catalog_->schema_, |
1450 | 2.16k | &partition_schema)); |
1451 | 2.16k | vector<Partition> partitions; |
1452 | 2.16k | RETURN_NOT_OK(partition_schema.CreatePartitions(1, &partitions)); |
1453 | 2.16k | partitions[0].ToPB(metadata.mutable_partition()); |
1454 | 2.16k | metadata.set_table_id(table->id()); |
1455 | 2.16k | metadata.add_table_ids(table->id()); |
1456 | | |
1457 | 2.16k | table->set_is_system(); |
1458 | 2.16k | table->AddTablet(tablet.get()); |
1459 | | |
1460 | 2.16k | auto tablet_map_checkout = tablet_map_.CheckOut(); |
1461 | 2.16k | (*tablet_map_checkout)[tablet->tablet_id()] = tablet; |
1462 | | |
1463 | 2.16k | RETURN_NOT_OK(sys_catalog_->Upsert(term, tablet)); |
1464 | 2.16k | tablet->mutable_metadata()->CommitMutation(); |
1465 | 2.16k | } |
1466 | | |
// Runs on every call, whether or not the table and tablet entries already existed.
1467 | 3.00k | system_tablets_[kSysCatalogTabletId] = sys_catalog_->tablet_peer_->shared_tablet(); |
1468 | | |
1469 | 3.00k | return Status::OK(); |
1470 | 3.00k | } |
1471 | | |
// Instantiates the virtual table type T and registers it through PrepareSystemTable().
//
// Template parameter T: virtual table class constructible from (table_name, namespace_name,
//   master_) and convertible to YQLVirtualTable*.
// table_name / namespace_name / namespace_id: identify the system table; forwarded unchanged.
// term: forwarded to PrepareSystemTable().
// Returns whatever PrepareSystemTable() returns.
1472 | | template <class T> |
1473 | | Status CatalogManager::PrepareSystemTableTemplate(const TableName& table_name, |
1474 | | const NamespaceName& namespace_name, |
1475 | | const NamespaceId& namespace_id, |
1476 | 48.1k | int64_t term) { |
// NOTE(review): the vtable is allocated with a raw `new` and not freed here; presumably
// PrepareSystemTable() takes ownership on every path - confirm, otherwise this leaks.
1477 | 48.1k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); |
1478 | 48.1k | return PrepareSystemTable( |
1479 | 48.1k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); |
// NOTE(review): the row below holds the closing brace of the template and then what appears to
// be the start of the coverage tool's per-instantiation breakdown (PeersVTable); it is listing
// data, not additional source.
1480 | 48.1k | } yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::PeersVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1476 | 3.00k | int64_t term) { | 1477 | 3.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1478 | 3.00k | return PrepareSystemTable( | 1479 | 3.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1480 | 3.00k | } |
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::LocalVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1476 | 3.00k | int64_t term) { | 1477 | 3.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1478 | 3.00k | return PrepareSystemTable( | 1479 | 3.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1480 | 3.00k | } |
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLKeyspacesVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1476 | 3.00k | int64_t term) { | 1477 | 3.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1478 | 3.00k | return PrepareSystemTable( | 1479 | 3.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1480 | 3.00k | } |
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLTablesVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1476 | 3.00k | int64_t term) { | 1477 | 3.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1478 | 3.00k | return PrepareSystemTable( | 1479 | 3.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1480 | 3.00k | } |
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLColumnsVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1476 | 3.00k | int64_t term) { | 1477 | 3.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1478 | 3.00k | return PrepareSystemTable( | 1479 | 3.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1480 | 3.00k | } |
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLSizeEstimatesVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1476 | 3.00k | int64_t term) { | 1477 | 3.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1478 | 3.00k | return PrepareSystemTable( | 1479 | 3.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1480 | 3.00k | } |
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLAggregatesVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1476 | 3.00k | int64_t term) { | 1477 | 3.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1478 | 3.00k | return PrepareSystemTable( | 1479 | 3.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1480 | 3.00k | } |
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLFunctionsVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1476 | 3.00k | int64_t term) { | 1477 | 3.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1478 | 3.00k | return PrepareSystemTable( | 1479 | 3.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1480 | 3.00k | } |
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLIndexesVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1476 | 3.00k | int64_t term) { | 1477 | 3.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1478 | 3.00k | return PrepareSystemTable( | 1479 | 3.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1480 | 3.00k | } |
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLTriggersVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1476 | 3.00k | int64_t term) { | 1477 | 3.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1478 | 3.00k | return PrepareSystemTable( | 1479 | 3.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1480 | 3.00k | } |
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLViewsVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1476 | 3.00k | int64_t term) { | 1477 | 3.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1478 | 3.00k | return PrepareSystemTable( | 1479 | 3.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1480 | 3.00k | } |
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::QLTypesVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1476 | 3.00k | int64_t term) { | 1477 | 3.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1478 | 3.00k | return PrepareSystemTable( | 1479 | 3.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1480 | 3.00k | } |
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLPartitionsVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1476 | 3.00k | int64_t term) { | 1477 | 3.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1478 | 3.00k | return PrepareSystemTable( | 1479 | 3.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1480 | 3.00k | } |
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLAuthRolesVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1476 | 3.00k | int64_t term) { | 1477 | 3.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1478 | 3.00k | return PrepareSystemTable( | 1479 | 3.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1480 | 3.00k | } |
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLAuthRolePermissionsVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1476 | 3.00k | int64_t term) { | 1477 | 3.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1478 | 3.00k | return PrepareSystemTable( | 1479 | 3.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1480 | 3.00k | } |
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLAuthResourceRolePermissionsIndexVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long) Line | Count | Source | 1476 | 3.00k | int64_t term) { | 1477 | 3.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1478 | 3.00k | return PrepareSystemTable( | 1479 | 3.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1480 | 3.00k | } |
|
1481 | | |
1482 | | Status CatalogManager::PrepareSystemTable(const TableName& table_name, |
1483 | | const NamespaceName& namespace_name, |
1484 | | const NamespaceId& namespace_id, |
1485 | | const Schema& schema, |
1486 | | int64_t term, |
1487 | 48.1k | YQLVirtualTable* vtable) { |
1488 | 48.1k | std::unique_ptr<YQLVirtualTable> yql_storage(vtable); |
1489 | | |
1490 | 48.1k | scoped_refptr<TableInfo> table = FindPtrOrNull(table_names_map_, |
1491 | 48.1k | std::make_pair(namespace_id, table_name)); |
1492 | 48.1k | bool create_table = true; |
1493 | 48.1k | if (table != nullptr) { |
1494 | 13.5k | LOG_WITH_PREFIX(INFO) << "Table " << namespace_name << "." << table_name << " already created"; |
1495 | | |
1496 | | // Mark the table as a system table. |
1497 | 13.5k | table->set_is_system(); |
1498 | | |
1499 | 13.5k | Schema persisted_schema; |
1500 | 13.5k | RETURN_NOT_OK(table->GetSchema(&persisted_schema)); |
1501 | 13.5k | if (!persisted_schema.Equals(schema)) { |
1502 | 6 | LOG_WITH_PREFIX(INFO) |
1503 | 6 | << "Updating schema of " << namespace_name << "." << table_name << " ..."; |
1504 | 6 | auto l = table->LockForWrite(); |
1505 | 6 | SchemaToPB(schema, l.mutable_data()->pb.mutable_schema()); |
1506 | 6 | l.mutable_data()->pb.set_version(l->pb.version() + 1); |
1507 | 6 | l.mutable_data()->pb.set_updates_only_index_permissions(false); |
1508 | | |
1509 | | // Update sys-catalog with the new table schema. |
1510 | 6 | RETURN_NOT_OK(sys_catalog_->Upsert(term, table)); |
1511 | 6 | l.Commit(); |
1512 | 6 | } |
1513 | | |
1514 | | // There might have been a failure after writing the table but before writing the tablets. As |
1515 | | // a result, if we don't find any tablets, we try to create the tablets only again. |
1516 | 13.5k | auto tablets = table->GetTablets(); |
1517 | 13.5k | if (!tablets.empty()) { |
1518 | | // Initialize the appropriate system tablet. |
1519 | 13.5k | DCHECK_EQ(1, tablets.size()); |
1520 | 13.5k | auto tablet = tablets[0]; |
1521 | 13.5k | system_tablets_[tablet->tablet_id()] = |
1522 | 13.5k | std::make_shared<SystemTablet>(schema, std::move(yql_storage), tablet->tablet_id()); |
1523 | 13.5k | return Status::OK(); |
1524 | 13.5k | } else { |
1525 | | // Table is already created, only need to create tablets now. |
1526 | 1 | LOG_WITH_PREFIX(INFO) |
1527 | 1 | << "Creating tablets for " << namespace_name << "." << table_name << " ..."; |
1528 | 1 | create_table = false; |
1529 | 1 | } |
1530 | 13.5k | } |
1531 | | |
1532 | | // Create partitions. |
1533 | 34.5k | vector<Partition> partitions; |
1534 | 34.5k | PartitionSchemaPB partition_schema_pb; |
1535 | 34.5k | partition_schema_pb.set_hash_schema(PartitionSchemaPB::MULTI_COLUMN_HASH_SCHEMA); |
1536 | 34.5k | PartitionSchema partition_schema; |
1537 | 34.5k | RETURN_NOT_OK(PartitionSchema::FromPB(partition_schema_pb, schema, &partition_schema)); |
1538 | 34.5k | RETURN_NOT_OK(partition_schema.CreatePartitions(1, &partitions)); |
1539 | | |
1540 | 34.5k | TabletInfos tablets; |
1541 | | |
1542 | 34.5k | if (create_table) { |
1543 | | // Fill in details for the system table. |
1544 | 34.5k | CreateTableRequestPB req; |
1545 | 34.5k | req.set_name(table_name); |
1546 | 34.5k | req.set_table_type(TableType::YQL_TABLE_TYPE); |
1547 | | |
1548 | 34.5k | RETURN_NOT_OK(CreateTableInMemory( |
1549 | 34.5k | req, schema, partition_schema, namespace_id, namespace_name, |
1550 | 34.5k | partitions, nullptr, &tablets, nullptr, &table)); |
1551 | | // Mark the table as a system table. |
1552 | 34.5k | LOG_WITH_PREFIX(INFO) << "Inserted new " << namespace_name << "." << table_name |
1553 | 34.5k | << " table info into CatalogManager maps"; |
1554 | | // Update the on-disk table state to "running". |
1555 | 34.5k | table->mutable_metadata()->mutable_dirty()->pb.set_state(SysTablesEntryPB::RUNNING); |
1556 | 34.5k | RETURN_NOT_OK(sys_catalog_->Upsert(term, table)); |
1557 | 34.5k | LOG_WITH_PREFIX(INFO) << "Wrote table to system catalog: " << ToString(table) << ", tablets: " |
1558 | 34.5k | << ToString(tablets); |
1559 | 34.5k | } else { |
1560 | | // Still need to create the tablets. |
1561 | 1 | tablets = VERIFY_RESULT(CreateTabletsFromTable(partitions, table)); |
1562 | 1 | } |
1563 | | |
1564 | 34.5k | DCHECK_EQ(1, tablets.size()); |
1565 | | // We use LOG_ASSERT here since this is expected to crash in some unit tests. |
1566 | 34.5k | LOG_ASSERT(!FLAGS_TEST_catalog_manager_simulate_system_table_create_failure); |
1567 | | |
1568 | | // Write Tablets to sys-tablets (in "running" state since we don't want the loadbalancer to |
1569 | | // assign these tablets since this table is virtual). |
1570 | 34.5k | for (const auto& tablet : tablets) { |
1571 | 34.5k | tablet->mutable_metadata()->mutable_dirty()->pb.set_state(SysTabletsEntryPB::RUNNING); |
1572 | 34.5k | } |
1573 | 34.5k | RETURN_NOT_OK(sys_catalog_->Upsert(term, tablets)); |
1574 | 34.5k | LOG_WITH_PREFIX(INFO) << "Wrote tablets to system catalog: " << ToString(tablets); |
1575 | | |
1576 | | // Commit the in-memory state. |
1577 | 34.5k | if (create_table) { |
1578 | 34.5k | table->mutable_metadata()->CommitMutation(); |
1579 | 34.5k | } |
1580 | | |
1581 | 34.5k | for (const auto& tablet : tablets) { |
1582 | 34.5k | tablet->mutable_metadata()->CommitMutation(); |
1583 | 34.5k | } |
1584 | | // Mark the table as a system table. |
1585 | 34.5k | table->set_is_system(); |
1586 | | |
1587 | | // Finally create the appropriate tablet object. |
1588 | 34.5k | auto tablet = tablets[0]; |
1589 | 34.5k | system_tablets_[tablet->tablet_id()] = |
1590 | 34.5k | std::make_shared<SystemTablet>(schema, std::move(yql_storage), tablet->tablet_id()); |
1591 | 34.5k | return Status::OK(); |
1592 | 34.5k | } |
1593 | | |
1594 | 55.8k | bool IsYcqlNamespace(const NamespaceInfo& ns) { |
1595 | 55.8k | return ns.database_type() == YQLDatabase::YQL_DATABASE_CQL; |
1596 | 55.8k | } |
1597 | | |
1598 | 1.28M | bool IsYcqlTable(const TableInfo& table) { |
1599 | 1.28M | return table.GetTableType() == TableType::YQL_TABLE_TYPE && table.id() != kSysCatalogTableId /*count: 665k*/; |
1600 | 1.28M | } |
1601 | | |
1602 | | Status CatalogManager::PrepareNamespace( |
1603 | 9.02k | YQLDatabase db_type, const NamespaceName& name, const NamespaceId& id, int64_t term) { |
1604 | | |
1605 | 9.02k | scoped_refptr<NamespaceInfo> ns = FindPtrOrNull(namespace_ids_map_, id); |
1606 | 9.02k | if (ns != nullptr) { |
1607 | 2.53k | LOG_WITH_PREFIX(INFO) |
1608 | 2.53k | << "Keyspace " << ns->ToString() << " already created, skipping initialization"; |
1609 | 2.53k | return Status::OK(); |
1610 | 2.53k | } |
1611 | | |
1612 | | // Create entry. |
1613 | 6.48k | SysNamespaceEntryPB ns_entry; |
1614 | 6.48k | ns_entry.set_name(name); |
1615 | 6.48k | ns_entry.set_database_type(db_type); |
1616 | 6.48k | ns_entry.set_state(SysNamespaceEntryPB::RUNNING); |
1617 | | |
1618 | | // Create in memory object. |
1619 | 6.48k | ns = new NamespaceInfo(id); |
1620 | | |
1621 | | // Prepare write. |
1622 | 6.48k | auto l = ns->LockForWrite(); |
1623 | 6.48k | l.mutable_data()->pb = std::move(ns_entry); |
1624 | | |
1625 | 6.48k | namespace_ids_map_[id] = ns; |
1626 | 6.48k | namespace_names_mapper_[db_type][l.mutable_data()->pb.name()] = ns; |
1627 | | |
1628 | | // Write to sys_catalog and in memory. |
1629 | 6.48k | RETURN_NOT_OK(sys_catalog_->Upsert(term, ns)); |
1630 | 6.48k | l.Commit(); |
1631 | | |
1632 | 6.48k | LOG_WITH_PREFIX(INFO) << "Created default keyspace: " << ns->ToString(); |
1633 | 6.48k | return Status::OK(); |
1634 | 6.48k | } |
1635 | | |
1636 | 7.94k | Status CatalogManager::CheckLocalHostInMasterAddresses() { |
1637 | 7.94k | auto local_hostport = master_->first_rpc_address(); |
1638 | 7.94k | std::vector<IpAddress> local_addrs; |
1639 | | |
1640 | 7.94k | if (local_hostport.address().is_unspecified()) { |
1641 | 0 | auto status = GetLocalAddresses(&local_addrs, AddressFilter::ANY); |
1642 | 0 | if (!status.ok() || local_addrs.empty()) { |
1643 | 0 | LOG(WARNING) << "Could not enumerate network interfaces due to " << status << ", found " |
1644 | 0 | << local_addrs.size() << " local addresses."; |
1645 | 0 | return Status::OK(); |
1646 | 0 | } |
1647 | 7.94k | } else { |
1648 | 7.94k | for (auto const &addr : master_->rpc_addresses()) { |
1649 | 7.94k | local_addrs.push_back(addr.address()); |
1650 | 7.94k | } |
1651 | 7.94k | } |
1652 | | |
1653 | 7.94k | auto resolved_addresses = VERIFY_RESULT(server::ResolveMasterAddresses( |
1654 | 7.94k | *master_->opts().GetMasterAddresses())); |
1655 | | |
1656 | 14.1k | for (auto const &addr : resolved_addresses) { |
1657 | 14.1k | if (addr.address().is_unspecified() || |
1658 | 14.1k | std::find(local_addrs.begin(), local_addrs.end(), addr.address()) != |
1659 | 14.1k | local_addrs.end()) { |
1660 | 7.94k | return Status::OK(); |
1661 | 7.94k | } |
1662 | 14.1k | } |
1663 | 0 | return STATUS_SUBSTITUTE(IllegalState, |
1664 | 7.94k | "None of the local addresses are present in master_addresses $0.", |
1665 | 7.94k | master_->opts().master_addresses_flag); |
1666 | 7.94k | } |
1667 | | |
1668 | 8.03k | Status CatalogManager::InitSysCatalogAsync() { |
1669 | 8.03k | LockGuard lock(mutex_); |
1670 | | |
1671 | | // Optimistically try to load data from disk. |
1672 | 8.03k | Status s = sys_catalog_->Load(master_->fs_manager()); |
1673 | | |
1674 | 8.03k | if (!s.ok() && s.IsNotFound() /*count: 8.00k*/) { |
1675 | | // We have yet to initialize the syscatalog metadata, need to create the metadata file. |
1676 | 8.00k | LOG(INFO) << "Did not find previous SysCatalogTable data on disk. " << s; |
1677 | | |
1678 | 8.00k | if (!master_->opts().AreMasterAddressesProvided()) { |
1679 | 58 | master_->SetShellMode(true); |
1680 | 58 | LOG(INFO) << "Starting master in shell mode."; |
1681 | 58 | return Status::OK(); |
1682 | 58 | } |
1683 | | |
1684 | 7.94k | RETURN_NOT_OK(CheckLocalHostInMasterAddresses()); |
1685 | 7.94k | RETURN_NOT_OK_PREPEND(sys_catalog_->CreateNew(master_->fs_manager()), |
1686 | 7.94k | Substitute("Encountered errors during system catalog initialization:" |
1687 | 7.94k | "\n\tError on Load: $0\n\tError on CreateNew: ", s.ToString())); |
1688 | | |
1689 | 7.93k | return Status::OK(); |
1690 | 7.94k | } |
1691 | | |
1692 | 31 | return s; |
1693 | 8.03k | } |
1694 | | |
1695 | 31.7M | bool CatalogManager::IsInitialized() const { |
1696 | 31.7M | std::lock_guard<simple_spinlock> l(state_lock_); |
1697 | 31.7M | return state_ == kRunning; |
1698 | 31.7M | } |
1699 | | |
1700 | | // TODO - delete this API after HandleReportedTablet() usage is removed. |
1701 | 458k | Status CatalogManager::CheckIsLeaderAndReady() const { |
1702 | 458k | std::lock_guard<simple_spinlock> l(state_lock_); |
1703 | 458k | if (PREDICT_FALSE(state_ != kRunning)) { |
1704 | 39 | return STATUS_SUBSTITUTE(ServiceUnavailable, |
1705 | 39 | "Catalog manager is shutting down. State: $0", state_); |
1706 | 39 | } |
1707 | 458k | string uuid = master_->fs_manager()->uuid(); |
1708 | 458k | if (master_->opts().IsShellMode()) { |
1709 | | // Consensus and other internal fields should not be checked when in shell mode. |
1710 | 0 | return STATUS_SUBSTITUTE(IllegalState, |
1711 | 0 | "Catalog manager of $0 is in shell mode, not the leader", uuid); |
1712 | 0 | } |
1713 | 458k | Consensus* consensus = tablet_peer()->consensus(); |
1714 | 458k | if (consensus == nullptr) { |
1715 | 0 | return STATUS(IllegalState, "Consensus has not been initialized yet"); |
1716 | 0 | } |
1717 | 458k | ConsensusStatePB cstate = consensus->ConsensusState(CONSENSUS_CONFIG_COMMITTED); |
1718 | 458k | if (PREDICT_FALSE(!cstate.has_leader_uuid() || cstate.leader_uuid() != uuid)) { |
1719 | 6 | return STATUS_SUBSTITUTE(IllegalState, |
1720 | 6 | "Not the leader. Local UUID: $0, Consensus state: $1", uuid, cstate.ShortDebugString()); |
1721 | 6 | } |
1722 | 458k | if (PREDICT_FALSE(leader_ready_term_ != cstate.current_term())) { |
1723 | 0 | return STATUS_SUBSTITUTE(ServiceUnavailable, |
1724 | 0 | "Leader not yet ready to serve requests: ready term $0 vs cstate term $1", |
1725 | 0 | leader_ready_term_, cstate.current_term()); |
1726 | 0 | } |
1727 | 458k | return Status::OK(); |
1728 | 458k | } |
1729 | | |
1730 | 34.5M | std::shared_ptr<tablet::TabletPeer> CatalogManager::tablet_peer() const { |
1731 | 34.5M | return sys_catalog_->tablet_peer(); |
1732 | 34.5M | } |
1733 | | |
1734 | 28.8M | PeerRole CatalogManager::Role() const { |
1735 | 28.8M | if (!IsInitialized() /*count: 28.8M*/ || master_->opts().IsShellMode()) { |
1736 | 317 | return PeerRole::NON_PARTICIPANT; |
1737 | 317 | } |
1738 | | |
1739 | 28.8M | return tablet_peer()->consensus()->role(); |
1740 | 28.8M | } |
1741 | | |
1742 | 278 | bool CatalogManager::StartShutdown() { |
1743 | 278 | { |
1744 | 278 | std::lock_guard<simple_spinlock> l(state_lock_); |
1745 | 278 | if (state_ == kClosing) { |
1746 | 178 | VLOG(2) << "CatalogManager already shut down" /*count: 0*/; |
1747 | 178 | return false; |
1748 | 178 | } |
1749 | 100 | state_ = kClosing; |
1750 | 100 | } |
1751 | | |
1752 | 0 | refresh_yql_partitions_task_.StartShutdown(); |
1753 | | |
1754 | 100 | refresh_ysql_tablespace_info_task_.StartShutdown(); |
1755 | | |
1756 | 100 | if (sys_catalog_) { |
1757 | 100 | sys_catalog_->StartShutdown(); |
1758 | 100 | } |
1759 | | |
1760 | 100 | return true; |
1761 | 278 | } |
1762 | | |
1763 | 94 | void CatalogManager::CompleteShutdown() { |
1764 | | // Shutdown the Catalog Manager background thread (load balancing). |
1765 | 94 | refresh_yql_partitions_task_.CompleteShutdown(); |
1766 | 94 | refresh_ysql_tablespace_info_task_.CompleteShutdown(); |
1767 | | |
1768 | 94 | if (background_tasks_) { |
1769 | 83 | background_tasks_->Shutdown(); |
1770 | 83 | } |
1771 | 94 | if (background_tasks_thread_pool_) { |
1772 | 93 | background_tasks_thread_pool_->Shutdown(); |
1773 | 93 | } |
1774 | 94 | if (leader_initialization_pool_) { |
1775 | 93 | leader_initialization_pool_->Shutdown(); |
1776 | 93 | } |
1777 | 94 | if (async_task_pool_) { |
1778 | 93 | async_task_pool_->Shutdown(); |
1779 | 93 | } |
1780 | | |
1781 | | // Mark all outstanding table tasks as aborted and wait for them to fail. |
1782 | | // |
1783 | | // There may be an outstanding table visitor thread modifying the table map, |
1784 | | // so we must make a copy of it before we iterate. It's OK if the visitor |
1785 | | // adds more entries to the map even after we finish; it won't start any new |
1786 | | // tasks for those entries. |
1787 | 94 | vector<scoped_refptr<TableInfo>> copy; |
1788 | 94 | { |
1789 | 94 | SharedLock lock(mutex_); |
1790 | 94 | AppendValuesFromMap(*table_ids_map_, &copy); |
1791 | 94 | } |
1792 | 94 | AbortAndWaitForAllTasks(copy); |
1793 | | |
1794 | | // Shut down the underlying storage for tables and tablets. |
1795 | 94 | if (sys_catalog_) { |
1796 | 92 | sys_catalog_->CompleteShutdown(); |
1797 | 92 | } |
1798 | | |
1799 | | // Reset the jobs/tasks tracker. |
1800 | 94 | tasks_tracker_->Reset(); |
1801 | 94 | jobs_tracker_->Reset(); |
1802 | | |
1803 | 94 | if (initdb_future_ && initdb_future_->wait_for(0s) != std::future_status::ready /*count: 0*/) { |
1804 | 0 | LOG(WARNING) << "initdb is still running, waiting for it to complete."; |
1805 | 0 | initdb_future_->wait(); |
1806 | 0 | LOG(INFO) << "Finished running initdb, proceeding with catalog manager shutdown."; |
1807 | 0 | } |
1808 | 94 | } |
1809 | | |
1810 | | Status CatalogManager::AbortTableCreation(TableInfo* table, |
1811 | | const TabletInfos& tablets, |
1812 | | const Status& s, |
1813 | 7 | CreateTableResponsePB* resp) { |
1814 | 7 | LOG(WARNING) << s; |
1815 | | |
1816 | 7 | const TableId table_id = table->id(); |
1817 | 7 | const TableName table_name = table->mutable_metadata()->mutable_dirty()->pb.name(); |
1818 | 7 | const NamespaceId table_namespace_id = |
1819 | 7 | table->mutable_metadata()->mutable_dirty()->pb.namespace_id(); |
1820 | 7 | vector<string> tablet_ids_to_erase; |
1821 | 14 | for (const auto& tablet : tablets) { |
1822 | 14 | tablet_ids_to_erase.push_back(tablet->tablet_id()); |
1823 | 14 | } |
1824 | | |
1825 | 7 | LOG(INFO) << "Aborting creation of table '" << table_name << "', erasing table and tablets (" << |
1826 | 7 | JoinStrings(tablet_ids_to_erase, ",") << ") from in-memory state."; |
1827 | | |
1828 | | // Since this is a failed creation attempt, it's safe to just abort |
1829 | | // all tasks, as (by definition) no tasks may be pending against a |
1830 | | // table that has failed to successfully create. |
1831 | 7 | table->AbortTasksAndClose(); |
1832 | 7 | table->WaitTasksCompletion(); |
1833 | | |
1834 | 7 | LockGuard lock(mutex_); |
1835 | | |
1836 | | // Call AbortMutation() manually, as otherwise the lock won't be released. |
1837 | 14 | for (const auto& tablet : tablets) { |
1838 | 14 | tablet->mutable_metadata()->AbortMutation(); |
1839 | 14 | } |
1840 | 7 | table->mutable_metadata()->AbortMutation(); |
1841 | 7 | auto tablet_map_checkout = tablet_map_.CheckOut(); |
1842 | 14 | for (const TabletId& tablet_id_to_erase : tablet_ids_to_erase) { |
1843 | 14 | CHECK_EQ(tablet_map_checkout->erase(tablet_id_to_erase), 1) |
1844 | 0 | << "Unable to erase tablet " << tablet_id_to_erase << " from tablet map."; |
1845 | 14 | } |
1846 | | |
1847 | 7 | auto table_ids_map_checkout = table_ids_map_.CheckOut(); |
1848 | 7 | table_names_map_.erase({table_namespace_id, table_name}); // Not present if PGSQL table. |
1849 | 7 | CHECK_EQ(table_ids_map_checkout->erase(table_id), 1) |
1850 | 0 | << "Unable to erase table with id " << table_id << " from table ids map."; |
1851 | | |
1852 | 7 | if (IsYcqlTable(*table)) { |
1853 | 7 | GetYqlPartitionsVtable().RemoveFromCache(table->id()); |
1854 | 7 | } |
1855 | 7 | return CheckIfNoLongerLeaderAndSetupError(s, resp); |
1856 | 7 | } |
1857 | | |
1858 | | Result<ReplicationInfoPB> CatalogManager::GetTableReplicationInfo( |
1859 | | const ReplicationInfoPB& table_replication_info, |
1860 | 56.9k | const TablespaceId& tablespace_id) { |
1861 | | |
1862 | 56.9k | if (IsReplicationInfoSet(table_replication_info)) { |
1863 | | // The table has custom replication info set for it, return it if valid. |
1864 | 5 | RETURN_NOT_OK(ValidateTableReplicationInfo(table_replication_info)); |
1865 | 5 | return table_replication_info; |
1866 | 5 | } |
1867 | | // Table level replication info not set. Check whether the table is |
1868 | | // associated with a tablespace and if so, return the tablespace |
1869 | | // replication info. |
1870 | 56.8k | if (GetAtomicFlag(&FLAGS_enable_ysql_tablespaces_for_placement)) { |
1871 | 56.8k | boost::optional<ReplicationInfoPB> tablespace_pb = |
1872 | 56.8k | VERIFY_RESULT(GetTablespaceReplicationInfoWithRetry(tablespace_id)); |
1873 | 56.8k | if (tablespace_pb) { |
1874 | | // Return the tablespace placement. |
1875 | 728 | return tablespace_pb.value(); |
1876 | 728 | } |
1877 | 56.8k | } |
1878 | | |
1879 | | // Neither table nor tablespace info set. Return cluster level replication info. |
1880 | 56.1k | auto l = ClusterConfig()->LockForRead(); |
1881 | 56.1k | return l->pb.replication_info(); |
1882 | 56.8k | } |
1883 | | |
1884 | 1.00M | std::shared_ptr<YsqlTablespaceManager> CatalogManager::GetTablespaceManager() const { |
1885 | 1.00M | SharedLock lock(tablespace_mutex_); |
1886 | 1.00M | return tablespace_manager_; |
1887 | 1.00M | } |
1888 | | |
1889 | | Result<boost::optional<TablespaceId>> CatalogManager::GetTablespaceForTable( |
1890 | 1 | const scoped_refptr<TableInfo>& table) { |
1891 | | |
1892 | 1 | auto tablespace_manager = GetTablespaceManager(); |
1893 | 1 | return tablespace_manager->GetTablespaceForTable(table); |
1894 | 1 | } |
1895 | | |
1896 | | Result<boost::optional<ReplicationInfoPB>> CatalogManager::GetTablespaceReplicationInfoWithRetry( |
1897 | 57.0k | const TablespaceId& tablespace_id) { |
1898 | | |
1899 | 57.0k | auto tablespace_manager = GetTablespaceManager(); |
1900 | 57.0k | auto replication_info_result = tablespace_manager->GetTablespaceReplicationInfo(tablespace_id); |
1901 | | |
1902 | 57.0k | if (replication_info_result) { |
1903 | 56.9k | return replication_info_result; |
1904 | 56.9k | } |
1905 | | |
1906 | | // We failed to find the tablespace placement policy. Refresh the tablespace info and try again. |
1907 | 19 | auto tablespace_map = VERIFY_RESULT(GetYsqlTablespaceInfo()); |
1908 | | |
1909 | | // We clone the tablespace_manager and update the clone with the new tablespace_map that we |
1910 | | // fetched above. We do this instead of updating the tablespace_manager object in-place because |
1911 | | // other clients may have a shared_ptr to it through 'GetTablespaceManager()'. |
1912 | 0 | tablespace_manager = tablespace_manager->CreateCloneWithTablespaceMap(tablespace_map); |
1913 | 19 | { |
1914 | 19 | LockGuard lock(tablespace_mutex_); |
1915 | 19 | tablespace_manager_ = tablespace_manager; |
1916 | 19 | } |
1917 | | |
1918 | 19 | return tablespace_manager->GetTablespaceReplicationInfo(tablespace_id); |
1919 | 19 | } |
1920 | | |
1921 | 239k | bool CatalogManager::IsReplicationInfoSet(const ReplicationInfoPB& replication_info) { |
1922 | 239k | const auto& live_placement_info = replication_info.live_replicas(); |
1923 | 239k | if (!(live_placement_info.placement_blocks().empty() && |
1924 | 239k | live_placement_info.num_replicas() <= 0 /*count: 239k*/ && |
1925 | 239k | live_placement_info.placement_uuid().empty() /*count: 238k*/) || |
1926 | 239k | !replication_info.read_replicas().empty() /*count: 238k*/ || |
1927 | 239k | !replication_info.affinitized_leaders().empty() /*count: 238k*/) { |
1928 | | |
1929 | 621 | return true; |
1930 | 621 | } |
1931 | 239k | return false; |
1932 | 239k | } |
1933 | | |
1934 | 428 | Status CatalogManager::ValidateTableReplicationInfo(const ReplicationInfoPB& replication_info) { |
1935 | 428 | if (!IsReplicationInfoSet(replication_info)) { |
1936 | 0 | return STATUS(InvalidArgument, "No replication info set."); |
1937 | 0 | } |
1938 | | // We don't support setting any other fields other than live replica placements for now. |
1939 | 428 | if (!replication_info.read_replicas().empty() || |
1940 | 428 | !replication_info.affinitized_leaders().empty()) { |
1941 | | |
1942 | 0 | return STATUS(InvalidArgument, "Only live placement info can be set for table " |
1943 | 0 | "level replication info."); |
1944 | 0 | } |
1945 | | // Today we support setting table level replication info only in clusters where read replica |
1946 | | // placements is not set. Return error if the cluster has read replica placements set. |
1947 | 428 | auto l = ClusterConfig()->LockForRead(); |
1948 | 428 | const ReplicationInfoPB& cluster_replication_info = l->pb.replication_info(); |
1949 | | // TODO(bogdan): figure this out when we expand on geopartition support. |
1950 | | // if (!cluster_replication_info.read_replicas().empty() || |
1951 | | // !cluster_replication_info.affinitized_leaders().empty()) { |
1952 | | |
1953 | | // return STATUS(InvalidArgument, "Setting table level replication info is not supported " |
1954 | | // "for clusters with read replica placements"); |
1955 | | // } |
1956 | | // If the replication info has placement_uuid set, verify that it matches the cluster |
1957 | | // placement_uuid. |
1958 | 428 | if (replication_info.live_replicas().placement_uuid().empty()) { |
1959 | 426 | return Status::OK(); |
1960 | 426 | } |
1961 | 2 | if (replication_info.live_replicas().placement_uuid() != |
1962 | 2 | cluster_replication_info.live_replicas().placement_uuid()) { |
1963 | | |
1964 | 0 | return STATUS(InvalidArgument, "Placement uuid for table level replication info " |
1965 | 0 | "must match that of the cluster's live placement info."); |
1966 | 0 | } |
1967 | 2 | return Status::OK(); |
1968 | 2 | } |
1969 | | |
1970 | 4.15k | Result<shared_ptr<TablespaceIdToReplicationInfoMap>> CatalogManager::GetYsqlTablespaceInfo() { |
1971 | 4.15k | auto table_info = GetTableInfo(kPgTablespaceTableId); |
1972 | 4.15k | if (table_info == nullptr) { |
1973 | 166 | return STATUS(InternalError, "pg_tablespace table info not found"); |
1974 | 166 | } |
1975 | | |
1976 | 3.98k | auto tablespace_map = VERIFY_RESULT(sys_catalog_->ReadPgTablespaceInfo()); |
1977 | | |
1978 | | // The tablespace options do not usually contain the placement uuid. |
1979 | | // Populate the current cluster placement uuid into the placement information for |
1980 | | // each tablespace. |
1981 | 0 | string placement_uuid; |
1982 | 3.98k | { |
1983 | 3.98k | auto l = ClusterConfig()->LockForRead(); |
1984 | | // TODO(deepthi.srinivasan): Read-replica placements are not supported as |
1985 | | // of now. |
1986 | 3.98k | placement_uuid = l->pb.replication_info().live_replicas().placement_uuid(); |
1987 | 3.98k | } |
1988 | 3.98k | if (!placement_uuid.empty()) { |
1989 | 4 | for (auto& iter : *tablespace_map) { |
1990 | 4 | if (iter.second) { |
1991 | 0 | iter.second.value().mutable_live_replicas()->set_placement_uuid(placement_uuid); |
1992 | 0 | } |
1993 | 4 | } |
1994 | 2 | } |
1995 | | |
1996 | | // Before updating the tablespace placement map, validate the |
1997 | | // placement policies. |
1998 | 8.39k | for (auto& iter : *tablespace_map) { |
1999 | 8.39k | if (iter.second) { |
2000 | 419 | RETURN_NOT_OK(ValidateTableReplicationInfo(iter.second.value())); |
2001 | 419 | } |
2002 | 8.39k | } |
2003 | | |
2004 | 3.98k | return tablespace_map; |
2005 | 3.98k | } |
2006 | | |
2007 | | boost::optional<TablespaceId> CatalogManager::GetTransactionStatusTableTablespace( |
2008 | 6.90k | const scoped_refptr<TableInfo>& table) { |
2009 | 6.90k | auto lock = table->LockForRead(); |
2010 | 6.90k | if (lock->pb.table_type() != TRANSACTION_STATUS_TABLE_TYPE) { |
2011 | 2 | return boost::none; |
2012 | 2 | } |
2013 | | |
2014 | 6.90k | if (!lock->pb.has_transaction_table_tablespace_id()) { |
2015 | 5.63k | return boost::none; |
2016 | 5.63k | } |
2017 | | |
2018 | 1.26k | return lock->pb.transaction_table_tablespace_id(); |
2019 | 6.90k | } |
2020 | | |
2021 | 7 | void CatalogManager::ClearTransactionStatusTableTablespace(const scoped_refptr<TableInfo>& table) { |
2022 | 7 | auto lock = table->LockForWrite(); |
2023 | 7 | if (lock->pb.table_type() != TRANSACTION_STATUS_TABLE_TYPE) { |
2024 | 0 | return; |
2025 | 0 | } |
2026 | | |
2027 | 7 | lock.mutable_data()->pb.clear_transaction_table_tablespace_id(); |
2028 | 7 | lock.mutable_data()->pb.set_version(lock.mutable_data()->pb.version() + 1); |
2029 | 7 | lock.Commit(); |
2030 | 7 | } |
2031 | | |
2032 | | bool CatalogManager::CheckTransactionStatusTablesWithMissingTablespaces( |
2033 | 3.96k | const TablespaceIdToReplicationInfoMap& tablespace_info) { |
2034 | 3.96k | SharedLock lock(mutex_); |
2035 | 3.96k | for (const auto& table_id : transaction_table_ids_set_) { |
2036 | 2.02k | auto table = table_ids_map_->find(table_id); |
2037 | 2.02k | if (table == table_ids_map_->end()) { |
2038 | 0 | LOG(DFATAL) << "Table uuid " << table_id |
2039 | 0 | << " in transaction_table_ids_set_ but not in table_ids_map_"; |
2040 | 0 | continue; |
2041 | 0 | } |
2042 | 2.02k | auto tablespace_id = GetTransactionStatusTableTablespace(table->second); |
2043 | 2.02k | if (tablespace_id) { |
2044 | 271 | if (!tablespace_info.count(*tablespace_id)) { |
2045 | 3 | return true; |
2046 | 3 | } |
2047 | 271 | } |
2048 | 2.02k | } |
2049 | 3.96k | return false; |
2050 | 3.96k | } |
2051 | | |
2052 | | Status CatalogManager::UpdateTransactionStatusTableTablespaces( |
2053 | 3.96k | const TablespaceIdToReplicationInfoMap& tablespace_info) { |
2054 | 3.96k | if (CheckTransactionStatusTablesWithMissingTablespaces(tablespace_info)) { |
2055 | 3 | { |
2056 | 3 | LockGuard lock(mutex_); |
2057 | 21 | for (const auto& table_id : transaction_table_ids_set_) { |
2058 | 21 | auto table = table_ids_map_->find(table_id); |
2059 | 21 | if (table == table_ids_map_->end()) { |
2060 | 0 | LOG(DFATAL) << "Table uuid " << table_id |
2061 | 0 | << " in transaction_table_ids_set_ but not in table_ids_map_"; |
2062 | 0 | continue; |
2063 | 0 | } |
2064 | 21 | auto tablespace_id = GetTransactionStatusTableTablespace(table->second); |
2065 | 21 | if (tablespace_id) { |
2066 | 12 | if (!tablespace_info.count(*tablespace_id)) { |
2067 | | // TODO: We should also delete the transaction table, see #11123. |
2068 | 7 | LOG(INFO) << "Found transaction status table for tablespace id " << *tablespace_id |
2069 | 7 | << " which doesn't exist, clearing tablespace id"; |
2070 | 7 | ClearTransactionStatusTableTablespace(table->second); |
2071 | 7 | } |
2072 | 12 | } |
2073 | 21 | } |
2074 | 3 | } |
2075 | | |
2076 | | // A tablespace id has been cleared, meaning a transaction table's placement has changed, |
2077 | | // and thus the transaction tables version needs to be incremented. |
2078 | 3 | RETURN_NOT_OK(IncrementTransactionTablesVersion()); |
2079 | 3 | } |
2080 | | |
2081 | 3.96k | return Status::OK(); |
2082 | 3.96k | } |
2083 | | |
2084 | | Result<shared_ptr<TableToTablespaceIdMap>> CatalogManager::GetYsqlTableToTablespaceMap( |
2085 | 201 | const TablespaceIdToReplicationInfoMap& tablespace_info) { |
2086 | 201 | auto table_to_tablespace_map = std::make_shared<TableToTablespaceIdMap>(); |
2087 | | |
2088 | | // First fetch all namespaces. This is because the table_to_tablespace information is only |
2089 | | // found in the pg_class catalog table. There exists a separate pg_class table in each |
2090 | | // namespace. To build in-memory state for all tables, process pg_class table for each |
2091 | | // namespace. |
2092 | 201 | vector<NamespaceId> namespace_id_vec; |
2093 | 201 | set<NamespaceId> colocated_namespaces; |
2094 | 201 | { |
2095 | 201 | SharedLock lock(mutex_); |
2096 | 1.78k | for (const auto& ns : namespace_ids_map_) { |
2097 | 1.78k | if (ns.second->database_type() != YQL_DATABASE_PGSQL) { |
2098 | 603 | continue; |
2099 | 603 | } |
2100 | | |
2101 | 1.18k | if (ns.first == kPgSequencesDataNamespaceId) { |
2102 | | // Skip the database created for sequences system table. |
2103 | 124 | continue; |
2104 | 124 | } |
2105 | | |
2106 | 1.05k | if (ns.second->colocated()) { |
2107 | 20 | colocated_namespaces.insert(ns.first); |
2108 | 20 | } |
2109 | | |
2110 | | // TODO (Deepthi): Investigate if safe to skip template0 and template1 as well. |
2111 | 1.05k | namespace_id_vec.emplace_back(ns.first); |
2112 | 1.05k | } |
2113 | | |
2114 | | // Add local transaction tables corresponding to tablespaces. |
2115 | 487 | for (const auto& table_id : transaction_table_ids_set_) { |
2116 | 487 | auto table = table_ids_map_->find(table_id); |
2117 | 487 | if (table == table_ids_map_->end()) { |
2118 | 0 | LOG(DFATAL) << "Table uuid " << table_id |
2119 | 0 | << " in transaction_table_ids_set_ but not in table_ids_map_"; |
2120 | 0 | continue; |
2121 | 0 | } |
2122 | 487 | auto tablespace_id = GetTransactionStatusTableTablespace(table->second); |
2123 | 487 | if (tablespace_id) { |
2124 | 271 | if (tablespace_info.count(*tablespace_id)) { |
2125 | 271 | (*table_to_tablespace_map)[table_id] = *tablespace_id; |
2126 | 271 | } else { |
2127 | | // It's possible that a new tablespace had its transaction table created then deleted |
2128 | | // between when we checked tablespace ids and now; we ignore it here, and it will be |
2129 | | // caught and cleared in the next tablespace update. |
2130 | 0 | LOG(INFO) << "Found transaction status table for tablespace id " << *tablespace_id |
2131 | 0 | << " which doesn't exist, ignoring"; |
2132 | 0 | } |
2133 | 271 | } |
2134 | 487 | } |
2135 | 201 | } |
2136 | | |
2137 | | // For each namespace, fetch the table->tablespace information by reading pg_class |
2138 | | // table for each namespace. |
2139 | 1.05k | for (const NamespaceId& nsid : namespace_id_vec) { |
2140 | 1.05k | VLOG(1) << "Refreshing placement information for namespace " << nsid0 ; |
2141 | 1.05k | const uint32_t database_oid = CHECK_RESULT(GetPgsqlDatabaseOid(nsid)); |
2142 | 1.05k | const bool is_colocated_database = colocated_namespaces.count(nsid) > 0; |
2143 | 1.05k | Status table_tablespace_status = sys_catalog_->ReadPgClassInfo(database_oid, |
2144 | 1.05k | is_colocated_database, |
2145 | 1.05k | table_to_tablespace_map.get()); |
2146 | 1.05k | if (!table_tablespace_status.ok()) { |
2147 | 7 | LOG(WARNING) << "Refreshing table->tablespace info failed for namespace " |
2148 | 7 | << nsid << " with error: " << table_tablespace_status.ToString(); |
2149 | 7 | } |
2150 | | |
2151 | 1.05k | const bool pg_yb_tablegroup_exists = VERIFY_RESULT(DoesTableExist(FindTableById( |
2152 | 1.05k | GetPgsqlTableId(database_oid, kPgYbTablegroupTableOid)))); |
2153 | | |
2154 | | // no pg_yb_tablegroup means we only need to check pg_class |
2155 | 1.05k | if (table_tablespace_status.ok() && !pg_yb_tablegroup_exists1.04k ) { |
2156 | 0 | VLOG(5) << "Successfully refreshed placement information for namespace " << nsid |
2157 | 0 | << " from pg_class"; |
2158 | 0 | continue; |
2159 | 0 | } |
2160 | | |
2161 | 1.05k | Status tablegroup_tablespace_status = sys_catalog_->ReadTablespaceInfoFromPgYbTablegroup( |
2162 | 1.05k | database_oid, |
2163 | 1.05k | table_to_tablespace_map.get()); |
2164 | 1.05k | if (!tablegroup_tablespace_status.ok()) { |
2165 | 7 | LOG(WARNING) << "Refreshing tablegroup->tablespace info failed for namespace " |
2166 | 7 | << nsid << " with error: " << tablegroup_tablespace_status.ToString(); |
2167 | 7 | } |
2168 | 1.05k | if (table_tablespace_status.ok() && tablegroup_tablespace_status.ok()1.04k ) { |
2169 | 1.04k | VLOG(5) << "Successfully refreshed placement information for namespace " << nsid |
2170 | 0 | << " from pg_class and pg_yb_tablegroup"; |
2171 | 1.04k | } |
2172 | 1.05k | } |
2173 | | |
2174 | 201 | return table_to_tablespace_map; |
2175 | 201 | } |
2176 | | |
2177 | | Status CatalogManager::CreateTransactionStatusTablesForTablespaces( |
2178 | | const TablespaceIdToReplicationInfoMap& tablespace_info, |
2179 | 201 | const TableToTablespaceIdMap& table_to_tablespace_map) { |
2180 | 201 | if (!GetAtomicFlag(&FLAGS_enable_ysql_tablespaces_for_placement) || |
2181 | 201 | !GetAtomicFlag(&FLAGS_auto_create_local_transaction_tables)) { |
2182 | 0 | return Status::OK(); |
2183 | 0 | } |
2184 | | |
2185 | 201 | std::unordered_set<TablespaceId> valid_tablespaces; |
2186 | 1.72k | for (const auto& entry : table_to_tablespace_map) { |
2187 | 1.72k | if (entry.second) { |
2188 | 902 | valid_tablespaces.insert(*entry.second); |
2189 | 902 | } |
2190 | 1.72k | } |
2191 | 787 | for (const auto& entry : tablespace_info) { |
2192 | 787 | if (!entry.second) { |
2193 | 406 | valid_tablespaces.erase(entry.first); |
2194 | 406 | } |
2195 | 787 | } |
2196 | | |
2197 | 272 | for (const auto& tablespace_id : valid_tablespaces) { |
2198 | 272 | RETURN_NOT_OK(CreateLocalTransactionStatusTableIfNeeded(nullptr /* rpc */, tablespace_id)); |
2199 | 272 | } |
2200 | | |
2201 | 201 | return Status::OK(); |
2202 | 201 | } |
2203 | | |
2204 | 117k | void CatalogManager::StartTablespaceBgTaskIfStopped() { |
2205 | 117k | if (GetAtomicFlag(&FLAGS_ysql_tablespace_info_refresh_secs) <= 0 || |
2206 | 117k | !GetAtomicFlag(&FLAGS_enable_ysql_tablespaces_for_placement)) { |
2207 | | // The tablespace bg task is disabled. Nothing to do. |
2208 | 127 | return; |
2209 | 127 | } |
2210 | | |
2211 | 117k | const bool is_task_running = tablespace_bg_task_running_.exchange(true); |
2212 | 117k | if (is_task_running) { |
2213 | | // Task already running, nothing to do. |
2214 | 116k | return; |
2215 | 116k | } |
2216 | | |
2217 | 934 | ScheduleRefreshTablespaceInfoTask(true /* schedule_now */); |
2218 | 934 | } |
2219 | | |
2220 | 5.06k | void CatalogManager::ScheduleRefreshTablespaceInfoTask(const bool schedule_now) { |
2221 | 5.06k | int wait_time = 0; |
2222 | | |
2223 | 5.06k | if (!schedule_now) { |
2224 | 4.13k | wait_time = GetAtomicFlag(&FLAGS_ysql_tablespace_info_refresh_secs); |
2225 | 4.13k | if (wait_time <= 0) { |
2226 | | // The tablespace refresh task has been disabled. |
2227 | 0 | tablespace_bg_task_running_ = false; |
2228 | 0 | return; |
2229 | 0 | } |
2230 | 4.13k | } |
2231 | | |
2232 | 5.06k | refresh_ysql_tablespace_info_task_.Schedule([this](const Status& status) { |
2233 | 4.18k | Status s = background_tasks_thread_pool_->SubmitFunc( |
2234 | 4.18k | std::bind(&CatalogManager::RefreshTablespaceInfoPeriodically, this)); |
2235 | 4.18k | if (!s.IsOk()) { |
2236 | | // Failed to submit task to the thread pool. Mark that the task is now |
2237 | | // no longer running. |
2238 | 0 | LOG(WARNING) << "Failed to schedule: RefreshTablespaceInfoPeriodically"; |
2239 | 0 | tablespace_bg_task_running_ = false; |
2240 | 0 | } |
2241 | 4.18k | }, wait_time * 1s); |
2242 | 5.06k | } |
2243 | | |
2244 | 4.18k | void CatalogManager::RefreshTablespaceInfoPeriodically() { |
2245 | 4.18k | if (!GetAtomicFlag(&FLAGS_enable_ysql_tablespaces_for_placement)) { |
2246 | 2 | tablespace_bg_task_running_ = false; |
2247 | 2 | return; |
2248 | 2 | } |
2249 | | |
2250 | 4.17k | if (!CheckIsLeaderAndReady().IsOk()) { |
2251 | 44 | LOG(INFO) << "No longer the leader, so cancelling tablespace info task"; |
2252 | 44 | tablespace_bg_task_running_ = false; |
2253 | 44 | return; |
2254 | 44 | } |
2255 | | |
2256 | | // Refresh the tablespace info in memory. |
2257 | 4.13k | Status s = DoRefreshTablespaceInfo(); |
2258 | 4.13k | if (!s.IsOk()) { |
2259 | 166 | LOG(WARNING) << "Tablespace refresh task failed with error " << s.ToString(); |
2260 | 166 | } |
2261 | | |
2262 | | // Schedule the next iteration of the task. |
2263 | 4.13k | ScheduleRefreshTablespaceInfoTask(); |
2264 | 4.13k | } |
2265 | | |
2266 | 4.13k | Status CatalogManager::DoRefreshTablespaceInfo() { |
2267 | 4.13k | VLOG(2) << "Running RefreshTablespaceInfoPeriodically task"0 ; |
2268 | | |
2269 | | // First refresh the tablespace info in memory. |
2270 | 4.13k | auto tablespace_info = VERIFY_RESULT3.96k (GetYsqlTablespaceInfo());3.96k |
2271 | | |
2272 | | // Clear tablespace ids for transaction tables mapped to missing tablespaces. |
2273 | 3.96k | RETURN_NOT_OK(UpdateTransactionStatusTableTablespaces(*tablespace_info)); |
2274 | | |
2275 | 3.96k | shared_ptr<TableToTablespaceIdMap> table_to_tablespace_map = nullptr; |
2276 | | |
2277 | 3.96k | if (tablespace_info->size() > kYsqlNumDefaultTablespaces) { |
2278 | | // There exist custom tablespaces in the system. Fetch the table->tablespace |
2279 | | // map from PG catalog tables. |
2280 | 201 | table_to_tablespace_map = VERIFY_RESULT(GetYsqlTableToTablespaceMap(*tablespace_info)); |
2281 | 201 | } |
2282 | | |
2283 | | // Update tablespace_manager_. |
2284 | 3.96k | { |
2285 | 3.96k | LockGuard lock(tablespace_mutex_); |
2286 | 3.96k | tablespace_manager_ = std::make_shared<YsqlTablespaceManager>(tablespace_info, |
2287 | 3.96k | table_to_tablespace_map); |
2288 | 3.96k | } |
2289 | | |
2290 | 3.96k | if (table_to_tablespace_map) { |
2291 | | // Trigger transaction table creates for tablespaces with tables and no transaction tables. |
2292 | 201 | RETURN_NOT_OK(CreateTransactionStatusTablesForTablespaces( |
2293 | 201 | *tablespace_info, *table_to_tablespace_map)); |
2294 | 201 | } |
2295 | | |
2296 | 3.96k | VLOG(3) << "Refreshed tablespace information in memory"0 ; |
2297 | 3.96k | return Status::OK(); |
2298 | 3.96k | } |
2299 | | |
2300 | | Status CatalogManager::AddIndexInfoToTable(const scoped_refptr<TableInfo>& indexed_table, |
2301 | | const IndexInfoPB& index_info, |
2302 | 1.19k | CreateTableResponsePB* resp) { |
2303 | 1.19k | LOG(INFO) << "AddIndexInfoToTable to " << indexed_table->ToString() << " IndexInfo " |
2304 | 1.19k | << yb::ToString(index_info); |
2305 | 1.19k | TRACE("Locking indexed table"); |
2306 | 1.19k | auto l = DCHECK_NOTNULL(indexed_table)->LockForWrite(); |
2307 | 1.19k | RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp)); |
2308 | | |
2309 | | // Make sure that the index appears to not have been added to the table until the tservers apply |
2310 | | // the alter and respond back. |
2311 | | // Heed issue #6233. |
2312 | 1.19k | if (!l->pb.has_fully_applied_schema()) { |
2313 | 1.18k | MultiStageAlterTable::CopySchemaDetailsToFullyApplied(&l.mutable_data()->pb); |
2314 | 1.18k | } |
2315 | | |
2316 | | // Add index info to indexed table and increment schema version. |
2317 | 1.19k | auto& pb = l.mutable_data()->pb; |
2318 | 1.19k | pb.add_indexes()->CopyFrom(index_info); |
2319 | 1.19k | pb.set_version(l.mutable_data()->pb.version() + 1); |
2320 | 1.19k | pb.set_updates_only_index_permissions(false); |
2321 | 1.19k | l.mutable_data()->set_state( |
2322 | 1.19k | SysTablesEntryPB::ALTERING, |
2323 | 1.19k | Format("Add index info version=$0 ts=$1", pb.version(), LocalTimeAsString())); |
2324 | | |
2325 | | // Update sys-catalog with the new indexed table info. |
2326 | 1.19k | TRACE("Updating indexed table metadata on disk"); |
2327 | 1.19k | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), indexed_table)); |
2328 | | |
2329 | | // Update the in-memory state. |
2330 | 1.19k | TRACE("Committing in-memory state"); |
2331 | 1.19k | l.Commit(); |
2332 | | |
2333 | 1.19k | RETURN_NOT_OK(SendAlterTableRequest(indexed_table)); |
2334 | | |
2335 | 1.19k | return Status::OK(); |
2336 | 1.19k | } |
2337 | | |
2338 | | Status CatalogManager::CreateCopartitionedTable(const CreateTableRequestPB& req, |
2339 | | CreateTableResponsePB* resp, |
2340 | | rpc::RpcContext* rpc, |
2341 | | Schema schema, |
2342 | 0 | scoped_refptr<NamespaceInfo> ns) { |
2343 | 0 | scoped_refptr<TableInfo> parent_table_info; |
2344 | 0 | Status s; |
2345 | 0 | PartitionSchema partition_schema; |
2346 | 0 | std::vector<Partition> partitions; |
2347 | |
|
2348 | 0 | const NamespaceId& namespace_id = ns->id(); |
2349 | 0 | const NamespaceName& namespace_name = ns->name(); |
2350 | |
|
2351 | 0 | LockGuard lock(mutex_); |
2352 | 0 | TRACE("Acquired catalog manager lock"); |
2353 | 0 | parent_table_info = FindPtrOrNull(*table_ids_map_, |
2354 | 0 | schema.table_properties().CopartitionTableId()); |
2355 | 0 | if (parent_table_info == nullptr) { |
2356 | 0 | s = STATUS(NotFound, "The object does not exist: copartitioned table with id", |
2357 | 0 | schema.table_properties().CopartitionTableId()); |
2358 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); |
2359 | 0 | } |
2360 | | |
2361 | 0 | TableInfoPtr this_table_info; |
2362 | | // Verify that the table does not exist. |
2363 | 0 | this_table_info = FindPtrOrNull(table_names_map_, {namespace_id, req.name()}); |
2364 | |
|
2365 | 0 | if (this_table_info != nullptr) { |
2366 | 0 | s = STATUS_SUBSTITUTE(AlreadyPresent, |
2367 | 0 | "Object '$0.$1' already exists", |
2368 | 0 | GetNamespaceNameUnlocked(this_table_info), this_table_info->name()); |
2369 | 0 | LOG(WARNING) << "Found table: " << this_table_info->ToStringWithState() |
2370 | 0 | << ". Failed creating copartitioned table with error: " |
2371 | 0 | << s.ToString() << " Request:\n" << req.DebugString(); |
2372 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s); |
2373 | 0 | } |
2374 | | // Don't add copartitioned tables to Namespaces that aren't running. |
2375 | 0 | if (ns->state() != SysNamespaceEntryPB::RUNNING) { |
2376 | 0 | Status s = STATUS_SUBSTITUTE(TryAgain, |
2377 | 0 | "Namespace not running (State=$0). Cannot create $1.$2", |
2378 | 0 | ns->state(), ns->name(), req.name() ); |
2379 | 0 | return SetupError(resp->mutable_error(), NamespaceMasterError(ns->state()), s); |
2380 | 0 | } |
2381 | | |
2382 | | // TODO: pass index_info for copartitioned index. |
2383 | 0 | RETURN_NOT_OK(CreateTableInMemory( |
2384 | 0 | req, schema, partition_schema, namespace_id, namespace_name, |
2385 | 0 | partitions, nullptr, nullptr, resp, &this_table_info)); |
2386 | | |
2387 | 0 | TRACE("Inserted new table info into CatalogManager maps"); |
2388 | | |
2389 | | // NOTE: the table is already locked for write at this point, |
2390 | | // since the CreateTableInfo function leave it in that state. |
2391 | | // It will get committed at the end of this function. |
2392 | | // Sanity check: the table should be in "preparing" state. |
2393 | 0 | CHECK_EQ(SysTablesEntryPB::PREPARING, this_table_info->metadata().dirty().pb.state()); |
2394 | 0 | TabletInfos tablets = parent_table_info->GetTablets(); |
2395 | 0 | for (auto tablet : tablets) { |
2396 | 0 | tablet->mutable_metadata()->StartMutation(); |
2397 | 0 | tablet->mutable_metadata()->mutable_dirty()->pb.add_table_ids(this_table_info->id()); |
2398 | 0 | } |
2399 | | |
2400 | | // Update Tablets about new table id to sys-tablets. |
2401 | 0 | s = sys_catalog_->Upsert(leader_ready_term(), tablets); |
2402 | 0 | if (PREDICT_FALSE(!s.ok())) { |
2403 | 0 | return AbortTableCreation(this_table_info.get(), tablets, s.CloneAndPrepend( |
2404 | 0 | Substitute("An error occurred while inserting to sys-tablets: $0", s.ToString())), resp); |
2405 | 0 | } |
2406 | 0 | TRACE("Wrote tablets to system table"); |
2407 | | |
2408 | | // Update the on-disk table state to "running". |
2409 | 0 | this_table_info->AddTablets(tablets); |
2410 | 0 | this_table_info->mutable_metadata()->mutable_dirty()->pb.set_state(SysTablesEntryPB::RUNNING); |
2411 | 0 | s = sys_catalog_->Upsert(leader_ready_term(), this_table_info); |
2412 | 0 | if (PREDICT_FALSE(!s.ok())) { |
2413 | 0 | return AbortTableCreation(this_table_info.get(), tablets, s.CloneAndPrepend( |
2414 | 0 | Substitute("An error occurred while inserting to sys-tablets: $0", |
2415 | 0 | s.ToString())), resp); |
2416 | 0 | } |
2417 | 0 | TRACE("Wrote table to system table"); |
2418 | | |
2419 | | // Commit the in-memory state. |
2420 | 0 | this_table_info->mutable_metadata()->CommitMutation(); |
2421 | |
|
2422 | 0 | for (const auto& tablet : tablets) { |
2423 | 0 | tablet->mutable_metadata()->CommitMutation(); |
2424 | 0 | } |
2425 | |
|
2426 | 0 | for (const auto& tablet : tablets) { |
2427 | 0 | SendCopartitionTabletRequest(tablet, this_table_info); |
2428 | 0 | } |
2429 | |
|
2430 | 0 | LOG(INFO) << "Successfully created table " << this_table_info->ToString() |
2431 | 0 | << " per request from " << RequestorString(rpc); |
2432 | 0 | return Status::OK(); |
2433 | 0 | } |
2434 | | |
2435 | | |
2436 | | template <class Req, class Resp, class Action> |
2437 | 0 | Status CatalogManager::PerformOnSysCatalogTablet(const Req& req, Resp* resp, const Action& action) { |
2438 | 0 | auto tablet_peer = sys_catalog_->tablet_peer(); |
2439 | 0 | auto shared_tablet = tablet_peer ? tablet_peer->shared_tablet() : nullptr; |
2440 | 0 | if (!shared_tablet) { |
2441 | 0 | return SetupError( |
2442 | 0 | resp->mutable_error(), |
2443 | 0 | MasterErrorPB::TABLET_NOT_RUNNING, |
2444 | 0 | STATUS(NotFound, "The sys catalog tablet was not found.")); |
2445 | 0 | } |
2446 | | |
2447 | 0 | auto s = action(shared_tablet); |
2448 | 0 | if (!s.ok()) { |
2449 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INTERNAL_ERROR, s); |
2450 | 0 | } |
2451 | | |
2452 | 0 | return Status::OK(); |
2453 | 0 | } Unexecuted instantiation: catalog_manager.cc:yb::Status yb::master::CatalogManager::PerformOnSysCatalogTablet<yb::master::FlushSysCatalogRequestPB const*, yb::master::FlushSysCatalogResponsePB, yb::master::CatalogManager::FlushSysCatalog(yb::master::FlushSysCatalogRequestPB const*, yb::master::FlushSysCatalogResponsePB*, yb::rpc::RpcContext*)::$_4>(yb::master::FlushSysCatalogRequestPB const* const&, yb::master::FlushSysCatalogResponsePB*, yb::master::CatalogManager::FlushSysCatalog(yb::master::FlushSysCatalogRequestPB const*, yb::master::FlushSysCatalogResponsePB*, yb::rpc::RpcContext*)::$_4 const&) Unexecuted instantiation: catalog_manager.cc:yb::Status yb::master::CatalogManager::PerformOnSysCatalogTablet<yb::master::CompactSysCatalogRequestPB const*, yb::master::CompactSysCatalogResponsePB, yb::master::CatalogManager::CompactSysCatalog(yb::master::CompactSysCatalogRequestPB const*, yb::master::CompactSysCatalogResponsePB*, yb::rpc::RpcContext*)::$_5>(yb::master::CompactSysCatalogRequestPB const* const&, yb::master::CompactSysCatalogResponsePB*, yb::master::CatalogManager::CompactSysCatalog(yb::master::CompactSysCatalogRequestPB const*, yb::master::CompactSysCatalogResponsePB*, yb::rpc::RpcContext*)::$_5 const&) |
2454 | | |
2455 | | Status CatalogManager::FlushSysCatalog( |
2456 | | const FlushSysCatalogRequestPB* req, |
2457 | | FlushSysCatalogResponsePB* resp, |
2458 | 0 | rpc::RpcContext* context) { |
2459 | 0 | return PerformOnSysCatalogTablet(req, resp, [](auto shared_tablet) { |
2460 | 0 | return shared_tablet->Flush(tablet::FlushMode::kSync); |
2461 | 0 | }); |
2462 | 0 | } |
2463 | | |
2464 | | Status CatalogManager::CompactSysCatalog( |
2465 | | const CompactSysCatalogRequestPB* req, |
2466 | | CompactSysCatalogResponsePB* resp, |
2467 | 0 | rpc::RpcContext* context) { |
2468 | 0 | return PerformOnSysCatalogTablet(req, resp, [](auto shared_tablet) { |
2469 | 0 | return shared_tablet->ForceFullRocksDBCompact(); |
2470 | 0 | }); |
2471 | 0 | } |
2472 | | |
2473 | | namespace { |
2474 | | |
2475 | | Result<std::array<PartitionPB, kNumSplitParts>> CreateNewTabletsPartition( |
2476 | 142 | const TabletInfo& tablet_info, const std::string& split_partition_key) { |
2477 | 142 | const auto& source_partition = tablet_info.LockForRead()->pb.partition(); |
2478 | | |
2479 | 142 | if (split_partition_key <= source_partition.partition_key_start() || |
2480 | 142 | (!source_partition.partition_key_end().empty() && |
2481 | 142 | split_partition_key >= source_partition.partition_key_end()78 )) { |
2482 | 0 | return STATUS_FORMAT( |
2483 | 0 | InvalidArgument, |
2484 | 0 | "Can't split tablet $0 (partition_key_start: $1 partition_key_end: $2) by partition " |
2485 | 0 | "boundary (split_key: $3)", |
2486 | 0 | tablet_info.tablet_id(), source_partition.partition_key_start(), |
2487 | 0 | source_partition.partition_key_end(), split_partition_key); |
2488 | 0 | } |
2489 | | |
2490 | 142 | std::array<PartitionPB, kNumSplitParts> new_tablets_partition; |
2491 | | |
2492 | 142 | new_tablets_partition.fill(source_partition); |
2493 | | |
2494 | 142 | new_tablets_partition[0].set_partition_key_end(split_partition_key); |
2495 | 142 | new_tablets_partition[1].set_partition_key_start(split_partition_key); |
2496 | 142 | static_assert(kNumSplitParts == 2, "We expect tablet to be split into 2 new tablets here"); |
2497 | | |
2498 | 142 | return new_tablets_partition; |
2499 | 142 | } |
2500 | | |
2501 | | } // namespace |
2502 | | |
2503 | | CHECKED_STATUS CatalogManager::TEST_SplitTablet( |
2504 | | const TabletId& tablet_id, const std::string& split_encoded_key, |
2505 | 0 | const std::string& split_partition_key) { |
2506 | 0 | auto source_tablet_info = VERIFY_RESULT(GetTabletInfo(tablet_id)); |
2507 | 0 | return DoSplitTablet(source_tablet_info, split_encoded_key, split_partition_key, |
2508 | 0 | true /* select_all_tablets_for_split */); |
2509 | 0 | } |
2510 | | |
2511 | | Status CatalogManager::TEST_SplitTablet( |
2512 | 0 | const scoped_refptr<TabletInfo>& source_tablet_info, docdb::DocKeyHash split_hash_code) { |
2513 | 0 | return DoSplitTablet(source_tablet_info, split_hash_code, |
2514 | 0 | true /* select_all_tablets_for_split */); |
2515 | 0 | } |
2516 | | |
2517 | 0 | Status CatalogManager::TEST_IncrementTablePartitionListVersion(const TableId& table_id) { |
2518 | 0 | auto table_info = GetTableInfo(table_id); |
2519 | 0 | SCHECK(table_info != nullptr, NotFound, Format("Table $0 not found", table_id)); |
2520 | | |
2521 | 0 | LockGuard lock(mutex_); |
2522 | 0 | auto table_lock = table_info->LockForWrite(); |
2523 | 0 | auto& table_pb = table_lock.mutable_data()->pb; |
2524 | 0 | table_pb.set_partition_list_version(table_pb.partition_list_version() + 1); |
2525 | 0 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), table_info)); |
2526 | 0 | table_lock.Commit(); |
2527 | 0 | return Status::OK(); |
2528 | 0 | } |
2529 | | |
2530 | | Result<ReplicationInfoPB> CatalogManager::GetTableReplicationInfo( |
2531 | 640 | const TabletInfo& tablet_info) const { |
2532 | 640 | auto table = tablet_info.table(); |
2533 | 640 | { |
2534 | 640 | auto table_lock = table->LockForRead(); |
2535 | 640 | if (table_lock->pb.has_replication_info()) { |
2536 | 0 | return table_lock->pb.replication_info(); |
2537 | 0 | } |
2538 | 640 | } |
2539 | | |
2540 | 640 | auto replication_info_opt = VERIFY_RESULT( |
2541 | 640 | GetTablespaceManager()->GetTableReplicationInfo(table)); |
2542 | 640 | if (replication_info_opt) { |
2543 | 0 | return replication_info_opt.value(); |
2544 | 0 | } |
2545 | | |
2546 | 640 | return ClusterConfig()->LockForRead()->pb.replication_info(); |
2547 | 640 | } |
2548 | | |
2549 | | bool CatalogManager::ShouldSplitValidCandidate( |
2550 | 3.49M | const TabletInfo& tablet_info, const TabletReplicaDriveInfo& drive_info) const { |
2551 | 3.49M | if (drive_info.may_have_orphaned_post_split_data) { |
2552 | 232k | return false; |
2553 | 232k | } |
2554 | 3.26M | ssize_t size = drive_info.sst_files_size; |
2555 | 3.26M | DCHECK(size >= 0) << "Detected overflow in casting sst_files_size to signed int."0 ; |
2556 | 3.26M | if (size < FLAGS_tablet_split_low_phase_size_threshold_bytes) { |
2557 | 3.26M | return false; |
2558 | 3.26M | } |
2559 | 640 | TSDescriptorVector ts_descs = GetAllLiveNotBlacklistedTServers(); |
2560 | | |
2561 | 640 | size_t num_servers = 0; |
2562 | 640 | auto table_replication_info_or_status = GetTableReplicationInfo(tablet_info); |
2563 | | |
2564 | | // If there is custom placement information present then |
2565 | | // only count the tservers which the table has access to |
2566 | | // according to the placement policy |
2567 | 640 | if (table_replication_info_or_status.ok() |
2568 | 640 | && table_replication_info_or_status->has_live_replicas()) { |
2569 | 0 | auto pb = table_replication_info_or_status->live_replicas(); |
2570 | 0 | auto valid_tservers_res = FindTServersForPlacementInfo( |
2571 | 0 | table_replication_info_or_status->live_replicas(), ts_descs); |
2572 | 0 | if (!valid_tservers_res.ok()) { |
2573 | 0 | num_servers = ts_descs.size(); |
2574 | 0 | } else { |
2575 | 0 | num_servers = valid_tservers_res.get().size(); |
2576 | 0 | } |
2577 | 640 | } else { |
2578 | 640 | num_servers = ts_descs.size(); |
2579 | 640 | } |
2580 | | |
2581 | 640 | int64 num_tablets_per_server = tablet_info.table()->NumPartitions() / num_servers; |
2582 | | |
2583 | 640 | if (num_tablets_per_server < FLAGS_tablet_split_low_phase_shard_count_per_node) { |
2584 | 0 | return size > FLAGS_tablet_split_low_phase_size_threshold_bytes; |
2585 | 0 | } |
2586 | 640 | if (num_tablets_per_server < FLAGS_tablet_split_high_phase_shard_count_per_node) { |
2587 | 0 | return size > FLAGS_tablet_split_high_phase_size_threshold_bytes; |
2588 | 0 | } |
2589 | 640 | return size > FLAGS_tablet_force_split_threshold_bytes; |
2590 | 640 | } |
2591 | | |
2592 | | Status CatalogManager::DoSplitTablet( |
2593 | | const scoped_refptr<TabletInfo>& source_tablet_info, std::string split_encoded_key, |
2594 | 142 | std::string split_partition_key, bool select_all_tablets_for_split) { |
2595 | 142 | auto source_table_lock = source_tablet_info->table()->LockForWrite(); |
2596 | 142 | auto source_tablet_lock = source_tablet_info->LockForWrite(); |
2597 | | |
2598 | | // We must re-validate the split candidate here *after* grabbing locks on the table and tablet to |
2599 | | // ensure a backfill does not happen before we modify catalog metadata to include new subtablets. |
2600 | | // This process adds new subtablets in the CREATING state, which if encountered by backfill code |
2601 | | // will block the backfill process. |
2602 | 142 | RETURN_NOT_OK(tablet_split_manager_.ValidateSplitCandidateTable(*source_tablet_info->table())); |
2603 | 142 | RETURN_NOT_OK(tablet_split_manager_.ValidateSplitCandidateTablet(*source_tablet_info)); |
2604 | | |
2605 | 142 | auto drive_info = VERIFY_RESULT(source_tablet_info->GetLeaderReplicaDriveInfo()); |
2606 | 142 | if (!select_all_tablets_for_split && |
2607 | 142 | !ShouldSplitValidCandidate(*source_tablet_info, drive_info)133 ) { |
2608 | | // It is possible that we queued up a split candidate in TabletSplitManager which was, at the |
2609 | | // time, a valid split candidate, but by the time the candidate was actually processed here, the |
2610 | | // cluster may have changed, putting us in a new split threshold phase, and it may no longer be |
2611 | | // a valid candidate. This is not an unexpected error, but we should bail out of splitting this |
2612 | | // tablet regardless. |
2613 | 0 | return STATUS_FORMAT( |
2614 | 0 | InvalidArgument, |
2615 | 0 | "Tablet split candidate $0 is no longer a valid split candidate.", |
2616 | 0 | source_tablet_info->tablet_id()); |
2617 | 0 | } |
2618 | | |
2619 | | // Check if at least one child tablet already registered |
2620 | 142 | if (source_tablet_lock->pb.split_tablet_ids().size() > 0) { |
2621 | 98 | const auto child_tablet_id = source_tablet_lock->pb.split_tablet_ids(0); |
2622 | 98 | const auto child_tablet = VERIFY_RESULT(GetTabletInfo(child_tablet_id)); |
2623 | 0 | const auto parent_partition = source_tablet_lock->pb.partition(); |
2624 | 98 | const auto child_partition = child_tablet->LockForRead()->pb.partition(); |
2625 | | |
2626 | 98 | if (parent_partition.partition_key_start() == child_partition.partition_key_start()) { |
2627 | 98 | split_partition_key = child_partition.partition_key_end(); |
2628 | 98 | } else { |
2629 | 0 | SCHECK_EQ(parent_partition.partition_key_end(), child_partition.partition_key_end(), |
2630 | 0 | IllegalState, "Parent partion key end does not equal child partition key end"); |
2631 | 0 | split_partition_key = child_partition.partition_key_start(); |
2632 | 0 | } |
2633 | | |
2634 | | // Re-compute the encoded key |
2635 | | // to ensure we use the same partition boundary for both child tablets |
2636 | 98 | split_encoded_key = PartitionSchema::GetEncodedKeyPrefix( |
2637 | 98 | split_partition_key, source_table_lock->pb.partition_schema()); |
2638 | 98 | } |
2639 | | |
2640 | 142 | LOG(INFO) << "Starting tablet split: " << source_tablet_info->ToString() |
2641 | 142 | << " by partition key: " << Slice(split_partition_key).ToDebugHexString(); |
2642 | | |
2643 | 142 | std::array<PartitionPB, kNumSplitParts> new_tablets_partition = VERIFY_RESULT( |
2644 | 142 | CreateNewTabletsPartition(*source_tablet_info, split_partition_key)); |
2645 | | |
2646 | 0 | std::array<TabletId, kNumSplitParts> new_tablet_ids; |
2647 | 424 | for (int i = 0; i < kNumSplitParts; ++i282 ) { |
2648 | 282 | if (i < source_tablet_lock->pb.split_tablet_ids_size()) { |
2649 | | // Post-split tablet `i` has been already registered. |
2650 | 194 | new_tablet_ids[i] = source_tablet_lock->pb.split_tablet_ids(i); |
2651 | 194 | } else { |
2652 | 88 | auto new_tablet_info = VERIFY_RESULT(RegisterNewTabletForSplit( |
2653 | 88 | source_tablet_info.get(), new_tablets_partition[i], |
2654 | 88 | &source_table_lock, &source_tablet_lock)); |
2655 | | |
2656 | 0 | new_tablet_ids[i] = new_tablet_info->id(); |
2657 | 88 | } |
2658 | 282 | } |
2659 | 142 | source_tablet_lock.Commit(); |
2660 | 142 | source_table_lock.Commit(); |
2661 | | |
2662 | | // TODO(tsplit): what if source tablet will be deleted before or during TS leader is processing |
2663 | | // split? Add unit-test. |
2664 | 142 | RETURN_NOT_OK(SendSplitTabletRequest( |
2665 | 142 | source_tablet_info, new_tablet_ids, split_encoded_key, split_partition_key)); |
2666 | | |
2667 | 142 | return Status::OK(); |
2668 | 142 | } |
2669 | | |
2670 | | Status CatalogManager::DoSplitTablet( |
2671 | | const scoped_refptr<TabletInfo>& source_tablet_info, docdb::DocKeyHash split_hash_code, |
2672 | 0 | bool select_all_tablets_for_split) { |
2673 | 0 | docdb::KeyBytes split_encoded_key; |
2674 | 0 | docdb::DocKeyEncoderAfterTableIdStep(&split_encoded_key) |
2675 | 0 | .Hash(split_hash_code, std::vector<docdb::PrimitiveValue>()); |
2676 | |
|
2677 | 0 | const auto split_partition_key = PartitionSchema::EncodeMultiColumnHashValue(split_hash_code); |
2678 | |
|
2679 | 0 | return DoSplitTablet(source_tablet_info, split_encoded_key.ToStringBuffer(), split_partition_key, |
2680 | 0 | select_all_tablets_for_split); |
2681 | 0 | } |
2682 | | |
2683 | 247 | Result<scoped_refptr<TabletInfo>> CatalogManager::GetTabletInfo(const TabletId& tablet_id) { |
2684 | 247 | LockGuard lock(mutex_); |
2685 | 247 | TRACE("Acquired catalog manager lock"); |
2686 | | |
2687 | 247 | const auto tablet_info = FindPtrOrNull(*tablet_map_, tablet_id); |
2688 | 247 | SCHECK(tablet_info != nullptr, NotFound, Format("Tablet $0 not found", tablet_id)); |
2689 | | |
2690 | 247 | return tablet_info; |
2691 | 247 | } |
2692 | | |
2693 | | void CatalogManager::SplitTabletWithKey( |
2694 | | const scoped_refptr<TabletInfo>& tablet, const std::string& split_encoded_key, |
2695 | 142 | const std::string& split_partition_key, const bool select_all_tablets_for_split) { |
2696 | | // Note that DoSplitTablet() will trigger an async SplitTablet task, and will only return not OK() |
2697 | | // if it failed to submit that task. In other words, any failures here are not retriable, and |
2698 | | // success indicates that an async and automatically retrying task was submitted. |
2699 | 142 | auto s = DoSplitTablet( |
2700 | 142 | tablet, split_encoded_key, split_partition_key, select_all_tablets_for_split); |
2701 | 142 | WARN_NOT_OK(s, Format("Failed to split tablet with GetSplitKey result for tablet: $0", |
2702 | 142 | tablet->tablet_id())); |
2703 | 142 | } |
2704 | | |
2705 | 143 | Status CatalogManager::SplitTablet(const TabletId& tablet_id, bool select_all_tablets_for_split) { |
2706 | 143 | LOG(INFO) << "Got tablet to split: " << tablet_id; |
2707 | | |
2708 | 143 | const auto tablet = VERIFY_RESULT(GetTabletInfo(tablet_id)); |
2709 | | |
2710 | 0 | VLOG(2) << "Scheduling GetSplitKey request to leader tserver for source tablet ID: " |
2711 | 0 | << tablet->tablet_id(); |
2712 | 143 | auto call = std::make_shared<AsyncGetTabletSplitKey>( |
2713 | 143 | master_, AsyncTaskPool(), tablet, |
2714 | 143 | [this, tablet, select_all_tablets_for_split] |
2715 | 143 | (const Result<AsyncGetTabletSplitKey::Data>& result) { |
2716 | 143 | if (result.ok()) { |
2717 | 142 | SplitTabletWithKey(tablet, result->split_encoded_key, result->split_partition_key, |
2718 | 142 | select_all_tablets_for_split); |
2719 | 142 | } else if (1 tserver::TabletServerError(result.status()) == |
2720 | 1 | tserver::TabletServerErrorPB::TABLET_SPLIT_DISABLED_TTL_EXPIRY) { |
2721 | 0 | tablet_split_manager()->MarkTtlTableForSplitIgnore(tablet->table()->id()); |
2722 | 0 | LOG(INFO) << "AsyncGetTabletSplitKey task failed for tablet " << tablet->tablet_id() |
2723 | 0 | << ". Tablet split not supported for tablets with TTL file expiration."; |
2724 | 1 | } else { |
2725 | 1 | LOG(WARNING) << "AsyncGetTabletSplitKey task failed with status: " << result.status(); |
2726 | 1 | } |
2727 | 143 | }); |
2728 | 143 | tablet->table()->AddTask(call); |
2729 | 143 | return ScheduleTask(call); |
2730 | 143 | } |
2731 | | |
2732 | | Status CatalogManager::SplitTablet( |
2733 | 10 | const SplitTabletRequestPB* req, SplitTabletResponsePB* resp, rpc::RpcContext* rpc) { |
2734 | 10 | const auto source_tablet_id = req->tablet_id(); |
2735 | 10 | return SplitTablet(source_tablet_id, true /* select_all_tablets_for_split */); |
2736 | 10 | } |
2737 | | |
2738 | | Status CatalogManager::DeleteNotServingTablet( |
2739 | | const DeleteNotServingTabletRequestPB* req, DeleteNotServingTabletResponsePB* resp, |
2740 | 6 | rpc::RpcContext* rpc) { |
2741 | 6 | const auto& tablet_id = req->tablet_id(); |
2742 | 6 | const auto tablet_info = VERIFY_RESULT(GetTabletInfo(tablet_id)); |
2743 | | |
2744 | 6 | if (PREDICT_FALSE(FLAGS_TEST_reject_delete_not_serving_tablet_rpc)) { |
2745 | 0 | TEST_SYNC_POINT("CatalogManager::DeleteNotServingTablet:Reject"); |
2746 | 0 | return STATUS( |
2747 | 0 | InvalidArgument, "Rejecting due to FLAGS_TEST_reject_delete_not_serving_tablet_rpc"); |
2748 | 0 | } |
2749 | | |
2750 | 6 | const auto& table_info = tablet_info->table(); |
2751 | | |
2752 | 6 | RETURN_NOT_OK(CheckIfForbiddenToDeleteTabletOf(table_info)); |
2753 | | |
2754 | 6 | RETURN_NOT_OK(CatalogManagerUtil::CheckIfCanDeleteSingleTablet(tablet_info)); |
2755 | | |
2756 | 6 | auto schedules_to_tables_map = VERIFY_RESULT( |
2757 | 6 | MakeSnapshotSchedulesToObjectIdsMap(SysRowEntryType::TABLE)); |
2758 | 0 | RepeatedBytes retained_by_snapshot_schedules; |
2759 | 6 | FillRetainedBySnapshotSchedules( |
2760 | 6 | schedules_to_tables_map, table_info->id(), &retained_by_snapshot_schedules); |
2761 | | |
2762 | 6 | return DeleteTabletListAndSendRequests( |
2763 | 6 | { tablet_info }, "Not serving tablet deleted upon request at " + LocalTimeAsString(), |
2764 | 6 | retained_by_snapshot_schedules); |
2765 | 6 | } |
2766 | | |
2767 | | Status CatalogManager::DdlLog( |
2768 | 1 | const DdlLogRequestPB* req, DdlLogResponsePB* resp, rpc::RpcContext* rpc) { |
2769 | 1 | return sys_catalog_->FetchDdlLog(resp->mutable_entries()); |
2770 | 1 | } |
2771 | | |
2772 | | namespace { |
2773 | | |
2774 | 21.8k | CHECKED_STATUS ValidateCreateTableSchema(const Schema& schema, CreateTableResponsePB* resp) { |
2775 | 21.8k | if (schema.num_key_columns() <= 0) { |
2776 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, |
2777 | 0 | STATUS(InvalidArgument, "Must specify at least one key column")); |
2778 | 0 | } |
2779 | 61.3k | for (size_t i = 0; 21.8k i < schema.num_key_columns(); i++39.5k ) { |
2780 | 39.5k | if (!IsTypeAllowableInKey(schema.column(i).type_info())) { |
2781 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, |
2782 | 0 | STATUS(InvalidArgument, "Invalid datatype for primary key column")); |
2783 | 0 | } |
2784 | 39.5k | } |
2785 | 21.8k | return Status::OK(); |
2786 | 21.8k | } |
2787 | | |
2788 | | // Extract a colocation ID from request if explicitly passed, or generate a new valid one. |
2789 | | // Will error if requested ID is taken or invalid. |
2790 | | template<typename ContainsColocationIdFn> |
2791 | | Result<ColocationId> ConceiveColocationId(const CreateTableRequestPB& req, |
2792 | | CreateTableResponsePB* resp, |
2793 | 129 | ContainsColocationIdFn contains_colocation_id) { |
2794 | 129 | ColocationId colocation_id; |
2795 | | |
2796 | 129 | if (req.has_colocation_id()) { |
2797 | 24 | colocation_id = req.colocation_id(); |
2798 | 24 | if (colocation_id < kFirstNormalColocationId) { |
2799 | 0 | Status s = STATUS_SUBSTITUTE(InvalidArgument, |
2800 | 0 | "Colocation ID cannot be less than $0", |
2801 | 0 | kFirstNormalColocationId); |
2802 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
2803 | 0 | } |
2804 | 24 | if (contains_colocation_id(colocation_id)) { |
2805 | 2 | Status s = |
2806 | 2 | STATUS_SUBSTITUTE(InvalidArgument, |
2807 | 2 | "Colocation group already contains a table with colocation ID $0", |
2808 | 2 | colocation_id); |
2809 | 2 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
2810 | 2 | } |
2811 | 105 | } else { |
2812 | | // Generate a random colocation ID unique within colocation group. |
2813 | 105 | colocation_id = 20000; // In agreement with sequential_colocation_ids flag. |
2814 | 125 | do { |
2815 | 125 | if (PREDICT_FALSE(FLAGS_TEST_sequential_colocation_ids)) { |
2816 | 40 | colocation_id++; |
2817 | 85 | } else { |
2818 | | // See comment on kFirstNormalColocationId. |
2819 | 85 | colocation_id = |
2820 | 85 | RandomUniformInt<ColocationId>(kFirstNormalColocationId, |
2821 | 85 | std::numeric_limits<ColocationId>::max()); |
2822 | 85 | } |
2823 | 125 | } while (contains_colocation_id(colocation_id)); |
2824 | 105 | } |
2825 | | |
2826 | 127 | return colocation_id; |
2827 | 129 | } catalog_manager.cc:yb::Result<unsigned int> yb::master::(anonymous namespace)::ConceiveColocationId<yb::master::CatalogManager::CreateTable(yb::master::CreateTableRequestPB const*, yb::master::CreateTableResponsePB*, yb::rpc::RpcContext*)::$_7>(yb::master::CreateTableRequestPB const&, yb::master::CreateTableResponsePB*, yb::master::CatalogManager::CreateTable(yb::master::CreateTableRequestPB const*, yb::master::CreateTableResponsePB*, yb::rpc::RpcContext*)::$_7) Line | Count | Source | 2793 | 89 | ContainsColocationIdFn contains_colocation_id) { | 2794 | 89 | ColocationId colocation_id; | 2795 | | | 2796 | 89 | if (req.has_colocation_id()) { | 2797 | 21 | colocation_id = req.colocation_id(); | 2798 | 21 | if (colocation_id < kFirstNormalColocationId) { | 2799 | 0 | Status s = STATUS_SUBSTITUTE(InvalidArgument, | 2800 | 0 | "Colocation ID cannot be less than $0", | 2801 | 0 | kFirstNormalColocationId); | 2802 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); | 2803 | 0 | } | 2804 | 21 | if (contains_colocation_id(colocation_id)) { | 2805 | 2 | Status s = | 2806 | 2 | STATUS_SUBSTITUTE(InvalidArgument, | 2807 | 2 | "Colocation group already contains a table with colocation ID $0", | 2808 | 2 | colocation_id); | 2809 | 2 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); | 2810 | 2 | } | 2811 | 68 | } else { | 2812 | | // Generate a random colocation ID unique within colocation group. | 2813 | 68 | colocation_id = 20000; // In agreement with sequential_colocation_ids flag. | 2814 | 86 | do { | 2815 | 86 | if (PREDICT_FALSE(FLAGS_TEST_sequential_colocation_ids)) { | 2816 | 33 | colocation_id++; | 2817 | 53 | } else { | 2818 | | // See comment on kFirstNormalColocationId. 
| 2819 | 53 | colocation_id = | 2820 | 53 | RandomUniformInt<ColocationId>(kFirstNormalColocationId, | 2821 | 53 | std::numeric_limits<ColocationId>::max()); | 2822 | 53 | } | 2823 | 86 | } while (contains_colocation_id(colocation_id)); | 2824 | 68 | } | 2825 | | | 2826 | 87 | return colocation_id; | 2827 | 89 | } |
catalog_manager.cc:yb::Result<unsigned int> yb::master::(anonymous namespace)::ConceiveColocationId<yb::master::CatalogManager::CreateTable(yb::master::CreateTableRequestPB const*, yb::master::CreateTableResponsePB*, yb::rpc::RpcContext*)::$_8>(yb::master::CreateTableRequestPB const&, yb::master::CreateTableResponsePB*, yb::master::CatalogManager::CreateTable(yb::master::CreateTableRequestPB const*, yb::master::CreateTableResponsePB*, yb::rpc::RpcContext*)::$_8) Line | Count | Source | 2793 | 40 | ContainsColocationIdFn contains_colocation_id) { | 2794 | 40 | ColocationId colocation_id; | 2795 | | | 2796 | 40 | if (req.has_colocation_id()) { | 2797 | 3 | colocation_id = req.colocation_id(); | 2798 | 3 | if (colocation_id < kFirstNormalColocationId) { | 2799 | 0 | Status s = STATUS_SUBSTITUTE(InvalidArgument, | 2800 | 0 | "Colocation ID cannot be less than $0", | 2801 | 0 | kFirstNormalColocationId); | 2802 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); | 2803 | 0 | } | 2804 | 3 | if (contains_colocation_id(colocation_id)) { | 2805 | 0 | Status s = | 2806 | 0 | STATUS_SUBSTITUTE(InvalidArgument, | 2807 | 0 | "Colocation group already contains a table with colocation ID $0", | 2808 | 0 | colocation_id); | 2809 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); | 2810 | 0 | } | 2811 | 37 | } else { | 2812 | | // Generate a random colocation ID unique within colocation group. | 2813 | 37 | colocation_id = 20000; // In agreement with sequential_colocation_ids flag. | 2814 | 39 | do { | 2815 | 39 | if (PREDICT_FALSE(FLAGS_TEST_sequential_colocation_ids)) { | 2816 | 7 | colocation_id++; | 2817 | 32 | } else { | 2818 | | // See comment on kFirstNormalColocationId. 
| 2819 | 32 | colocation_id = | 2820 | 32 | RandomUniformInt<ColocationId>(kFirstNormalColocationId, | 2821 | 32 | std::numeric_limits<ColocationId>::max()); | 2822 | 32 | } | 2823 | 39 | } while (contains_colocation_id(colocation_id)); | 2824 | 37 | } | 2825 | | | 2826 | 40 | return colocation_id; | 2827 | 40 | } |
|
2828 | | |
2829 | | } // namespace |
2830 | | |
2831 | | Status CatalogManager::CreateYsqlSysTable(const CreateTableRequestPB* req, |
2832 | 13.3k | CreateTableResponsePB* resp) { |
2833 | 13.3k | LOG(INFO) << "CreateYsqlSysTable: " << req->name(); |
2834 | | // Lookup the namespace and verify if it exists. |
2835 | 13.3k | TRACE("Looking up namespace"); |
2836 | 13.3k | auto ns = VERIFY_RESULT(FindNamespace(req->namespace_())); |
2837 | 0 | const NamespaceId& namespace_id = ns->id(); |
2838 | 13.3k | const NamespaceName& namespace_name = ns->name(); |
2839 | | |
2840 | 13.3k | Schema schema; |
2841 | 13.3k | RETURN_NOT_OK(SchemaFromPB(req->schema(), &schema)); |
2842 | | // If the schema contains column ids, we are copying a Postgres table from one namespace to |
2843 | | // another. Anyway, validate the schema. |
2844 | 13.3k | RETURN_NOT_OK(ValidateCreateTableSchema(schema, resp)); |
2845 | 13.3k | if (!schema.has_column_ids()) { |
2846 | 313 | schema.InitColumnIdsByDefault(); |
2847 | 313 | } |
2848 | 13.3k | schema.mutable_table_properties()->set_is_ysql_catalog_table(true); |
2849 | | |
2850 | | // Verify no hash partition schema is specified. |
2851 | 13.3k | if (req->partition_schema().has_hash_schema()) { |
2852 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, |
2853 | 0 | STATUS(InvalidArgument, |
2854 | 0 | "PostgreSQL system catalog tables are non-partitioned")); |
2855 | 0 | } |
2856 | | |
2857 | 13.3k | if (req->table_type() != TableType::PGSQL_TABLE_TYPE) { |
2858 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, |
2859 | 0 | STATUS_FORMAT( |
2860 | 0 | InvalidArgument, |
2861 | 0 | "Expected table type to be PGSQL_TABLE_TYPE ($0), got $1 ($2)", |
2862 | 0 | PGSQL_TABLE_TYPE, |
2863 | 0 | TableType_Name(req->table_type()))); |
2864 | |
|
2865 | 0 | } |
2866 | | |
2867 | | // Create partition schema and one partition. |
2868 | 13.3k | PartitionSchema partition_schema; |
2869 | 13.3k | vector<Partition> partitions; |
2870 | 13.3k | RETURN_NOT_OK(partition_schema.CreatePartitions(1, &partitions)); |
2871 | | |
2872 | | // Create table info in memory. |
2873 | 13.3k | scoped_refptr<TableInfo> table; |
2874 | 13.3k | scoped_refptr<TabletInfo> sys_catalog_tablet; |
2875 | 13.3k | { |
2876 | 13.3k | LockGuard lock(mutex_); |
2877 | 13.3k | TRACE("Acquired catalog manager lock"); |
2878 | | |
2879 | | // Verify that the table does not exist, or has been deleted. |
2880 | 13.3k | table = FindPtrOrNull(*table_ids_map_, req->table_id()); |
2881 | 13.3k | if (table != nullptr && !table->is_deleted()1 ) { |
2882 | 0 | Status s = STATUS_SUBSTITUTE(AlreadyPresent, |
2883 | 0 | "YSQL table '$0.$1' (ID: $2) already exists", ns->name(), table->name(), table->id()); |
2884 | 0 | LOG(WARNING) << "Found table: " << table->ToStringWithState() |
2885 | 0 | << ". Failed creating YSQL system table with error: " |
2886 | 0 | << s.ToString() << " Request:\n" << req->DebugString(); |
2887 | | // Technically, client already knows table ID, but we set it anyway for unified handling of |
2888 | | // AlreadyPresent errors. See comment in CreateTable() |
2889 | 0 | resp->set_table_id(table->id()); |
2890 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s); |
2891 | 0 | } |
2892 | | |
2893 | 13.3k | RETURN_NOT_OK(CreateTableInMemory( |
2894 | 13.3k | *req, schema, partition_schema, namespace_id, namespace_name, |
2895 | 13.3k | partitions, nullptr /* index_info */, nullptr /* tablets */, resp, &table)); |
2896 | | |
2897 | 13.3k | sys_catalog_tablet = tablet_map_->find(kSysCatalogTabletId)->second; |
2898 | 13.3k | } |
2899 | | |
2900 | | // Tables with a transaction should be rolled back if the transaction does not get committed. |
2901 | | // Store this on the table persistent state until the transaction has been a verified success. |
2902 | 0 | TransactionMetadata txn; |
2903 | 13.3k | if (req->has_transaction() && FLAGS_enable_transactional_ddl_gc55 ) { |
2904 | 55 | table->mutable_metadata()->mutable_dirty()->pb.mutable_transaction()-> |
2905 | 55 | CopyFrom(req->transaction()); |
2906 | 55 | txn = VERIFY_RESULT(TransactionMetadata::FromPB(req->transaction())); |
2907 | 55 | RSTATUS_DCHECK(!txn.status_tablet.empty(), Corruption, "Given incomplete Transaction"); |
2908 | 55 | } |
2909 | | |
2910 | 13.3k | { |
2911 | 13.3k | auto tablet_lock = sys_catalog_tablet->LockForWrite(); |
2912 | 13.3k | tablet_lock.mutable_data()->pb.add_table_ids(table->id()); |
2913 | | |
2914 | 13.3k | Status s = sys_catalog_->Upsert(leader_ready_term(), sys_catalog_tablet); |
2915 | 13.3k | if (PREDICT_FALSE(!s.ok())) { |
2916 | 1 | return AbortTableCreation(table.get(), {}, s.CloneAndPrepend( |
2917 | 1 | "An error occurred while inserting to sys-tablets: "), resp); |
2918 | 1 | } |
2919 | 13.3k | table->set_is_system(); |
2920 | 13.3k | table->AddTablet(sys_catalog_tablet.get()); |
2921 | 13.3k | tablet_lock.Commit(); |
2922 | 13.3k | } |
2923 | 13.3k | TRACE("Inserted new table info into CatalogManager maps"); |
2924 | | |
2925 | | // Update the on-disk table state to "running". |
2926 | 13.3k | table->mutable_metadata()->mutable_dirty()->pb.set_state(SysTablesEntryPB::RUNNING); |
2927 | 13.3k | Status s = sys_catalog_->Upsert(leader_ready_term(), table); |
2928 | 13.3k | if (PREDICT_FALSE(!s.ok())) { |
2929 | 0 | return AbortTableCreation(table.get(), {}, s.CloneAndPrepend( |
2930 | 0 | "An error occurred while inserting to sys-tablets: "), resp); |
2931 | 0 | } |
2932 | 13.3k | TRACE("Wrote table to system table"); |
2933 | | |
2934 | | // Commit the in-memory state. |
2935 | 13.3k | table->mutable_metadata()->CommitMutation(); |
2936 | | |
2937 | | // Verify Transaction gets committed, which occurs after table create finishes. |
2938 | 13.3k | if (req->has_transaction() && PREDICT_TRUE55 (FLAGS_enable_transactional_ddl_gc)) { |
2939 | 55 | LOG(INFO) << "Enqueuing table for Transaction Verification: " << req->name(); |
2940 | 55 | std::function<Status(bool)> when_done = |
2941 | 55 | std::bind(&CatalogManager::VerifyTablePgLayer, this, table, _1); |
2942 | 55 | WARN_NOT_OK(background_tasks_thread_pool_->SubmitFunc( |
2943 | 55 | std::bind(&YsqlTransactionDdl::VerifyTransaction, ysql_transaction_.get(), txn, when_done)), |
2944 | 55 | "Could not submit VerifyTransaction to thread pool"); |
2945 | 55 | } |
2946 | | |
2947 | 13.3k | tablet::ChangeMetadataRequestPB change_req; |
2948 | 13.3k | change_req.set_tablet_id(kSysCatalogTabletId); |
2949 | 13.3k | auto& add_table = *change_req.mutable_add_table(); |
2950 | | |
2951 | 13.3k | add_table.set_table_id(req->table_id()); |
2952 | 13.3k | add_table.set_table_type(TableType::PGSQL_TABLE_TYPE); |
2953 | 13.3k | add_table.set_table_name(req->name()); |
2954 | 13.3k | SchemaToPB(schema, add_table.mutable_schema()); |
2955 | 13.3k | add_table.set_schema_version(0); |
2956 | | |
2957 | 13.3k | partition_schema.ToPB(add_table.mutable_partition_schema()); |
2958 | | |
2959 | 13.3k | RETURN_NOT_OK(tablet::SyncReplicateChangeMetadataOperation( |
2960 | 13.3k | &change_req, sys_catalog_->tablet_peer().get(), leader_ready_term())); |
2961 | | |
2962 | 13.3k | if (initial_snapshot_writer_) { |
2963 | 1.12k | initial_snapshot_writer_->AddMetadataChange(change_req); |
2964 | 1.12k | } |
2965 | 13.3k | return Status::OK(); |
2966 | 13.3k | } |
2967 | | |
2968 | | Status CatalogManager::ReservePgsqlOids(const ReservePgsqlOidsRequestPB* req, |
2969 | | ReservePgsqlOidsResponsePB* resp, |
2970 | 805 | rpc::RpcContext* rpc) { |
2971 | 805 | VLOG(1) << "ReservePgsqlOids request: " << req->ShortDebugString()0 ; |
2972 | | |
2973 | | // Lookup namespace |
2974 | 805 | scoped_refptr<NamespaceInfo> ns; |
2975 | 805 | { |
2976 | 805 | SharedLock lock(mutex_); |
2977 | 805 | ns = FindPtrOrNull(namespace_ids_map_, req->namespace_id()); |
2978 | 805 | } |
2979 | 805 | if (!ns) { |
2980 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, |
2981 | 0 | STATUS(NotFound, "Namespace not found", req->namespace_id())); |
2982 | 0 | } |
2983 | | |
2984 | | // Reserve oids. |
2985 | 805 | auto l = ns->LockForWrite(); |
2986 | | |
2987 | 805 | uint32_t begin_oid = l->pb.next_pg_oid(); |
2988 | 805 | if (begin_oid < req->next_oid()) { |
2989 | 709 | begin_oid = req->next_oid(); |
2990 | 709 | } |
2991 | 805 | if (begin_oid == std::numeric_limits<uint32_t>::max()) { |
2992 | 0 | LOG(WARNING) << Format("No more object identifier is available for Postgres database $0 ($1)", |
2993 | 0 | l->pb.name(), req->namespace_id()); |
2994 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::UNKNOWN_ERROR, |
2995 | 0 | STATUS(InvalidArgument, "No more object identifier is available")); |
2996 | 0 | } |
2997 | | |
2998 | 805 | uint32_t end_oid = begin_oid + req->count(); |
2999 | 805 | if (end_oid < begin_oid) { |
3000 | 0 | end_oid = std::numeric_limits<uint32_t>::max(); // Handle wraparound. |
3001 | 0 | } |
3002 | | |
3003 | 805 | resp->set_begin_oid(begin_oid); |
3004 | 805 | resp->set_end_oid(end_oid); |
3005 | 805 | l.mutable_data()->pb.set_next_pg_oid(end_oid); |
3006 | | |
3007 | | // Update the on-disk state. |
3008 | 805 | const Status s = sys_catalog_->Upsert(leader_ready_term(), ns); |
3009 | 805 | if (!s.ok()) { |
3010 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::UNKNOWN_ERROR, s); |
3011 | 0 | } |
3012 | | |
3013 | | // Commit the in-memory state. |
3014 | 805 | l.Commit(); |
3015 | | |
3016 | 805 | VLOG(1) << "ReservePgsqlOids response: " << resp->ShortDebugString()0 ; |
3017 | | |
3018 | 805 | return Status::OK(); |
3019 | 805 | } |
3020 | | |
3021 | | Status CatalogManager::GetYsqlCatalogConfig(const GetYsqlCatalogConfigRequestPB* req, |
3022 | | GetYsqlCatalogConfigResponsePB* resp, |
3023 | 22 | rpc::RpcContext* rpc) { |
3024 | 22 | VLOG(1) << "GetYsqlCatalogConfig request: " << req->ShortDebugString()0 ; |
3025 | 22 | auto l = CHECK_NOTNULL(ysql_catalog_config_.get())->LockForRead(); |
3026 | 22 | resp->set_version(l->pb.ysql_catalog_config().version()); |
3027 | | |
3028 | 22 | return Status::OK(); |
3029 | 22 | } |
3030 | | |
3031 | | Status CatalogManager::CopyPgsqlSysTables(const NamespaceId& namespace_id, |
3032 | 123 | const std::vector<scoped_refptr<TableInfo>>& tables) { |
3033 | 123 | const uint32_t database_oid = CHECK_RESULT(GetPgsqlDatabaseOid(namespace_id)); |
3034 | 123 | vector<TableId> source_table_ids; |
3035 | 123 | vector<TableId> target_table_ids; |
3036 | 15.4k | for (const auto& table : tables) { |
3037 | 15.4k | CreateTableRequestPB table_req; |
3038 | 15.4k | CreateTableResponsePB table_resp; |
3039 | | |
3040 | 15.4k | const uint32_t table_oid = VERIFY_RESULT(GetPgsqlTableOid(table->id())); |
3041 | 0 | const TableId table_id = GetPgsqlTableId(database_oid, table_oid); |
3042 | | |
3043 | | // Hold read lock until rows from the table are copied also. |
3044 | 15.4k | auto l = table->LockForRead(); |
3045 | | |
3046 | | // Skip shared table. |
3047 | 15.4k | if (l->pb.is_pg_shared_table()) { |
3048 | 2.37k | continue; |
3049 | 2.37k | } |
3050 | | |
3051 | 13.0k | table_req.set_name(l->pb.name()); |
3052 | 13.0k | table_req.mutable_namespace_()->set_id(namespace_id); |
3053 | 13.0k | table_req.set_table_type(PGSQL_TABLE_TYPE); |
3054 | 13.0k | table_req.mutable_schema()->CopyFrom(l->schema()); |
3055 | 13.0k | table_req.set_is_pg_catalog_table(true); |
3056 | 13.0k | table_req.set_table_id(table_id); |
3057 | | |
3058 | 13.0k | if (IsIndex(l->pb)) { |
3059 | 5.80k | const uint32_t indexed_table_oid = |
3060 | 5.80k | VERIFY_RESULT(GetPgsqlTableOid(GetIndexedTableId(l->pb))); |
3061 | 0 | const TableId indexed_table_id = GetPgsqlTableId(database_oid, indexed_table_oid); |
3062 | | |
3063 | | // Set index_info. |
3064 | | // Previously created INDEX wouldn't have the attribute index_info. |
3065 | 5.80k | if (l->pb.has_index_info()) { |
3066 | 5.80k | table_req.mutable_index_info()->CopyFrom(l->pb.index_info()); |
3067 | 5.80k | table_req.mutable_index_info()->set_indexed_table_id(indexed_table_id); |
3068 | 5.80k | } |
3069 | | |
3070 | | // Set deprecated field for index_info. |
3071 | 5.80k | table_req.set_indexed_table_id(indexed_table_id); |
3072 | 5.80k | table_req.set_is_local_index(PROTO_GET_IS_LOCAL(l->pb)); |
3073 | 5.80k | table_req.set_is_unique_index(PROTO_GET_IS_UNIQUE(l->pb)); |
3074 | 5.80k | } |
3075 | | |
3076 | 13.0k | auto s = CreateYsqlSysTable(&table_req, &table_resp); |
3077 | 13.0k | if (!s.ok()) { |
3078 | 1 | return s.CloneAndPrepend(Substitute( |
3079 | 1 | "Failure when creating PGSQL System Tables: $0", table_resp.error().ShortDebugString())); |
3080 | 1 | } |
3081 | | |
3082 | 13.0k | source_table_ids.push_back(table->id()); |
3083 | 13.0k | target_table_ids.push_back(table_id); |
3084 | 13.0k | } |
3085 | 122 | RETURN_NOT_OK( |
3086 | 122 | sys_catalog_->CopyPgsqlTables(source_table_ids, target_table_ids, leader_ready_term())); |
3087 | 122 | return Status::OK(); |
3088 | 122 | } |
3089 | | |
3090 | 1.35k | size_t CatalogManager::GetNumLiveTServersForPlacement(const PlacementId& placement_id) { |
3091 | 1.35k | auto blacklist = BlacklistSetFromPB(); |
3092 | 1.35k | TSDescriptorVector ts_descs; |
3093 | 1.35k | master_->ts_manager()->GetAllLiveDescriptorsInCluster( |
3094 | 1.35k | &ts_descs, placement_id, (blacklist.ok() ? *blacklist : BlacklistSet()0 )); |
3095 | 1.35k | return ts_descs.size(); |
3096 | 1.35k | } |
3097 | | |
3098 | 196k | TSDescriptorVector CatalogManager::GetAllLiveNotBlacklistedTServers() const { |
3099 | 196k | TSDescriptorVector ts_descs; |
3100 | 196k | auto blacklist = BlacklistSetFromPB(); |
3101 | 196k | master_->ts_manager()->GetAllLiveDescriptors( |
3102 | 196k | &ts_descs, blacklist.ok() ? *blacklist : BlacklistSet()0 ); |
3103 | 196k | return ts_descs; |
3104 | 196k | } |
3105 | | |
3106 | | namespace { |
3107 | | |
3108 | 429k | size_t GetNumReplicasFromPlacementInfo(const PlacementInfoPB& placement_info) { |
3109 | 429k | return placement_info.num_replicas() > 0 ? |
3110 | 417k | placement_info.num_replicas()11.6k : FLAGS_replication_factor; |
3111 | 429k | } |
3112 | | |
3113 | | Status CheckNumReplicas(const PlacementInfoPB& placement_info, |
3114 | | const TSDescriptorVector& ts_descs, |
3115 | | const vector<Partition>& partitions, |
3116 | 8.44k | CreateTableResponsePB* resp) { |
3117 | 8.44k | auto max_tablets = FLAGS_max_create_tablets_per_ts * ts_descs.size(); |
3118 | 8.44k | auto num_replicas = GetNumReplicasFromPlacementInfo(placement_info); |
3119 | 8.44k | if (num_replicas > 1 && max_tablets > 07.58k && partitions.size() > max_tablets7.54k ) { |
3120 | 0 | std::string msg = Substitute("The requested number of tablets ($0) is over the permitted " |
3121 | 0 | "maximum ($1)", partitions.size(), max_tablets); |
3122 | 0 | Status s = STATUS(InvalidArgument, msg); |
3123 | 0 | LOG(WARNING) << msg; |
3124 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::TOO_MANY_TABLETS, s); |
3125 | 0 | } |
3126 | | |
3127 | 8.44k | return Status::OK(); |
3128 | 8.44k | } |
3129 | | |
3130 | | } // namespace |
3131 | | |
3132 | | // Create a new table. |
3133 | | // See README file in this directory for a description of the design. |
3134 | | Status CatalogManager::CreateTable(const CreateTableRequestPB* orig_req, |
3135 | | CreateTableResponsePB* resp, |
3136 | 8.77k | rpc::RpcContext* rpc) { |
3137 | 8.77k | DVLOG(3) << __PRETTY_FUNCTION__ << " Begin. " << orig_req->DebugString()0 ; |
3138 | | |
3139 | 8.77k | const bool is_pg_table = orig_req->table_type() == PGSQL_TABLE_TYPE; |
3140 | 8.77k | const bool is_pg_catalog_table = is_pg_table && orig_req->is_pg_catalog_table()5.22k ; |
3141 | 8.77k | if (!is_pg_catalog_table || !FLAGS_hide_pg_catalog_table_creation_logs313 ) { |
3142 | 8.51k | LOG(INFO) << "CreateTable from " << RequestorString(rpc) |
3143 | 8.51k | << ":\n" << orig_req->DebugString(); |
3144 | 8.51k | } else { |
3145 | 256 | LOG(INFO) << "CreateTable from " << RequestorString(rpc) << ": " << orig_req->name(); |
3146 | 256 | } |
3147 | | |
3148 | 8.77k | const bool is_transactional = orig_req->schema().table_properties().is_transactional(); |
3149 | | // If this is a transactional table, we need to create the transaction status table (if it does |
3150 | | // not exist already). |
3151 | 8.77k | if (is_transactional && (5.66k !is_pg_catalog_table5.66k || !FLAGS_create_initial_sys_catalog_snapshot313 )) { |
3152 | 5.40k | Status s = CreateGlobalTransactionStatusTableIfNeeded(rpc); |
3153 | 5.40k | if (!s.ok()) { |
3154 | 0 | return s.CloneAndPrepend("Error while creating transaction status table"); |
3155 | 0 | } |
3156 | 5.40k | } else { |
3157 | 3.36k | VLOG(1) |
3158 | 0 | << "Not attempting to create a transaction status table:\n" |
3159 | 0 | << " " << EXPR_VALUE_FOR_LOG(is_transactional) << "\n " |
3160 | 0 | << " " << EXPR_VALUE_FOR_LOG(is_pg_catalog_table) << "\n " |
3161 | 0 | << " " << EXPR_VALUE_FOR_LOG(FLAGS_create_initial_sys_catalog_snapshot); |
3162 | 3.36k | } |
3163 | | |
3164 | | // If this is a transactional table and there is a associated tablespace, try to create a |
3165 | | // local transaction status table for the tablespace if there is a placement attached to it |
3166 | | // (and if it does not exist already). |
3167 | 8.77k | if (GetAtomicFlag(&FLAGS_auto_create_local_transaction_tables)) { |
3168 | 8.77k | if (is_transactional && orig_req->has_tablespace_id()5.66k ) { |
3169 | 169 | const auto& tablespace_id = orig_req->tablespace_id(); |
3170 | 169 | auto tablespace_pb = VERIFY_RESULT(GetTablespaceReplicationInfoWithRetry(tablespace_id)); |
3171 | 169 | if (tablespace_pb) { |
3172 | 58 | RETURN_NOT_OK(CreateLocalTransactionStatusTableIfNeeded(rpc, tablespace_id)); |
3173 | 111 | } else { |
3174 | 111 | VLOG(1) |
3175 | 0 | << "Not attempting to create a local transaction status table: " |
3176 | 0 | << "tablespace " << EXPR_VALUE_FOR_LOG(tablespace_id) << " has no placement\n"; |
3177 | 111 | } |
3178 | 8.60k | } else { |
3179 | 8.60k | VLOG(1) |
3180 | 0 | << "Not attempting to create a local transaction status table:\n" |
3181 | 0 | << " " << EXPR_VALUE_FOR_LOG(is_transactional) << "\n " |
3182 | 0 | << " " << EXPR_VALUE_FOR_LOG(orig_req->has_tablespace_id()); |
3183 | 8.60k | } |
3184 | 8.77k | } |
3185 | | |
3186 | 8.76k | if (is_pg_catalog_table) { |
3187 | 313 | return CreateYsqlSysTable(orig_req, resp); |
3188 | 313 | } |
3189 | | |
3190 | 8.45k | Status s; |
3191 | 8.45k | const char* const object_type = PROTO_PTR_IS_TABLE(orig_req) ? "table"7.24k : "index"1.20k ; |
3192 | | |
3193 | | // Copy the request, so we can fill in some defaults. |
3194 | 8.45k | CreateTableRequestPB req = *orig_req; |
3195 | | |
3196 | | // Lookup the namespace and verify if it exists. |
3197 | 8.45k | TRACE("Looking up namespace"); |
3198 | 8.45k | auto ns = VERIFY_RESULT8.44k (FindNamespace(req.namespace_()));8.44k |
3199 | 0 | bool colocated; |
3200 | 8.44k | NamespaceId namespace_id; |
3201 | 8.44k | NamespaceName namespace_name; |
3202 | 8.44k | { |
3203 | 8.44k | auto ns_lock = ns->LockForRead(); |
3204 | 8.44k | if (ns->database_type() != GetDatabaseTypeForTable(req.table_type())) { |
3205 | 0 | Status s = STATUS(NotFound, "Namespace not found"); |
3206 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s); |
3207 | 0 | } |
3208 | 8.44k | namespace_id = ns->id(); |
3209 | 8.44k | namespace_name = ns->name(); |
3210 | 8.44k | colocated = ns->colocated(); |
3211 | 8.44k | } |
3212 | | |
3213 | | // For index table, find the table info |
3214 | 0 | scoped_refptr<TableInfo> indexed_table; |
3215 | 8.44k | if (IsIndex(req)) { |
3216 | 1.20k | TRACE("Looking up indexed table"); |
3217 | 1.20k | indexed_table = GetTableInfo(req.indexed_table_id()); |
3218 | 1.20k | if (indexed_table == nullptr) { |
3219 | 0 | return STATUS_SUBSTITUTE( |
3220 | 0 | NotFound, "The indexed table $0 does not exist", req.indexed_table_id()); |
3221 | 0 | } |
3222 | | |
3223 | 1.20k | TRACE("Locking indexed table"); |
3224 | 1.20k | RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(indexed_table->LockForRead(), resp)); |
3225 | 1.20k | } |
3226 | | |
3227 | | // Determine if this table should be colocated. If not specified, the table should be colocated if |
3228 | | // and only if the namespace is colocated. |
3229 | 8.44k | if (!req.colocated()) { |
3230 | | // Opt out of colocation if the request says so. |
3231 | 3.97k | colocated = false; |
3232 | 4.47k | } else if (indexed_table && !indexed_table->colocated()1.18k ) { |
3233 | | // Opt out of colocation if the indexed table opted out of colocation. |
3234 | 1.17k | colocated = false; |
3235 | 1.17k | } |
3236 | | |
3237 | | // TODO: If this is a colocated index table in a colocated database, convert any hash partition |
3238 | | // columns into range partition columns. This is because postgres does not know that this index |
3239 | | // table is in a colocated database. When we get to the "tablespaces" step where we store this |
3240 | | // into PG metadata, then PG will know if db/table is colocated and do the work there. |
3241 | 8.44k | if ((colocated || req.has_tablegroup_id()8.38k ) && IsIndex(req)203 ) { |
3242 | 65 | for (auto& col_pb : *req.mutable_schema()->mutable_columns()) { |
3243 | 65 | col_pb.set_is_hash_key(false); |
3244 | 65 | } |
3245 | 29 | } |
3246 | | |
3247 | | // Validate schema. |
3248 | 8.44k | Schema schema; |
3249 | 8.44k | RETURN_NOT_OK(SchemaFromPB(req.schema(), &schema)); |
3250 | 8.44k | RETURN_NOT_OK(ValidateCreateTableSchema(schema, resp)); |
3251 | | |
3252 | | // checking that referenced user-defined types (if any) exist. |
3253 | 8.44k | { |
3254 | 8.44k | SharedLock lock(mutex_); |
3255 | 31.0k | for (size_t i = 0; i < schema.num_columns(); i++22.6k ) { |
3256 | 22.6k | for (const auto &udt_id : schema.column(i).type()->GetUserDefinedTypeIds()) { |
3257 | 73 | if (FindPtrOrNull(udtype_ids_map_, udt_id) == nullptr) { |
3258 | 0 | Status s = STATUS(InvalidArgument, "Referenced user-defined type not found"); |
3259 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
3260 | 0 | } |
3261 | 73 | } |
3262 | 22.6k | } |
3263 | 8.44k | } |
3264 | | // TODO (ENG-1860) The referenced namespace and types retrieved/checked above could be deleted |
3265 | | // some time between this point and table creation below. |
3266 | | |
3267 | | // Usually the column ids are available if it's called on the backup-restoring code path |
3268 | | // (from CatalogManager::RecreateTable). Else the column ids must be empty in the client schema. |
3269 | 8.44k | if (!schema.has_column_ids()) { |
3270 | 8.44k | schema.InitColumnIdsByDefault(); |
3271 | 8.44k | } |
3272 | | |
3273 | 8.44k | if (schema.table_properties().HasCopartitionTableId()) { |
3274 | 0 | return CreateCopartitionedTable(req, resp, rpc, schema, ns); |
3275 | 0 | } |
3276 | | |
3277 | 8.44k | if (colocated || req.has_tablegroup_id()8.38k ) { |
3278 | | // If the table is colocated, then there should be no hash partition columns. |
3279 | | // Do the same for tables that are being placed in tablegroups. |
3280 | 203 | if (schema.num_hash_key_columns() > 0) { |
3281 | 3 | Status s = STATUS(InvalidArgument, "Cannot colocate hash partitioned table"); |
3282 | 3 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
3283 | 3 | } |
3284 | 8.24k | } else if ( |
3285 | 8.24k | !req.partition_schema().has_hash_schema() && !req.partition_schema().has_range_schema()3.78k ) { |
3286 | | // If neither hash nor range schema have been specified by the protobuf request, we assume the |
3287 | | // table uses a hash schema, and we use the table_type and hash_key to determine the hashing |
3288 | | // scheme (redis or multi-column) that should be used. |
3289 | 3.49k | if (req.table_type() == REDIS_TABLE_TYPE) { |
3290 | 418 | req.mutable_partition_schema()->set_hash_schema(PartitionSchemaPB::REDIS_HASH_SCHEMA); |
3291 | 3.07k | } else if (schema.num_hash_key_columns() > 0) { |
3292 | 3.07k | req.mutable_partition_schema()->set_hash_schema(PartitionSchemaPB::MULTI_COLUMN_HASH_SCHEMA); |
3293 | 3.07k | } else { |
3294 | 0 | Status s = STATUS(InvalidArgument, "Unknown table type or partitioning method"); |
3295 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
3296 | 0 | } |
3297 | 3.49k | } |
3298 | | |
3299 | | // Verify that custom placement policy has not been specified for colocated table. |
3300 | 8.44k | const bool is_replication_info_set = IsReplicationInfoSet(req.replication_info()); |
3301 | 8.44k | if (is_replication_info_set && colocated1 ) { |
3302 | 0 | Status s = STATUS(InvalidArgument, "Custom placement policy should not be set for " |
3303 | 0 | "colocated tables"); |
3304 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_TABLE_REPLICATION_INFO, s); |
3305 | 0 | } |
3306 | | |
3307 | 8.44k | if (is_replication_info_set && req.table_type() == PGSQL_TABLE_TYPE1 ) { |
3308 | 0 | const Status s = STATUS(InvalidArgument, "Cannot set placement policy for YSQL tables " |
3309 | 0 | "use Tablespaces instead"); |
3310 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s); |
3311 | 0 | } |
3312 | | |
3313 | | // Get placement info. |
3314 | 8.44k | const ReplicationInfoPB& replication_info = VERIFY_RESULT( |
3315 | 8.44k | GetTableReplicationInfo(req.replication_info(), req.tablespace_id())); |
3316 | 0 | const PlacementInfoPB& placement_info = replication_info.live_replicas(); |
3317 | | |
3318 | | // Calculate number of tablets to be used. Priorities: |
3319 | | // 1. Use Internally specified value from 'CreateTableRequestPB::num_tablets'. |
3320 | | // 2. Use User specified value from |
3321 | | // 'CreateTableRequestPB::SchemaPB::TablePropertiesPB::num_tablets'. |
3322 | | // Note, that the number will be saved in schema stored in the master persistent |
3323 | | // SysCatalog irrespective of which way we choose the number of tablets to create. |
3324 | | // If nothing is specified in this field, nothing will be stored in the table |
3325 | | // TablePropertiesPB for number of tablets |
3326 | | // 3. Calculate own value. |
3327 | 8.44k | int num_tablets = 0; |
3328 | 8.44k | if (req.has_num_tablets()) { |
3329 | 6.69k | num_tablets = req.num_tablets(); // Internal request. |
3330 | 6.69k | } |
3331 | | |
3332 | 8.44k | if (num_tablets <= 0 && schema.table_properties().HasNumTablets()1.74k ) { |
3333 | 1.15k | num_tablets = schema.table_properties().num_tablets(); // User request. |
3334 | 1.15k | } |
3335 | | |
3336 | 8.44k | if (num_tablets <= 0) { |
3337 | | // Use default as client could have gotten the value before any tserver had heartbeated |
3338 | | // to (a new) master leader. |
3339 | 599 | const auto num_live_tservers = |
3340 | 599 | GetNumLiveTServersForPlacement(placement_info.placement_uuid()); |
3341 | 599 | num_tablets = narrow_cast<int>( |
3342 | 599 | num_live_tservers * (is_pg_table ? FLAGS_ysql_num_shards_per_tserver71 |
3343 | 599 | : FLAGS_yb_num_shards_per_tserver528 )); |
3344 | 599 | LOG(INFO) << "Setting default tablets to " << num_tablets << " with " |
3345 | 599 | << num_live_tservers << " primary servers"; |
3346 | 599 | } |
3347 | | |
3348 | | // Create partitions. |
3349 | 8.44k | PartitionSchema partition_schema; |
3350 | 8.44k | vector<Partition> partitions; |
3351 | 8.44k | if (colocated || req.has_tablegroup_id()8.38k ) { |
3352 | 200 | RETURN_NOT_OK(partition_schema.CreatePartitions(1, &partitions)); |
3353 | 200 | req.clear_partition_schema(); |
3354 | 200 | num_tablets = 1; |
3355 | 8.24k | } else { |
3356 | 8.24k | RETURN_NOT_OK(PartitionSchema::FromPB(req.partition_schema(), schema, &partition_schema)); |
3357 | 8.24k | if (req.partitions_size() > 0) { |
3358 | 1 | if (req.partitions_size() != num_tablets) { |
3359 | 0 | Status s = STATUS(InvalidArgument, "Partitions are not defined for all tablets"); |
3360 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
3361 | 0 | } |
3362 | 1 | string last; |
3363 | 2 | for (const auto& p : req.partitions()) { |
3364 | 2 | Partition np; |
3365 | 2 | Partition::FromPB(p, &np); |
3366 | 2 | if (np.partition_key_start() != last) { |
3367 | 0 | Status s = STATUS(InvalidArgument, |
3368 | 0 | "Partitions does not cover the full partition keyspace"); |
3369 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
3370 | 0 | } |
3371 | 2 | last = np.partition_key_end(); |
3372 | 2 | partitions.push_back(std::move(np)); |
3373 | 2 | } |
3374 | 8.24k | } else { |
3375 | | // Supplied number of partitions is merely a suggestion, actual number of |
3376 | | // created partitions might differ. |
3377 | 8.24k | RETURN_NOT_OK(partition_schema.CreatePartitions(num_tablets, &partitions)); |
3378 | 8.24k | } |
3379 | | // The vector 'partitions' contains real setup partitions, so the variable |
3380 | | // should be updated. |
3381 | 8.24k | num_tablets = narrow_cast<int>(partitions.size()); |
3382 | 8.24k | } |
3383 | | |
3384 | 8.44k | TSDescriptorVector all_ts_descs; |
3385 | 8.44k | master_->ts_manager()->GetAllLiveDescriptors(&all_ts_descs); |
3386 | 8.44k | RETURN_NOT_OK(CheckNumReplicas(placement_info, all_ts_descs, partitions, resp)); |
3387 | | |
3388 | 8.44k | if (!FLAGS_TEST_skip_placement_validation_createtable_api) { |
3389 | 8.43k | ValidateReplicationInfoRequestPB validate_req; |
3390 | 8.43k | validate_req.mutable_replication_info()->CopyFrom(replication_info); |
3391 | 8.43k | ValidateReplicationInfoResponsePB validate_resp; |
3392 | 8.43k | RETURN_NOT_OK(ValidateReplicationInfo(&validate_req, &validate_resp)); |
3393 | 8.43k | } |
3394 | | |
3395 | 8.42k | LOG(INFO) << "Set number of tablets: " << num_tablets; |
3396 | 8.42k | req.set_num_tablets(num_tablets); |
3397 | | |
3398 | | // For index table, populate the index info. |
3399 | 8.42k | IndexInfoPB index_info; |
3400 | | |
3401 | 8.42k | const bool index_backfill_enabled = |
3402 | 8.42k | IsIndexBackfillEnabled(orig_req->table_type(), is_transactional); |
3403 | 8.42k | if (req.has_index_info()) { |
3404 | | // Current message format. |
3405 | 1.18k | index_info.CopyFrom(req.index_info()); |
3406 | | |
3407 | | // Assign column-ids that have just been computed and assigned to "index_info". |
3408 | 1.18k | if (!is_pg_table) { |
3409 | 444 | DCHECK_EQ(index_info.columns().size(), schema.num_columns()) |
3410 | 0 | << "Number of columns are not the same between index_info and index_schema"; |
3411 | 2.18k | for (size_t colidx = 0; colidx < schema.num_columns(); colidx++1.73k ) { |
3412 | 1.73k | index_info.mutable_columns(narrow_cast<int>(colidx))->set_column_id( |
3413 | 1.73k | schema.column_id(colidx)); |
3414 | 1.73k | } |
3415 | 444 | } |
3416 | 7.24k | } else if (req.has_indexed_table_id()) { |
3417 | | // Old client message format when rolling upgrade (Not having "index_info"). |
3418 | 18 | IndexInfoBuilder index_info_builder(&index_info); |
3419 | 18 | index_info_builder.ApplyProperties(req.indexed_table_id(), |
3420 | 18 | req.is_local_index(), req.is_unique_index()); |
3421 | 18 | if (orig_req->table_type() != PGSQL_TABLE_TYPE) { |
3422 | 18 | Schema indexed_schema; |
3423 | 18 | RETURN_NOT_OK(indexed_table->GetSchema(&indexed_schema)); |
3424 | 18 | RETURN_NOT_OK(index_info_builder.ApplyColumnMapping(indexed_schema, schema)); |
3425 | 18 | } |
3426 | 18 | } |
3427 | | |
3428 | 8.42k | if ((req.has_index_info() || req.has_indexed_table_id()7.24k ) && |
3429 | 8.42k | index_backfill_enabled1.20k && |
3430 | 8.42k | !req.skip_index_backfill()1.12k ) { |
3431 | | // Start off the index table with major compactions disabled. We need this to retain the delete |
3432 | | // markers until the backfill process is completed. No need to set index_permissions in the |
3433 | | // index table. |
3434 | 937 | schema.SetRetainDeleteMarkers(true); |
3435 | 937 | } |
3436 | | |
3437 | 8.42k | LOG(INFO) << "CreateTable with IndexInfo " << AsString(index_info); |
3438 | | |
3439 | 8.42k | scoped_refptr<TableInfo> table; |
3440 | 8.42k | TabletInfos tablets; |
3441 | 8.42k | bool tablets_exist; |
3442 | 8.42k | bool tablegroup_tablets_exist = false; |
3443 | | |
3444 | 8.42k | { |
3445 | 8.42k | LockGuard lock(mutex_); |
3446 | 8.42k | auto ns_lock = ns->LockForRead(); |
3447 | 8.42k | TRACE("Acquired catalog manager lock"); |
3448 | | |
3449 | 8.42k | tablets_exist = |
3450 | 8.42k | colocated && colocated_tablet_ids_map_.find(ns->id()) != colocated_tablet_ids_map_.end()57 ; |
3451 | | // Verify that the table does not exist. |
3452 | 8.42k | table = FindPtrOrNull(table_names_map_, {namespace_id, req.name()}); |
3453 | | |
3454 | 8.42k | if (table != nullptr) { |
3455 | 4 | s = STATUS_SUBSTITUTE(AlreadyPresent, |
3456 | 4 | "Object '$0.$1' already exists", ns->name(), table->name()); |
3457 | 4 | LOG(WARNING) << "Found table: " << table->ToStringWithState() |
3458 | 4 | << ". Failed creating table with error: " |
3459 | 4 | << s.ToString() << " Request:\n" << orig_req->DebugString(); |
3460 | | // If the table already exists, we set the response table_id field to the id of the table that |
3461 | | // already exists. This is necessary because before we return the error to the client (or |
3462 | | // success in case of a "CREATE TABLE IF NOT EXISTS" request) we want to wait for the existing |
3463 | | // table to be available to receive requests. And we need the table id for that. |
3464 | 4 | resp->set_table_id(table->id()); |
3465 | 4 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s); |
3466 | 4 | } |
3467 | | |
3468 | | // Namespace state validity check: |
3469 | | // 1. Allow Namespaces that are RUNNING |
3470 | | // 2. Allow Namespaces that are PREPARING under 2 situations |
3471 | | // 2a. System Namespaces. |
3472 | | // 2b. The parent table from a Colocated Namespace. |
3473 | 8.42k | const auto parent_table_name = ns->id() + kColocatedParentTableNameSuffix; |
3474 | 8.42k | bool valid_ns_state = (ns->state() == SysNamespaceEntryPB::RUNNING) || |
3475 | 8.42k | (18 ns->state() == SysNamespaceEntryPB::PREPARING18 && |
3476 | 18 | (ns->name() == kSystemNamespaceName || req.name() == parent_table_name)); |
3477 | 8.42k | if (!valid_ns_state) { |
3478 | 1 | Status s = STATUS_SUBSTITUTE(TryAgain, "Invalid Namespace State ($0). Cannot create $1.$2", |
3479 | 1 | SysNamespaceEntryPB::State_Name(ns->state()), ns->name(), req.name()); |
3480 | 1 | return SetupError(resp->mutable_error(), NamespaceMasterError(ns->state()), s); |
3481 | 1 | } |
3482 | | |
3483 | | // Check whether this CREATE TABLE request which has a tablegroup_id is for a normal user table |
3484 | | // or the request to create the parent table for the tablegroup. This is done by checking the |
3485 | | // catalog manager maps. |
3486 | 8.42k | if (req.has_tablegroup_id() && |
3487 | 8.42k | tablegroup_tablet_ids_map_.find(ns->id()) != tablegroup_tablet_ids_map_.end()143 && |
3488 | 8.42k | tablegroup_tablet_ids_map_[ns->id()].find(req.tablegroup_id()) != |
3489 | 124 | tablegroup_tablet_ids_map_[ns->id()].end()) { |
3490 | 89 | tablegroup_tablets_exist = true; |
3491 | 89 | } |
3492 | | |
3493 | | // Generate colocation ID in advance in order to fail before CreateTableInMemory is called. |
3494 | 8.42k | ColocationId colocation_id = kColocationIdNotSet; |
3495 | 8.42k | if (req.has_tablegroup_id() && tablegroup_tablets_exist143 ) { |
3496 | 89 | auto tablegroup = tablegroup_ids_map_[req.tablegroup_id()]; |
3497 | | |
3498 | 89 | colocation_id = VERIFY_RESULT87 ( |
3499 | 87 | ConceiveColocationId(req, resp, [tablegroup](auto colocation_id) { |
3500 | 87 | return tablegroup->HasChildTable(colocation_id); |
3501 | 87 | })); |
3502 | 8.33k | } else if (colocated && tablets_exist57 ) { |
3503 | 40 | auto tablet = colocated_tablet_ids_map_[ns->id()]; |
3504 | 40 | auto tablet_lock = tablet->LockForWrite(); |
3505 | | |
3506 | 40 | std::set<ColocationId> colocation_ids; |
3507 | 40 | if (!req.has_colocation_id()) { |
3508 | 148 | for (const TableId& table_id : tablet_lock.data().pb.table_ids()) { |
3509 | 148 | DCHECK(!table_id.empty()); |
3510 | 148 | const auto colocated_table_info = GetTableInfoUnlocked(table_id); |
3511 | 148 | if (!colocated_table_info) { |
3512 | | // Needed because of #11129, should be replaced with DCHECK after the fix. |
3513 | 0 | continue; |
3514 | 0 | } |
3515 | 148 | Schema colocated_table_schema; |
3516 | 148 | RETURN_NOT_OK(colocated_table_info->GetSchema(&colocated_table_schema)); |
3517 | 148 | colocation_ids.insert(colocated_table_schema.colocation_id()); |
3518 | 148 | } |
3519 | 37 | } |
3520 | | |
3521 | 40 | colocation_id = VERIFY_RESULT( |
3522 | 40 | ConceiveColocationId(req, resp, [&colocation_ids](auto colocation_id) { |
3523 | 40 | return ContainsKey(colocation_ids, colocation_id); |
3524 | 40 | })); |
3525 | 40 | } |
3526 | | |
3527 | 8.42k | RETURN_NOT_OK(CreateTableInMemory( |
3528 | 8.42k | req, schema, partition_schema, namespace_id, namespace_name, partitions, &index_info, |
3529 | 8.42k | (!tablets_exist && !tablegroup_tablets_exist) ? &tablets : nullptr, resp, &table)); |
3530 | | |
3531 | | // Section is executed when a table is either the parent table or a user table in a tablegroup. |
3532 | | // It additionally sets the table metadata (and tablet metadata if this is the parent table) |
3533 | | // to have the colocated property so we can take advantage of code reuse. |
3534 | 8.42k | if (req.has_tablegroup_id()) { |
3535 | 141 | table->mutable_metadata()->mutable_dirty()->pb.set_colocated(true); |
3536 | 141 | if (tablegroup_tablets_exist) { |
3537 | | // If the table is not a tablegroup parent table, it performs a lookup for the proper tablet |
3538 | | // to place the table on as a child table. |
3539 | 87 | auto tablet = tablegroup_tablet_ids_map_[ns->id()][req.tablegroup_id()]; |
3540 | 87 | RSTATUS_DCHECK( |
3541 | 87 | tablet->colocated(), InternalError, |
3542 | 87 | "The tablet for tablegroup should be colocated."); |
3543 | 87 | tablets.push_back(tablet.get()); |
3544 | 87 | auto tablet_lock = tablet->LockForWrite(); |
3545 | 87 | tablet_lock.mutable_data()->pb.add_table_ids(table->id()); |
3546 | 87 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), tablet)); |
3547 | 87 | tablet_lock.Commit(); |
3548 | | |
3549 | 87 | auto tablegroup = tablegroup_ids_map_[req.tablegroup_id()]; |
3550 | | |
3551 | 87 | CHECK(colocation_id != kColocationIdNotSet); |
3552 | 87 | table->mutable_metadata()->mutable_dirty()-> |
3553 | 87 | pb.mutable_schema()->mutable_colocated_table_id()->set_colocation_id(colocation_id); |
3554 | | |
3555 | 87 | tablet->mutable_metadata()->StartMutation(); |
3556 | 87 | table->AddTablet(tablet); |
3557 | 87 | tablegroup->AddChildTable(table->id(), colocation_id); |
3558 | | |
3559 | 87 | table_tablegroup_ids_map_[table->id()] = tablegroup->id(); |
3560 | 87 | } else { |
3561 | | // If the table is a tablegroup parent table, it creates a dummy tablet for the tablegroup |
3562 | | // along with updating the catalog manager maps. |
3563 | 54 | RSTATUS_DCHECK_EQ( |
3564 | 54 | tablets.size(), 1U, InternalError, |
3565 | 54 | "Only one tablet should be created for each tablegroup"); |
3566 | 54 | tablets[0]->mutable_metadata()->mutable_dirty()->pb.set_colocated(true); |
3567 | | // Update catalog manager maps for tablegroups |
3568 | 54 | tablegroup_tablet_ids_map_[ns->id()][req.tablegroup_id()] = |
3569 | 54 | tablet_map_->find(tablets[0]->id())->second; |
3570 | 54 | } |
3571 | 8.28k | } else if (colocated) { |
3572 | 57 | table->mutable_metadata()->mutable_dirty()->pb.set_colocated(true); |
3573 | | // if the tablet already exists, add the tablet to tablets |
3574 | 57 | if (tablets_exist) { |
3575 | 40 | auto tablet = colocated_tablet_ids_map_[ns->id()]; |
3576 | 40 | RSTATUS_DCHECK( |
3577 | 40 | tablet->colocated(), InternalError, |
3578 | 40 | "The tablet for colocated database should be colocated."); |
3579 | 40 | tablets.push_back(tablet.get()); |
3580 | | |
3581 | 40 | auto tablet_lock = tablet->LockForWrite(); |
3582 | | |
3583 | 40 | tablet_lock.mutable_data()->pb.add_table_ids(table->id()); |
3584 | 40 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), tablet)); |
3585 | 40 | tablet_lock.Commit(); |
3586 | | |
3587 | 40 | CHECK(colocation_id != kColocationIdNotSet); |
3588 | 40 | table->mutable_metadata()->mutable_dirty()-> |
3589 | 40 | pb.mutable_schema()->mutable_colocated_table_id()->set_colocation_id(colocation_id); |
3590 | | |
3591 | 40 | tablet->mutable_metadata()->StartMutation(); |
3592 | 40 | table->AddTablet(tablet); |
3593 | 40 | } else { // Record the tablet |
3594 | 17 | RSTATUS_DCHECK_EQ( |
3595 | 17 | tablets.size(), 1U, InternalError, |
3596 | 17 | "Only one tablet should be created for each colocated database"); |
3597 | 17 | tablets[0]->mutable_metadata()->mutable_dirty()->pb.set_colocated(true); |
3598 | 17 | colocated_tablet_ids_map_[ns->id()] = tablet_map_->find(tablets[0]->id())->second; |
3599 | 17 | } |
3600 | 57 | } |
3601 | 8.42k | if (req.has_matview_pg_table_id()) { |
3602 | 24 | matview_pg_table_ids_map_[req.table_id()] = req.matview_pg_table_id(); |
3603 | 24 | } |
3604 | 8.42k | } |
3605 | | |
3606 | | // For create transaction table requests with tablespace id, save the tablespace id. |
3607 | 0 | const auto is_transaction_status_table = |
3608 | 8.42k | orig_req->table_type() == TableType::TRANSACTION_STATUS_TABLE_TYPE; |
3609 | 8.42k | if (is_transaction_status_table && req.has_tablespace_id()1.09k ) { |
3610 | 22 | table->mutable_metadata()->mutable_dirty()->pb.set_transaction_table_tablespace_id( |
3611 | 22 | req.tablespace_id()); |
3612 | 22 | } |
3613 | | |
3614 | | // Tables with a transaction should be rolled back if the transaction does not get committed. |
3615 | | // Store this on the table persistent state until the transaction has been a verified success. |
3616 | 8.42k | TransactionMetadata txn; |
3617 | 8.42k | if (req.has_transaction() && FLAGS_enable_transactional_ddl_gc4.72k ) { |
3618 | 4.72k | table->mutable_metadata()->mutable_dirty()->pb.mutable_transaction()-> |
3619 | 4.72k | CopyFrom(req.transaction()); |
3620 | 4.72k | txn = VERIFY_RESULT(TransactionMetadata::FromPB(req.transaction())); |
3621 | 4.72k | RSTATUS_DCHECK(!txn.status_tablet.empty(), Corruption, "Given incomplete Transaction"); |
3622 | 4.72k | } |
3623 | | |
3624 | 8.42k | if (PREDICT_FALSE(FLAGS_TEST_simulate_slow_table_create_secs > 0) && |
3625 | 8.42k | req.table_type() != TableType::TRANSACTION_STATUS_TABLE_TYPE19 ) { |
3626 | 12 | LOG(INFO) << "Simulating slow table creation"; |
3627 | 12 | SleepFor(MonoDelta::FromSeconds(FLAGS_TEST_simulate_slow_table_create_secs)); |
3628 | 12 | } |
3629 | | |
3630 | | // NOTE: the table and tablets are already locked for write at this point, |
3631 | | // since the CreateTableInfo/CreateTabletInfo functions leave them in that state. |
3632 | | // They will get committed at the end of this function. |
3633 | | // Sanity check: the tables and tablets should all be in "preparing" state. |
3634 | 8.42k | CHECK_EQ(SysTablesEntryPB::PREPARING, table->metadata().dirty().pb.state()); |
3635 | | // Update the on-disk table state to "running". |
3636 | 8.42k | table->mutable_metadata()->mutable_dirty()->pb.set_state(SysTablesEntryPB::RUNNING); |
3637 | 8.42k | TRACE("Inserted new table and tablet info into CatalogManager maps"); |
3638 | 8.42k | VLOG_WITH_PREFIX0 (1) << "Inserted new table and tablet info into CatalogManager maps"0 ; |
3639 | | |
3640 | 8.42k | if (!tablets_exist && !tablegroup_tablets_exist8.38k ) { |
3641 | | // Write Tablets to sys-tablets (in "preparing" state). |
3642 | 48.4k | for (const auto& tablet : tablets) { |
3643 | 48.4k | CHECK_EQ(SysTabletsEntryPB::PREPARING, tablet->metadata().dirty().pb.state()); |
3644 | 48.4k | } |
3645 | 8.29k | } |
3646 | | |
3647 | 8.42k | s = sys_catalog_->Upsert(leader_ready_term(), table, tablets); |
3648 | 8.42k | if (PREDICT_FALSE(!s.ok())) { |
3649 | 6 | return AbortTableCreation( |
3650 | 6 | table.get(), tablets, s.CloneAndPrepend("An error occurred while inserting to sys-tablets"), |
3651 | 6 | resp); |
3652 | 6 | } |
3653 | 8.41k | TRACE("Wrote table and tablets to system table"); |
3654 | | |
3655 | | // For index table, insert index info in the indexed table. |
3656 | 8.41k | if ((req.has_index_info() || req.has_indexed_table_id()7.22k )) { |
3657 | 1.19k | if (index_backfill_enabled && !req.skip_index_backfill()1.12k ) { |
3658 | 931 | if (is_pg_table) { |
3659 | | // YSQL: start at some permission before backfill. The real enforcement happens with |
3660 | | // pg_index system table's indislive and indisready columns. Choose WRITE_AND_DELETE |
3661 | | // because it will probably be less confusing. |
3662 | 552 | index_info.set_index_permissions(INDEX_PERM_WRITE_AND_DELETE); |
3663 | 552 | } else { |
3664 | | // YCQL |
3665 | 379 | index_info.set_index_permissions(INDEX_PERM_DELETE_ONLY); |
3666 | 379 | } |
3667 | 931 | } |
3668 | 1.19k | s = AddIndexInfoToTable(indexed_table, index_info, resp); |
3669 | 1.19k | if (PREDICT_FALSE(!s.ok())) { |
3670 | 0 | return AbortTableCreation( |
3671 | 0 | table.get(), tablets, s.CloneAndPrepend("An error occurred while inserting index info"), |
3672 | 0 | resp); |
3673 | 0 | } |
3674 | 1.19k | } |
3675 | | |
3676 | | // Commit the in-memory state. |
3677 | 8.41k | table->mutable_metadata()->CommitMutation(); |
3678 | | |
3679 | 48.5k | for (const auto& tablet : tablets) { |
3680 | 48.5k | tablet->mutable_metadata()->CommitMutation(); |
3681 | 48.5k | } |
3682 | | |
3683 | 8.41k | if ((colocated && tablets_exist57 ) || (8.36k req.has_tablegroup_id()8.36k && tablegroup_tablets_exist141 )) { |
3684 | 127 | auto call = |
3685 | 127 | std::make_shared<AsyncAddTableToTablet>(master_, AsyncTaskPool(), tablets[0], table); |
3686 | 127 | table->AddTask(call); |
3687 | 127 | WARN_NOT_OK(ScheduleTask(call), "Failed to send AddTableToTablet request"); |
3688 | 127 | } |
3689 | | |
3690 | 8.41k | if (req.has_creator_role_name()) { |
3691 | 242 | const NamespaceName& keyspace_name = req.namespace_().name(); |
3692 | 242 | const TableName& table_name = req.name(); |
3693 | 242 | RETURN_NOT_OK(permissions_manager_->GrantPermissions( |
3694 | 242 | req.creator_role_name(), |
3695 | 242 | get_canonical_table(keyspace_name, table_name), |
3696 | 242 | table_name, |
3697 | 242 | keyspace_name, |
3698 | 242 | all_permissions_for_resource(ResourceType::TABLE), |
3699 | 242 | ResourceType::TABLE, |
3700 | 242 | resp)); |
3701 | 242 | } |
3702 | | |
3703 | | // Verify Transaction gets committed, which occurs after table create finishes. |
3704 | 8.41k | if (req.has_transaction() && PREDICT_TRUE4.71k (FLAGS_enable_transactional_ddl_gc)) { |
3705 | 4.71k | LOG(INFO) << "Enqueuing table for Transaction Verification: " << req.name(); |
3706 | 4.71k | std::function<Status(bool)> when_done = |
3707 | 4.71k | std::bind(&CatalogManager::VerifyTablePgLayer, this, table, _1); |
3708 | 4.71k | WARN_NOT_OK(background_tasks_thread_pool_->SubmitFunc( |
3709 | 4.71k | std::bind(&YsqlTransactionDdl::VerifyTransaction, ysql_transaction_.get(), txn, when_done)), |
3710 | 4.71k | "Could not submit VerifyTransaction to thread pool"); |
3711 | 4.71k | } |
3712 | | |
3713 | 8.41k | LOG(INFO) << "Successfully created " << object_type << " " << table->ToString() << " in " |
3714 | 8.41k | << ns->ToString() << " per request from " << RequestorString(rpc); |
3715 | 8.41k | background_tasks_->Wake(); |
3716 | | |
3717 | 8.41k | if (FLAGS_master_enable_metrics_snapshotter && |
3718 | 8.41k | !(2 req.table_type() == TableType::YQL_TABLE_TYPE2 && |
3719 | 2 | namespace_id == kSystemNamespaceId1 && |
3720 | 2 | req.name() == kMetricsSnapshotsTableName1 )) { |
3721 | 1 | Status s = CreateMetricsSnapshotsTableIfNeeded(rpc); |
3722 | 1 | if (!s.ok()) { |
3723 | 0 | return s.CloneAndPrepend("Error while creating metrics snapshots table"); |
3724 | 0 | } |
3725 | 1 | } |
3726 | | |
3727 | | // Increment transaction status version if needed. |
3728 | 8.41k | if (is_transaction_status_table) { |
3729 | 1.09k | RETURN_NOT_OK(IncrementTransactionTablesVersion()); |
3730 | 1.09k | } |
3731 | | |
3732 | 8.41k | DVLOG(3) << __PRETTY_FUNCTION__ << " Done."16 ; |
3733 | 8.41k | return Status::OK(); |
3734 | 8.41k | } |
3735 | | |
3736 | 4.76k | Status CatalogManager::VerifyTablePgLayer(scoped_refptr<TableInfo> table, bool rpc_success) { |
3737 | | // Upon Transaction completion, check pg system table using OID to ensure SUCCESS. |
3738 | 4.76k | const uint32_t database_oid = VERIFY_RESULT(GetPgsqlDatabaseOidByTableId(table->id())); |
3739 | 0 | const auto pg_table_id = GetPgsqlTableId(database_oid, kPgClassTableOid); |
3740 | 4.76k | auto table_storage_id = GetPgsqlTableOid(table->id()); |
3741 | 4.76k | { |
3742 | 4.76k | SharedLock lock(mutex_); |
3743 | 4.76k | if (matview_pg_table_ids_map_.find(table->id()) != matview_pg_table_ids_map_.end()) { |
3744 | 24 | table_storage_id = GetPgsqlTableOid(matview_pg_table_ids_map_[table->id()]); |
3745 | 24 | } |
3746 | 4.76k | } |
3747 | 4.76k | auto entry_exists = VERIFY_RESULT( |
3748 | 4.76k | ysql_transaction_->PgEntryExists(pg_table_id, table_storage_id)); |
3749 | 0 | auto l = table->LockForWrite(); |
3750 | 4.76k | auto& metadata = table->mutable_metadata()->mutable_dirty()->pb; |
3751 | | |
3752 | 4.76k | SCHECK(metadata.state() == SysTablesEntryPB::RUNNING || |
3753 | 4.76k | metadata.state() == SysTablesEntryPB::ALTERING, Aborted, |
3754 | 4.76k | Substitute("Unexpected table state ($0), abandoning transaction GC work for $1", |
3755 | 4.76k | SysTablesEntryPB_State_Name(metadata.state()), table->ToString())); |
3756 | | |
3757 | | // #5981: Mark un-retryable rpc failures as pass to avoid infinite retry of GC'd txns. |
3758 | 4.67k | const bool txn_check_passed = entry_exists || !rpc_success63 ; |
3759 | | |
3760 | 4.67k | if (txn_check_passed) { |
3761 | | // Remove the transaction from the entry since we're done processing it. |
3762 | 4.61k | metadata.clear_transaction(); |
3763 | 4.61k | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), table)); |
3764 | 4.61k | if (4.61k entry_exists4.61k ) { |
3765 | 4.61k | LOG_WITH_PREFIX(INFO) << "Table transaction succeeded: " << table->ToString(); |
3766 | 18.4E | } else { |
3767 | 18.4E | LOG_WITH_PREFIX(WARNING) |
3768 | 18.4E | << "Unknown RPC failure, removing transaction on table: " << table->ToString(); |
3769 | 18.4E | } |
3770 | | // Commit the in-memory state. |
3771 | 4.61k | l.Commit(); |
3772 | 4.61k | } else { |
3773 | 64 | LOG(INFO) << "Table transaction failed, deleting: " << table->ToString(); |
3774 | | // Async enqueue delete. |
3775 | 64 | DeleteTableRequestPB del_tbl_req; |
3776 | 64 | del_tbl_req.mutable_table()->set_table_name(table->name()); |
3777 | 64 | del_tbl_req.mutable_table()->set_table_id(table->id()); |
3778 | 64 | del_tbl_req.set_is_index_table(table->is_index()); |
3779 | | |
3780 | 64 | RETURN_NOT_OK(background_tasks_thread_pool_->SubmitFunc( [this, del_tbl_req]() { |
3781 | 64 | DeleteTableResponsePB del_tbl_resp; |
3782 | 64 | WARN_NOT_OK(DeleteTable(&del_tbl_req, &del_tbl_resp, nullptr), |
3783 | 64 | "Failed to Delete Table with failed transaction"); |
3784 | 64 | })); |
3785 | 64 | } |
3786 | 4.67k | return Status::OK(); |
3787 | 4.67k | } |
3788 | | |
3789 | | Result<TabletInfos> CatalogManager::CreateTabletsFromTable(const vector<Partition>& partitions, |
3790 | 42.8k | const TableInfoPtr& table) { |
3791 | 42.8k | TabletInfos tablets; |
3792 | | // Create the TabletInfo objects in state PREPARING. |
3793 | 83.0k | for (const Partition& partition : partitions) { |
3794 | 83.0k | PartitionPB partition_pb; |
3795 | 83.0k | partition.ToPB(&partition_pb); |
3796 | 83.0k | tablets.push_back(CreateTabletInfo(table.get(), partition_pb)); |
3797 | 83.0k | } |
3798 | | |
3799 | | // Add the table/tablets to the in-memory map for the assignment. |
3800 | 42.8k | table->AddTablets(tablets); |
3801 | 42.8k | auto tablet_map_checkout = tablet_map_.CheckOut(); |
3802 | 83.0k | for (const TabletInfoPtr& tablet : tablets) { |
3803 | 83.0k | InsertOrDie(tablet_map_checkout.get_ptr(), tablet->tablet_id(), tablet); |
3804 | 83.0k | } |
3805 | | |
3806 | 42.8k | return tablets; |
3807 | 42.8k | } |
3808 | | |
3809 | | Status CatalogManager::CheckValidPlacementInfo(const PlacementInfoPB& placement_info, |
3810 | | const TSDescriptorVector& ts_descs, |
3811 | 56.8k | ValidateReplicationInfoResponsePB* resp) { |
3812 | 56.8k | size_t num_live_tservers = ts_descs.size(); |
3813 | 56.8k | size_t num_replicas = GetNumReplicasFromPlacementInfo(placement_info); |
3814 | 56.8k | Status s; |
3815 | 56.8k | string msg; |
3816 | | |
3817 | | // Verify that the number of replicas isn't larger than the required number of live tservers. |
3818 | | // To ensure quorum, we need n/2 + 1 live tservers. |
3819 | 56.8k | size_t replica_quorum_needed = num_replicas / 2 + 1; |
3820 | 56.8k | if (FLAGS_catalog_manager_check_ts_count_for_create_table && |
3821 | 56.8k | replica_quorum_needed > num_live_tservers56.5k ) { |
3822 | 2 | msg = Substitute("Not enough live tablet servers to create table with replication factor $0. " |
3823 | 2 | "Need at least $1 tablet servers whereas $2 are alive.", |
3824 | 2 | num_replicas, replica_quorum_needed, num_live_tservers); |
3825 | 2 | LOG(WARNING) << msg |
3826 | 2 | << ". Placement info: " << placement_info.ShortDebugString() |
3827 | 2 | << ", replication factor flag: " << FLAGS_replication_factor; |
3828 | 2 | s = STATUS(InvalidArgument, msg); |
3829 | 2 | return SetupError(resp->mutable_error(), MasterErrorPB::REPLICATION_FACTOR_TOO_HIGH, s); |
3830 | 2 | } |
3831 | | |
3832 | | // Verify that placement requests are reasonable. |
3833 | 56.8k | if (!placement_info.placement_blocks().empty()) { |
3834 | 874 | size_t minimum_sum = 0; |
3835 | 1.19k | for (const auto& pb : placement_info.placement_blocks()) { |
3836 | 1.19k | minimum_sum += pb.min_num_replicas(); |
3837 | 1.19k | if (!pb.has_cloud_info()) { |
3838 | 1 | msg = Substitute("Got placement info without cloud info set: $0", pb.ShortDebugString()); |
3839 | 1 | s = STATUS(InvalidArgument, msg); |
3840 | 1 | LOG(WARNING) << msg; |
3841 | 1 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
3842 | 1 | } |
3843 | 1.19k | } |
3844 | | // Total replicas requested should be at least the sum of minimums |
3845 | | // requested in individual placement blocks. |
3846 | 873 | if (minimum_sum > num_replicas) { |
3847 | 1 | msg = Substitute("Sum of minimum replicas per placement ($0) is greater than num_replicas " |
3848 | 1 | " ($1)", minimum_sum, num_replicas); |
3849 | 1 | s = STATUS(InvalidArgument, msg); |
3850 | 1 | LOG(WARNING) << msg; |
3851 | 1 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
3852 | 1 | } |
3853 | | |
3854 | | // Verify that there are enough TServers in the requested placements |
3855 | | // to match the total required replication factor. |
3856 | 872 | auto allowed_ts = VERIFY_RESULT(FindTServersForPlacementInfo(placement_info, ts_descs)); |
3857 | | |
3858 | | // Fail if we don't have enough tablet servers in the areas requested. |
3859 | | // We need n/2 + 1 for quorum. |
3860 | 872 | if (allowed_ts.size() < replica_quorum_needed) { |
3861 | 29 | msg = Substitute("Not enough tablet servers in the requested placements. " |
3862 | 29 | "Need at least $0, have $1", |
3863 | 29 | replica_quorum_needed, allowed_ts.size()); |
3864 | 29 | s = STATUS(InvalidArgument, msg); |
3865 | 29 | LOG(WARNING) << msg; |
3866 | 29 | return SetupError(resp->mutable_error(), MasterErrorPB::REPLICATION_FACTOR_TOO_HIGH, s); |
3867 | 29 | } |
3868 | | |
3869 | | // Try allocating tservers for the replicas and see if we can place a quorum |
3870 | | // number of replicas. |
3871 | | // Essentially, the logic is: |
3872 | | // 1. We satisfy whatever we can from the minimums. |
3873 | | // 2. We then satisfy whatever we can from the slack. |
3874 | | // Here it doesn't whether where we put the slack replicas as long as |
3875 | | // the tservers are chosen from any of the valid placement blocks. |
3876 | | // Overall, if in this process we are able to place n/2 + 1 replicas |
3877 | | // then we succeed otherwise we fail. |
3878 | 843 | size_t total_extra_replicas = num_replicas - minimum_sum; |
3879 | 843 | size_t total_feasible_replicas = 0; |
3880 | 843 | size_t total_extra_servers = 0; |
3881 | 1.14k | for (const auto& pb : placement_info.placement_blocks()) { |
3882 | 1.14k | auto allowed_ts = VERIFY_RESULT(FindTServersForPlacementBlock(pb, ts_descs)); |
3883 | 0 | size_t allowed_ts_size = allowed_ts.size(); |
3884 | 1.14k | size_t min_num_replicas = pb.min_num_replicas(); |
3885 | | // For every placement block, we can only satisfy upto the number of |
3886 | | // tservers present in that particular placement block. |
3887 | 1.14k | total_feasible_replicas += min(allowed_ts_size, min_num_replicas); |
3888 | | // Extra tablet servers beyond min_num_replicas will be used to place |
3889 | | // the extra replicas over and above the minimums. |
3890 | 1.14k | if (allowed_ts_size > min_num_replicas) { |
3891 | 377 | total_extra_servers += allowed_ts_size - min_num_replicas; |
3892 | 377 | } |
3893 | 1.14k | } |
3894 | | // The total number of extra replicas that we can put cannot be more than |
3895 | | // the total tablet servers that are extra. |
3896 | 843 | total_feasible_replicas += min(total_extra_replicas, total_extra_servers); |
3897 | | |
3898 | | // If we place the replicas in accordance with above, we should be able to place |
3899 | | // at least replica_quorum_needed otherwise we fail. |
3900 | 843 | if (total_feasible_replicas < replica_quorum_needed) { |
3901 | 1 | msg = Substitute("Not enough tablet servers in the requested placements. " |
3902 | 1 | "Can only find $0 tablet servers for the replicas but need at least " |
3903 | 1 | "$1.", total_feasible_replicas, replica_quorum_needed); |
3904 | 1 | s = STATUS(InvalidArgument, msg); |
3905 | 1 | LOG(WARNING) << msg; |
3906 | 1 | return SetupError(resp->mutable_error(), MasterErrorPB::REPLICATION_FACTOR_TOO_HIGH, s); |
3907 | 1 | } |
3908 | 843 | } |
3909 | | |
3910 | 56.8k | return Status::OK(); |
3911 | 56.8k | } |
3912 | | |
3913 | | Status CatalogManager::CreateTableInMemory(const CreateTableRequestPB& req, |
3914 | | const Schema& schema, |
3915 | | const PartitionSchema& partition_schema, |
3916 | | const NamespaceId& namespace_id, |
3917 | | const NamespaceName& namespace_name, |
3918 | | const std::vector<Partition>& partitions, |
3919 | | IndexInfoPB* index_info, |
3920 | | TabletInfos* tablets, |
3921 | | CreateTableResponsePB* resp, |
3922 | 56.3k | scoped_refptr<TableInfo>* table) { |
3923 | | // Add the new table in "preparing" state. |
3924 | 56.3k | *table = CreateTableInfo(req, schema, partition_schema, namespace_id, namespace_name, index_info); |
3925 | 56.3k | const TableId& table_id = (*table)->id(); |
3926 | | |
3927 | 56.3k | VLOG_WITH_PREFIX_AND_FUNC0 (2) |
3928 | 0 | << "Table: " << (**table).ToString() << ", create_tablets: " << (tablets ? "YES" : "NO"); |
3929 | | |
3930 | 56.3k | auto table_ids_map_checkout = table_ids_map_.CheckOut(); |
3931 | 56.3k | (*table_ids_map_checkout)[table_id] = *table; |
3932 | | // Do not add Postgres tables to the name map as the table name is not unique in a namespace. |
3933 | 56.3k | if (req.table_type() != PGSQL_TABLE_TYPE) { |
3934 | 38.1k | table_names_map_[{namespace_id, req.name()}] = *table; |
3935 | 38.1k | } |
3936 | | |
3937 | 56.3k | if (req.table_type() == TRANSACTION_STATUS_TABLE_TYPE) { |
3938 | 1.09k | transaction_table_ids_set_.insert(table_id); |
3939 | 1.09k | } |
3940 | | |
3941 | 56.3k | if (tablets) { |
3942 | 42.8k | *tablets = VERIFY_RESULT(CreateTabletsFromTable(partitions, *table)); |
3943 | 42.8k | } |
3944 | | |
3945 | 56.3k | if (resp != nullptr) { |
3946 | 21.8k | resp->set_table_id(table_id); |
3947 | 21.8k | } |
3948 | | |
3949 | 56.3k | HandleNewTableId(table_id); |
3950 | | |
3951 | 56.3k | return Status::OK(); |
3952 | 56.3k | } |
3953 | | |
3954 | | Result<bool> CatalogManager::TableExists( |
3955 | 6.64k | const std::string& namespace_name, const std::string& table_name) const { |
3956 | 6.64k | TableIdentifierPB table_id_pb; |
3957 | 6.64k | table_id_pb.set_table_name(table_name); |
3958 | 6.64k | table_id_pb.mutable_namespace_()->set_name(namespace_name); |
3959 | 6.64k | return DoesTableExist(FindTable(table_id_pb)); |
3960 | 6.64k | } |
3961 | | |
3962 | | CHECKED_STATUS CatalogManager::CreateTransactionStatusTable( |
3963 | | const CreateTransactionStatusTableRequestPB* req, CreateTransactionStatusTableResponsePB* resp, |
3964 | 0 | rpc::RpcContext *rpc) { |
3965 | 0 | const string& table_name = req->table_name(); |
3966 | 0 | Status s = CreateTransactionStatusTableInternal(rpc, table_name, nullptr /* tablespace_id */); |
3967 | 0 | if (s.IsAlreadyPresent()) { |
3968 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s); |
3969 | 0 | } |
3970 | 0 | if (!s.ok()) { |
3971 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INTERNAL_ERROR, s); |
3972 | 0 | } |
3973 | 0 | return Status::OK(); |
3974 | 0 | } |
3975 | | |
3976 | | CHECKED_STATUS CatalogManager::CreateTransactionStatusTableInternal( |
3977 | 6.33k | rpc::RpcContext *rpc, const string& table_name, const TablespaceId* tablespace_id) { |
3978 | 6.33k | if (VERIFY_RESULT(TableExists(kSystemNamespaceName, table_name))) { |
3979 | 5.23k | return STATUS_SUBSTITUTE(AlreadyPresent, "Table already exists: $0", table_name); |
3980 | 5.23k | } |
3981 | | |
3982 | 1.10k | LOG(INFO) << "Creating transaction status table " << table_name; |
3983 | | // Set up a CreateTable request internally. |
3984 | 1.10k | CreateTableRequestPB req; |
3985 | 1.10k | CreateTableResponsePB resp; |
3986 | 1.10k | req.set_name(table_name); |
3987 | 1.10k | req.mutable_namespace_()->set_name(kSystemNamespaceName); |
3988 | 1.10k | req.set_table_type(TableType::TRANSACTION_STATUS_TABLE_TYPE); |
3989 | 1.10k | if (tablespace_id) { |
3990 | 28 | req.set_tablespace_id(*tablespace_id); |
3991 | 28 | } |
3992 | | |
3993 | | // Explicitly set the number tablets if the corresponding flag is set, otherwise CreateTable |
3994 | | // will use the same defaults as for regular tables. |
3995 | 1.10k | int num_tablets; |
3996 | 1.10k | if (FLAGS_transaction_table_num_tablets > 0) { |
3997 | 349 | num_tablets = FLAGS_transaction_table_num_tablets; |
3998 | 756 | } else { |
3999 | 756 | auto placement_uuid = |
4000 | 756 | ClusterConfig()->LockForRead()->pb.replication_info().live_replicas().placement_uuid(); |
4001 | 756 | num_tablets = narrow_cast<int>(GetNumLiveTServersForPlacement(placement_uuid) * |
4002 | 756 | FLAGS_transaction_table_num_tablets_per_tserver); |
4003 | 756 | } |
4004 | 1.10k | req.mutable_schema()->mutable_table_properties()->set_num_tablets(num_tablets); |
4005 | | |
4006 | 1.10k | ColumnSchema hash(kRedisKeyColumnName, BINARY, /* is_nullable */ false, /* is_hash_key */ true); |
4007 | 1.10k | ColumnSchemaToPB(hash, req.mutable_schema()->mutable_columns()->Add()); |
4008 | | |
4009 | 1.10k | Status s = CreateTable(&req, &resp, rpc); |
4010 | | // We do not lock here so it is technically possible that the table was already created. |
4011 | | // If so, there is nothing to do so we just ignore the "AlreadyPresent" error. |
4012 | 1.10k | if (!s.ok() && !s.IsAlreadyPresent()8 ) { |
4013 | 8 | return s; |
4014 | 8 | } |
4015 | | |
4016 | 1.09k | return Status::OK(); |
4017 | 1.10k | } |
4018 | | |
4019 | 330 | bool CatalogManager::DoesTransactionTableExistForTablespace(const TablespaceId& tablespace_id) { |
4020 | 330 | SharedLock lock(mutex_); |
4021 | 553 | for (const auto& table_id : transaction_table_ids_set_) { |
4022 | 553 | auto table = table_ids_map_->find(table_id); |
4023 | 553 | if (table == table_ids_map_->end()) { |
4024 | 0 | LOG(DFATAL) << "Table uuid " << table_id |
4025 | 0 | << " in transaction_table_ids_set_ but not in table_ids_map_"; |
4026 | 0 | continue; |
4027 | 0 | } |
4028 | 553 | auto this_tablespace_id = GetTransactionStatusTableTablespace(table->second); |
4029 | 553 | if (this_tablespace_id && *this_tablespace_id == tablespace_id467 ) { |
4030 | 302 | return true; |
4031 | 302 | } |
4032 | 553 | } |
4033 | 28 | return false; |
4034 | 330 | } |
4035 | | |
4036 | | CHECKED_STATUS CatalogManager::CreateLocalTransactionStatusTableIfNeeded( |
4037 | 330 | rpc::RpcContext *rpc, const TablespaceId& tablespace_id) { |
4038 | 330 | std::lock_guard<std::mutex> lock(tablespace_transaction_table_creation_mutex_); |
4039 | | |
4040 | 330 | if (DoesTransactionTableExistForTablespace(tablespace_id)) { |
4041 | 302 | VLOG(1) << "Transaction status table already exists, not creating."0 ; |
4042 | 302 | return Status::OK(); |
4043 | 302 | } |
4044 | | |
4045 | 28 | std::string table_name; |
4046 | 28 | if (FLAGS_TEST_name_transaction_tables_with_tablespace_id) { |
4047 | 12 | uint32_t tablespace_oid = VERIFY_RESULT(GetPgsqlTablespaceOid(tablespace_id)); |
4048 | 0 | table_name = kTransactionTablePrefix + std::to_string(tablespace_oid); |
4049 | 16 | } else { |
4050 | 16 | std::string uuid; |
4051 | 16 | RETURN_NOT_OK(yb::Uuid::Generate().ToString(&uuid)); |
4052 | 16 | table_name = kTransactionTablePrefix + uuid; |
4053 | 16 | } |
4054 | | |
4055 | 28 | return CreateTransactionStatusTableInternal(rpc, table_name, &tablespace_id); |
4056 | 28 | } |
4057 | | |
4058 | 6.31k | CHECKED_STATUS CatalogManager::CreateGlobalTransactionStatusTableIfNeeded(rpc::RpcContext *rpc) { |
4059 | 6.31k | Status s = CreateTransactionStatusTableInternal( |
4060 | 6.31k | rpc, kGlobalTransactionsTableName, nullptr /* tablespace_id */); |
4061 | 6.31k | if (s.IsAlreadyPresent()) { |
4062 | 5.23k | VLOG(1) << "Transaction status table already exists, not creating."0 ; |
4063 | 5.23k | return Status::OK(); |
4064 | 5.23k | } |
4065 | 1.07k | return s; |
4066 | 6.31k | } |
4067 | | |
4068 | 6.71k | Result<TableInfoPtr> CatalogManager::GetGlobalTransactionStatusTable() { |
4069 | 6.71k | TableIdentifierPB global_txn_table_identifier; |
4070 | 6.71k | global_txn_table_identifier.set_table_name(kGlobalTransactionsTableName); |
4071 | 6.71k | global_txn_table_identifier.mutable_namespace_()->set_name(kSystemNamespaceName); |
4072 | 6.71k | return FindTable(global_txn_table_identifier); |
4073 | 6.71k | } |
4074 | | |
4075 | | CHECKED_STATUS CatalogManager::GetGlobalTransactionStatusTablets( |
4076 | 3.25k | GetTransactionStatusTabletsResponsePB* resp) { |
4077 | 3.25k | auto global_txn_table = VERIFY_RESULT(GetGlobalTransactionStatusTable()); |
4078 | | |
4079 | 0 | auto l = global_txn_table->LockForRead(); |
4080 | 3.25k | RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp)); |
4081 | | |
4082 | 61.8k | for (const auto& tablet : global_txn_table->GetTablets())3.25k { |
4083 | 61.8k | TabletLocationsPB locs_pb; |
4084 | 61.8k | RETURN_NOT_OK(BuildLocationsForTablet(tablet, &locs_pb)); |
4085 | 61.8k | resp->add_global_tablet_id(tablet->tablet_id()); |
4086 | 61.8k | } |
4087 | | |
4088 | 3.25k | return Status::OK(); |
4089 | 3.25k | } |
4090 | | |
4091 | | Result<std::vector<TableInfoPtr>> CatalogManager::GetPlacementLocalTransactionStatusTables( |
4092 | 3.38k | const CloudInfoPB& placement) { |
4093 | 3.38k | std::vector<TableInfoPtr> same_placement_transaction_tables; |
4094 | 3.38k | auto tablespace_manager = GetTablespaceManager(); |
4095 | | |
4096 | 3.38k | SharedLock lock(mutex_); |
4097 | 3.75k | for (const auto& table_id : transaction_table_ids_set_) { |
4098 | 3.75k | auto table = table_ids_map_->find(table_id); |
4099 | 3.75k | if (table == table_ids_map_->end()) { |
4100 | 0 | LOG(DFATAL) << "Table uuid " << table_id |
4101 | 0 | << " in transaction_table_ids_set_ but not in table_ids_map_"; |
4102 | 0 | continue; |
4103 | 0 | } |
4104 | | // system.transaction is filtered out because it cannot have a placement set. |
4105 | 3.75k | auto table_info = table->second; |
4106 | 3.75k | auto lock = table_info->LockForRead(); |
4107 | 3.75k | auto tablespace_id = GetTransactionStatusTableTablespace(table_info); |
4108 | 3.75k | auto cloud_info = lock->pb.replication_info(); |
4109 | 3.81k | if (!IsReplicationInfoSet(cloud_info)3.75k ) { |
4110 | 3.81k | if (tablespace_id) { |
4111 | 245 | const auto result = tablespace_manager->GetTablespaceReplicationInfo(*tablespace_id); |
4112 | 245 | if (!result.ok() || !*result185 || !IsReplicationInfoSet(**result)185 ) { |
4113 | 60 | continue; |
4114 | 60 | } |
4115 | 185 | cloud_info = **result; |
4116 | 185 | } |
4117 | 3.81k | } |
4118 | 3.69k | const auto& txn_table_replicas = cloud_info.live_replicas(); |
4119 | | // Skip transaction tables spanning multiple regions, since using them will incur global |
4120 | | // latencies. See #11268. |
4121 | 3.69k | if (CatalogManagerUtil::DoesPlacementInfoSpanMultipleRegions(txn_table_replicas)) { |
4122 | 9 | continue; |
4123 | 9 | } |
4124 | 3.68k | if (CatalogManagerUtil::DoesPlacementInfoContainCloudInfo(txn_table_replicas, placement)) { |
4125 | 161 | same_placement_transaction_tables.push_back(table_info); |
4126 | 161 | } |
4127 | 3.68k | } |
4128 | | |
4129 | 3.38k | return same_placement_transaction_tables; |
4130 | 3.38k | } |
4131 | | |
4132 | | CHECKED_STATUS CatalogManager::GetPlacementLocalTransactionStatusTablets( |
4133 | | const std::vector<TableInfoPtr>& placement_local_tables, |
4134 | 3.31k | GetTransactionStatusTabletsResponsePB* resp) { |
4135 | 3.31k | if (placement_local_tables.empty()) { |
4136 | 3.22k | return Status::OK(); |
4137 | 3.22k | } |
4138 | | |
4139 | 86 | SharedLock lock(mutex_); |
4140 | 86 | for (const auto& table_info : placement_local_tables) { |
4141 | 86 | auto lock = table_info->LockForRead(); |
4142 | 2.49k | for (const auto& tablet : table_info->GetTablets()) { |
4143 | 2.49k | TabletLocationsPB locs_pb; |
4144 | 2.49k | RETURN_NOT_OK(BuildLocationsForTablet(tablet, &locs_pb)); |
4145 | 2.49k | resp->add_placement_local_tablet_id(tablet->tablet_id()); |
4146 | 2.49k | } |
4147 | 86 | } |
4148 | | |
4149 | 86 | return Status::OK(); |
4150 | 86 | } |
4151 | | |
4152 | | CHECKED_STATUS CatalogManager::GetTransactionStatusTablets( |
4153 | | const GetTransactionStatusTabletsRequestPB* req, |
4154 | | GetTransactionStatusTabletsResponsePB* resp, |
4155 | 3.36k | rpc::RpcContext *rpc) { |
4156 | 3.44k | for (;;) { |
4157 | 3.44k | SCOPED_LEADER_SHARED_LOCK(lock, this); |
4158 | 3.44k | auto global_txn_table = VERIFY_RESULT(GetGlobalTransactionStatusTable()); |
4159 | 3.44k | if (!VERIFY_RESULT(IsCreateTableDone(global_txn_table))) { |
4160 | 61 | lock.Unlock(); |
4161 | 61 | RETURN_NOT_OK(WaitForCreateTableToFinish(global_txn_table->id(), rpc->GetClientDeadline())); |
4162 | 61 | continue; |
4163 | 61 | } |
4164 | | |
4165 | 3.38k | std::vector<TableInfoPtr> local_tables; |
4166 | 3.38k | if (req->has_placement()) { |
4167 | 3.37k | local_tables = VERIFY_RESULT(GetPlacementLocalTransactionStatusTables(req->placement())); |
4168 | 0 | bool need_restart = false; |
4169 | 3.37k | for (const auto& table : local_tables) { |
4170 | 161 | if (!VERIFY_RESULT(IsCreateTableDone(table))) { |
4171 | 75 | if (!need_restart) { |
4172 | 75 | need_restart = true; |
4173 | 75 | lock.Unlock(); |
4174 | 75 | } |
4175 | 75 | RETURN_NOT_OK(WaitForCreateTableToFinish(table->id(), rpc->GetClientDeadline())); |
4176 | 75 | } |
4177 | 161 | } |
4178 | 3.37k | if (need_restart) { |
4179 | 75 | continue; |
4180 | 75 | } |
4181 | 3.37k | } |
4182 | | |
4183 | 3.31k | RETURN_NOT_OK(GetGlobalTransactionStatusTablets(resp)); |
4184 | 3.31k | RETURN_NOT_OK(GetPlacementLocalTransactionStatusTablets(local_tables, resp)); |
4185 | | |
4186 | 3.31k | return Status::OK(); |
4187 | 3.31k | } |
4188 | 3.36k | } |
4189 | | |
4190 | 1 | Status CatalogManager::CreateMetricsSnapshotsTableIfNeeded(rpc::RpcContext *rpc) { |
4191 | 1 | if (VERIFY_RESULT(TableExists(kSystemNamespaceName, kMetricsSnapshotsTableName))) { |
4192 | 0 | return Status::OK(); |
4193 | 0 | } |
4194 | | |
4195 | | // Set up a CreateTable request internally. |
4196 | 1 | CreateTableRequestPB req; |
4197 | 1 | CreateTableResponsePB resp; |
4198 | 1 | req.set_name(kMetricsSnapshotsTableName); |
4199 | 1 | req.mutable_namespace_()->set_name(kSystemNamespaceName); |
4200 | 1 | req.set_table_type(TableType::YQL_TABLE_TYPE); |
4201 | | |
4202 | | // Explicitly set the number tablets if the corresponding flag is set, otherwise CreateTable |
4203 | | // will use the same defaults as for regular tables. |
4204 | 1 | if (FLAGS_metrics_snapshots_table_num_tablets > 0) { |
4205 | 0 | req.mutable_schema()->mutable_table_properties()->set_num_tablets( |
4206 | 0 | FLAGS_metrics_snapshots_table_num_tablets); |
4207 | 0 | } |
4208 | | |
4209 | | // Schema description: "node" refers to tserver uuid. "entity_type" can be either |
4210 | | // "tserver" or "table". "entity_id" is uuid of corresponding tserver or table. |
4211 | | // "metric" is the name of the metric and "value" is its val. "ts" is time at |
4212 | | // which the snapshot was recorded. "details" is a json column for future extensibility. |
4213 | | |
4214 | 1 | YBSchemaBuilder schemaBuilder; |
4215 | 1 | schemaBuilder.AddColumn("node")->Type(STRING)->HashPrimaryKey()->NotNull(); |
4216 | 1 | schemaBuilder.AddColumn("entity_type")->Type(STRING)->PrimaryKey()->NotNull(); |
4217 | 1 | schemaBuilder.AddColumn("entity_id")->Type(STRING)->PrimaryKey()->NotNull(); |
4218 | 1 | schemaBuilder.AddColumn("metric")->Type(STRING)->PrimaryKey()->NotNull(); |
4219 | 1 | schemaBuilder.AddColumn("ts")->Type(TIMESTAMP)->PrimaryKey()->NotNull()-> |
4220 | 1 | SetSortingType(SortingType::kDescending); |
4221 | 1 | schemaBuilder.AddColumn("value")->Type(INT64); |
4222 | 1 | schemaBuilder.AddColumn("details")->Type(JSONB); |
4223 | | |
4224 | 1 | YBSchema ybschema; |
4225 | 1 | CHECK_OK(schemaBuilder.Build(&ybschema)); |
4226 | | |
4227 | 1 | auto schema = yb::client::internal::GetSchema(ybschema); |
4228 | 1 | SchemaToPB(schema, req.mutable_schema()); |
4229 | | |
4230 | 1 | Status s = CreateTable(&req, &resp, rpc); |
4231 | | // We do not lock here so it is technically possible that the table was already created. |
4232 | | // If so, there is nothing to do so we just ignore the "AlreadyPresent" error. |
4233 | 1 | if (s.IsAlreadyPresent()) { |
4234 | 0 | return Status::OK(); |
4235 | 0 | } |
4236 | 1 | return s; |
4237 | 1 | } |
4238 | | |
4239 | 40.3k | Result<bool> CatalogManager::IsCreateTableDone(const TableInfoPtr& table) { |
4240 | 40.3k | TRACE("Locking table"); |
4241 | 40.3k | auto l = table->LockForRead(); |
4242 | 40.3k | RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l)); |
4243 | 40.2k | const auto& pb = l->pb; |
4244 | | |
4245 | | // 2. Verify if the create is in-progress. |
4246 | 40.2k | TRACE("Verify if the table creation is in progress for $0", table->ToString()); |
4247 | 40.2k | auto result = !table->IsCreateInProgress(); |
4248 | | |
4249 | | // 3. Set any current errors, if we are experiencing issues creating the table. This will be |
4250 | | // bubbled up to the MasterService layer. If it is an error, it gets wrapped around in |
4251 | | // MasterErrorPB::UNKNOWN_ERROR. |
4252 | 40.2k | RETURN_NOT_OK(table->GetCreateTableErrorStatus()); |
4253 | | |
4254 | | // 4. If this is an index, we are not done until the index is in the indexed table's schema. An |
4255 | | // exception is YSQL system table indexes, which don't get added to their indexed tables' schemas. |
4256 | 40.2k | if (result && IsIndex(pb)16.8k ) { |
4257 | 1.43k | auto& indexed_table_id = GetIndexedTableId(pb); |
4258 | | // For user indexes (which add index info to indexed table's schema), |
4259 | | // - if this index is created without backfill, |
4260 | | // - waiting for the index to be in the indexed table's schema is sufficient, and, by that |
4261 | | // point, things are fully created. |
4262 | | // - if this index is created with backfill |
4263 | | // - and it's YCQL, |
4264 | | // - waiting for the index to be in the indexed table's schema means waiting for the |
4265 | | // DELETE_ONLY index permission, and it's fine to return to the client before the index |
4266 | | // gets the rest of the permissions because the expectation is that backfill will be |
4267 | | // completed asynchronously. |
4268 | | // - and it's YSQL, |
4269 | | // - waiting for the index to be in the indexed table's schema means just that (DocDB index |
4270 | | // permissions don't really matter for YSQL besides being used for backfill purposes), and |
4271 | | // it's a signal for postgres to continue the index backfill process, activating index |
4272 | | // state flags then later triggering backfill and so on. |
4273 | | // For YSQL system indexes (which don't add index info to indexed table's schema), |
4274 | | // - there's nothing additional to wait on. |
4275 | | // Therefore, the only thing needed here is to check whether the index info is in the indexed |
4276 | | // table's schema for user indexes. |
4277 | 1.43k | if (pb.table_type() == YQL_TABLE_TYPE || |
4278 | 1.43k | (912 pb.table_type() == PGSQL_TABLE_TYPE912 && IsUserCreatedTable(*table)912 )) { |
4279 | 1.31k | GetTableSchemaRequestPB get_schema_req; |
4280 | 1.31k | GetTableSchemaResponsePB get_schema_resp; |
4281 | 1.31k | get_schema_req.mutable_table()->set_table_id(indexed_table_id); |
4282 | 1.31k | const bool get_fully_applied_indexes = true; |
4283 | 1.31k | RETURN_NOT_OK(GetTableSchemaInternal( |
4284 | 1.31k | &get_schema_req, &get_schema_resp, get_fully_applied_indexes)); |
4285 | | |
4286 | 1.31k | result = false; |
4287 | 2.22k | for (const auto& index : get_schema_resp.indexes()) { |
4288 | 2.22k | if (index.has_table_id() && index.table_id() == table->id()) { |
4289 | 1.19k | result = true; |
4290 | 1.19k | break; |
4291 | 1.19k | } |
4292 | 2.22k | } |
4293 | 1.31k | } |
4294 | 1.43k | } |
4295 | | |
4296 | | // Sanity check that this table is present in system.partitions if it is a YCQL table. |
4297 | | // Only check if we are automatically generating the vtable on changes. If we are creating via |
4298 | | // the bg task, then there may be a delay. |
4299 | 40.2k | if (DCHECK_IS_ON() && |
4300 | 40.2k | result40.2k && |
4301 | 40.2k | IsYcqlTable(*table)16.7k && |
4302 | 40.2k | YQLPartitionsVTable::GeneratePartitionsVTableOnChanges()2.01k && |
4303 | 40.2k | FLAGS_TEST_catalog_manager_check_yql_partitions_exist_for_is_create_table_done2.01k ) { |
4304 | 2.01k | Schema schema; |
4305 | 2.01k | RETURN_NOT_OK(table->GetSchema(&schema)); |
4306 | | // Copartitioned tables don't actually create tablets currently (unimplemented), so ignore them. |
4307 | 2.01k | if (!schema.table_properties().HasCopartitionTableId()) { |
4308 | 2.01k | DCHECK(GetYqlPartitionsVtable().CheckTableIsPresent(table->id(), table->NumPartitions())); |
4309 | 2.01k | } |
4310 | 2.01k | } |
4311 | | |
4312 | | // If this is a transactional table we are not done until the transaction status table is created. |
4313 | | // However, if we are currently initializing the system catalog snapshot, we don't create the |
4314 | | // transactions table. |
4315 | 40.2k | if (!FLAGS_create_initial_sys_catalog_snapshot && |
4316 | 40.2k | result39.9k && pb.schema().table_properties().is_transactional()16.5k ) { |
4317 | 5.59k | result = VERIFY_RESULT(IsTransactionStatusTableCreated()); |
4318 | 5.59k | } |
4319 | | |
4320 | | // We are not done until the metrics snapshots table is created. |
4321 | 40.2k | if (FLAGS_master_enable_metrics_snapshotter && result0 && |
4322 | 40.2k | !(0 table->GetTableType() == TableType::YQL_TABLE_TYPE0 && |
4323 | 0 | table->namespace_id() == kSystemNamespaceId && |
4324 | 0 | table->name() == kMetricsSnapshotsTableName)) { |
4325 | 0 | result = VERIFY_RESULT(IsMetricsSnapshotsTableCreated()); |
4326 | 0 | } |
4327 | | |
4328 | | // If this is a colocated table and there is a pending AddTableToTablet task then we are not done. |
4329 | 40.2k | if (result && pb.colocated()16.6k ) { |
4330 | 294 | result = !table->HasTasks(MonitoredTask::Type::ASYNC_ADD_TABLE_TO_TABLET); |
4331 | 294 | } |
4332 | | |
4333 | 40.2k | return result; |
4334 | 40.2k | } |
4335 | | |
4336 | | Status CatalogManager::IsCreateTableDone(const IsCreateTableDoneRequestPB* req, |
4337 | 31.1k | IsCreateTableDoneResponsePB* resp) { |
4338 | 31.1k | TRACE("Looking up table"); |
4339 | | // 1. Lookup the table and verify if it exists. |
4340 | 31.1k | scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table())); |
4341 | 31.0k | resp->set_done(VERIFY_RESULT(IsCreateTableDone(table))); |
4342 | 0 | return Status::OK(); |
4343 | 31.1k | } |
4344 | | |
4345 | | Status CatalogManager::IsCreateTableInProgress(const TableId& table_id, |
4346 | | CoarseTimePoint deadline, |
4347 | 1.33k | bool* create_in_progress) { |
4348 | 1.33k | DCHECK_ONLY_NOTNULL(create_in_progress); |
4349 | 1.33k | DCHECK(!table_id.empty()); |
4350 | | |
4351 | 1.33k | IsCreateTableDoneRequestPB req; |
4352 | 1.33k | IsCreateTableDoneResponsePB resp; |
4353 | 1.33k | req.mutable_table()->set_table_id(table_id); |
4354 | 1.33k | RETURN_NOT_OK(IsCreateTableDone(&req, &resp)); |
4355 | | |
4356 | 1.33k | if (resp.has_error()) { |
4357 | 0 | return StatusFromPB(resp.error().status()); |
4358 | 0 | } |
4359 | | |
4360 | 1.33k | *create_in_progress = !resp.done(); |
4361 | 1.33k | return Status::OK(); |
4362 | 1.33k | } |
4363 | | |
4364 | | Status CatalogManager::WaitForCreateTableToFinish( |
4365 | 136 | const TableId& table_id, CoarseTimePoint deadline) { |
4366 | 136 | return client::RetryFunc( |
4367 | 136 | deadline, "Waiting on Create Table to be completed", "Timed out waiting for Table Creation", |
4368 | 136 | std::bind(&CatalogManager::IsCreateTableInProgress, this, table_id, _1, _2)); |
4369 | 136 | } |
4370 | | |
4371 | 5.59k | Result<bool> CatalogManager::IsTransactionStatusTableCreated() { |
4372 | 5.59k | TableIdentifierPB table_id; |
4373 | | |
4374 | 5.59k | table_id.set_table_name(kGlobalTransactionsTableName); |
4375 | 5.59k | table_id.mutable_namespace_()->set_name(kSystemNamespaceName); |
4376 | | |
4377 | 5.59k | return IsCreateTableDone(VERIFY_RESULT(FindTable(table_id))); |
4378 | 5.59k | } |
4379 | | |
4380 | 0 | Result<bool> CatalogManager::IsMetricsSnapshotsTableCreated() { |
4381 | 0 | TableIdentifierPB table_id; |
4382 | |
|
4383 | 0 | table_id.set_table_name(kMetricsSnapshotsTableName); |
4384 | 0 | table_id.mutable_namespace_()->set_name(kSystemNamespaceName); |
4385 | 0 | table_id.mutable_namespace_()->set_database_type(YQLDatabase::YQL_DATABASE_CQL); |
4386 | |
|
4387 | 0 | return IsCreateTableDone(VERIFY_RESULT(FindTable(table_id))); |
4388 | 0 | } |
4389 | | |
4390 | 10 | std::string CatalogManager::GenerateId(boost::optional<const SysRowEntryType> entity_type) { |
4391 | 10 | SharedLock lock(mutex_); |
4392 | 10 | return GenerateIdUnlocked(entity_type); |
4393 | 10 | } |
4394 | | |
4395 | | std::string CatalogManager::GenerateIdUnlocked( |
4396 | 124k | boost::optional<const SysRowEntryType> entity_type) { |
4397 | 124k | while (true) { |
4398 | | // Generate id and make sure it is unique within its category. |
4399 | 124k | std::string id = GenerateObjectId(); |
4400 | 124k | if (!entity_type) { |
4401 | 10 | return id; |
4402 | 10 | } |
4403 | 124k | switch (*entity_type) { |
4404 | 2.41k | case SysRowEntryType::NAMESPACE: |
4405 | 2.41k | if (FindPtrOrNull(namespace_ids_map_, id) == nullptr) return id; |
4406 | 0 | break; |
4407 | 38.1k | case SysRowEntryType::TABLE: |
4408 | 38.1k | if (FindPtrOrNull(*table_ids_map_, id) == nullptr) return id; |
4409 | 0 | break; |
4410 | 83.1k | case SysRowEntryType::TABLET: |
4411 | 83.1k | if (FindPtrOrNull(*tablet_map_, id) == nullptr) return id; |
4412 | 0 | break; |
4413 | 46 | case SysRowEntryType::UDTYPE: |
4414 | 46 | if (FindPtrOrNull(udtype_ids_map_, id) == nullptr) return id; |
4415 | 0 | break; |
4416 | 0 | case SysRowEntryType::SNAPSHOT: |
4417 | 0 | return id; |
4418 | 310 | case SysRowEntryType::CDC_STREAM: |
4419 | 310 | if (!CDCStreamExistsUnlocked(id)) return id; |
4420 | 0 | break; |
4421 | 0 | case SysRowEntryType::CLUSTER_CONFIG: FALLTHROUGH_INTENDED; |
4422 | 0 | case SysRowEntryType::ROLE: FALLTHROUGH_INTENDED; |
4423 | 0 | case SysRowEntryType::REDIS_CONFIG: FALLTHROUGH_INTENDED; |
4424 | 0 | case SysRowEntryType::UNIVERSE_REPLICATION: FALLTHROUGH_INTENDED; |
4425 | 0 | case SysRowEntryType::SYS_CONFIG: FALLTHROUGH_INTENDED; |
4426 | 0 | case SysRowEntryType::SNAPSHOT_SCHEDULE: FALLTHROUGH_INTENDED; |
4427 | 0 | case SysRowEntryType::DDL_LOG_ENTRY: FALLTHROUGH_INTENDED; |
4428 | 0 | case SysRowEntryType::UNKNOWN: |
4429 | 0 | LOG(DFATAL) << "Invalid id type: " << *entity_type; |
4430 | 0 | return id; |
4431 | 124k | } |
4432 | 124k | } |
4433 | 124k | } |
4434 | | |
4435 | | scoped_refptr<TableInfo> CatalogManager::CreateTableInfo(const CreateTableRequestPB& req, |
4436 | | const Schema& schema, |
4437 | | const PartitionSchema& partition_schema, |
4438 | | const NamespaceId& namespace_id, |
4439 | | const NamespaceName& namespace_name, |
4440 | 56.3k | IndexInfoPB* index_info) { |
4441 | 56.3k | DCHECK(schema.has_column_ids()); |
4442 | 56.3k | TableId table_id |
4443 | 56.3k | = !req.table_id().empty() ? req.table_id()18.2k : GenerateIdUnlocked(SysRowEntryType::TABLE)38.1k ; |
4444 | 56.3k | scoped_refptr<TableInfo> table = NewTableInfo(table_id); |
4445 | 56.3k | if (req.has_tablespace_id()) { |
4446 | 186 | table->SetTablespaceIdForTableCreation(req.tablespace_id()); |
4447 | 186 | } |
4448 | 56.3k | table->mutable_metadata()->StartMutation(); |
4449 | 56.3k | SysTablesEntryPB *metadata = &table->mutable_metadata()->mutable_dirty()->pb; |
4450 | 56.3k | metadata->set_state(SysTablesEntryPB::PREPARING); |
4451 | 56.3k | metadata->set_name(req.name()); |
4452 | 56.3k | metadata->set_table_type(req.table_type()); |
4453 | 56.3k | metadata->set_namespace_id(namespace_id); |
4454 | 56.3k | metadata->set_namespace_name(namespace_name); |
4455 | 56.3k | metadata->set_version(0); |
4456 | 56.3k | metadata->set_next_column_id(ColumnId(schema.max_col_id() + 1)); |
4457 | 56.3k | if (req.has_replication_info()) { |
4458 | 1 | metadata->mutable_replication_info()->CopyFrom(req.replication_info()); |
4459 | 1 | } |
4460 | | // Use the Schema object passed in, since it has the column IDs already assigned, |
4461 | | // whereas the user request PB does not. |
4462 | 56.3k | SchemaToPB(schema, metadata->mutable_schema()); |
4463 | 56.3k | partition_schema.ToPB(metadata->mutable_partition_schema()); |
4464 | | // For index table, set index details (indexed table id and whether the index is local). |
4465 | 56.3k | if (req.has_index_info()) { |
4466 | 7.10k | metadata->mutable_index_info()->CopyFrom(req.index_info()); |
4467 | | |
4468 | | // Set the deprecated fields also for compatibility reasons. |
4469 | 7.10k | metadata->set_indexed_table_id(req.index_info().indexed_table_id()); |
4470 | 7.10k | metadata->set_is_local_index(req.index_info().is_local()); |
4471 | 7.10k | metadata->set_is_unique_index(req.index_info().is_unique()); |
4472 | | |
4473 | | // Setup index info. |
4474 | 7.10k | if (index_info != nullptr) { |
4475 | 1.18k | index_info->set_table_id(table->id()); |
4476 | 1.18k | metadata->mutable_index_info()->CopyFrom(*index_info); |
4477 | 1.18k | } |
4478 | 49.2k | } else if (req.has_indexed_table_id()) { |
4479 | | // Read data from the deprecated field and update the new fields. |
4480 | 18 | metadata->mutable_index_info()->set_indexed_table_id(req.indexed_table_id()); |
4481 | 18 | metadata->mutable_index_info()->set_is_local(req.is_local_index()); |
4482 | 18 | metadata->mutable_index_info()->set_is_unique(req.is_unique_index()); |
4483 | | |
4484 | | // Set the deprecated fields also for compatibility reasons. |
4485 | 18 | metadata->set_indexed_table_id(req.indexed_table_id()); |
4486 | 18 | metadata->set_is_local_index(req.is_local_index()); |
4487 | 18 | metadata->set_is_unique_index(req.is_unique_index()); |
4488 | | |
4489 | | // Setup index info. |
4490 | 18 | if (index_info != nullptr) { |
4491 | 18 | index_info->set_table_id(table->id()); |
4492 | 18 | metadata->mutable_index_info()->CopyFrom(*index_info); |
4493 | 18 | } |
4494 | 18 | } |
4495 | | |
4496 | 56.3k | if (req.is_pg_shared_table()) { |
4497 | 50 | metadata->set_is_pg_shared_table(true); |
4498 | 50 | } |
4499 | | |
4500 | 56.3k | return table; |
4501 | 56.3k | } |
4502 | | |
4503 | | TabletInfoPtr CatalogManager::CreateTabletInfo(TableInfo* table, |
4504 | 83.1k | const PartitionPB& partition) { |
4505 | 83.1k | auto tablet = make_scoped_refptr<TabletInfo>(table, GenerateIdUnlocked(SysRowEntryType::TABLET)); |
4506 | 83.1k | VLOG_WITH_PREFIX_AND_FUNC0 (2) |
4507 | 0 | << "Table: " << table->ToString() << ", tablet: " << tablet->ToString(); |
4508 | | |
4509 | 83.1k | tablet->mutable_metadata()->StartMutation(); |
4510 | 83.1k | SysTabletsEntryPB *metadata = &tablet->mutable_metadata()->mutable_dirty()->pb; |
4511 | 83.1k | metadata->set_state(SysTabletsEntryPB::PREPARING); |
4512 | 83.1k | metadata->mutable_partition()->CopyFrom(partition); |
4513 | 83.1k | metadata->set_table_id(table->id()); |
4514 | | // This is important: we are setting the first table id in the table_ids list |
4515 | | // to be the id of the original table that creates the tablet. |
4516 | 83.1k | metadata->add_table_ids(table->id()); |
4517 | 83.1k | return tablet; |
4518 | 83.1k | } |
4519 | | |
4520 | | Status CatalogManager::RemoveTableIdsFromTabletInfo( |
4521 | | TabletInfoPtr tablet_info, |
4522 | 99 | std::unordered_set<TableId> tables_to_remove) { |
4523 | 99 | auto tablet_lock = tablet_info->LockForWrite(); |
4524 | | |
4525 | 99 | google::protobuf::RepeatedPtrField<std::string> new_table_ids; |
4526 | 54.0k | for (const auto& table_id : tablet_lock->pb.table_ids()) { |
4527 | 54.0k | if (tables_to_remove.find(table_id) == tables_to_remove.end()) { |
4528 | 46.0k | *new_table_ids.Add() = std::move(table_id); |
4529 | 46.0k | } |
4530 | 54.0k | } |
4531 | 99 | tablet_lock.mutable_data()->pb.mutable_table_ids()->Swap(&new_table_ids); |
4532 | | |
4533 | 99 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), tablet_info)); |
4534 | 95 | tablet_lock.Commit(); |
4535 | 95 | return Status::OK(); |
4536 | 99 | } |
4537 | | |
4538 | | Result<scoped_refptr<TableInfo>> CatalogManager::FindTable( |
4539 | 640k | const TableIdentifierPB& table_identifier) const { |
4540 | 640k | SharedLock lock(mutex_); |
4541 | 640k | return FindTableUnlocked(table_identifier); |
4542 | 640k | } |
4543 | | |
4544 | | Result<scoped_refptr<TableInfo>> CatalogManager::FindTableUnlocked( |
4545 | 640k | const TableIdentifierPB& table_identifier) const { |
4546 | 640k | if (table_identifier.has_table_id()) { |
4547 | 508k | return FindTableByIdUnlocked(table_identifier.table_id()); |
4548 | 508k | } |
4549 | | |
4550 | 131k | if (table_identifier.has_table_name()) { |
4551 | 131k | auto namespace_info = VERIFY_RESULT131k (FindNamespaceUnlocked(table_identifier.namespace_()));131k |
4552 | | |
4553 | | // We can't lookup YSQL table by name because Postgres concept of "schemas" |
4554 | | // introduces ambiguity. |
4555 | 131k | if (namespace_info->database_type() == YQL_DATABASE_PGSQL) { |
4556 | 0 | return STATUS(InvalidArgument, "Cannot lookup YSQL table by name"); |
4557 | 0 | } |
4558 | | |
4559 | 131k | auto it = table_names_map_.find({namespace_info->id(), table_identifier.table_name()}); |
4560 | 131k | if (it == table_names_map_.end()) { |
4561 | 3.57k | return STATUS_EC_FORMAT( |
4562 | 3.57k | NotFound, MasterError(MasterErrorPB::OBJECT_NOT_FOUND), |
4563 | 3.57k | "Table $0.$1 not found", namespace_info->name(), table_identifier.table_name()); |
4564 | 3.57k | } |
4565 | 128k | return it->second; |
4566 | 131k | } |
4567 | | |
4568 | 34 | return STATUS(InvalidArgument, "Neither table id or table name are specified", |
4569 | 131k | table_identifier.ShortDebugString()); |
4570 | 131k | } |
4571 | | |
4572 | | Result<scoped_refptr<TableInfo>> CatalogManager::FindTableById( |
4573 | 8.24k | const TableId& table_id) const { |
4574 | 8.24k | SharedLock lock(mutex_); |
4575 | 8.24k | return FindTableByIdUnlocked(table_id); |
4576 | 8.24k | } |
4577 | | |
4578 | | Result<scoped_refptr<TableInfo>> CatalogManager::FindTableByIdUnlocked( |
4579 | 516k | const TableId& table_id) const { |
4580 | 516k | auto it = table_ids_map_->find(table_id); |
4581 | 516k | if (it == table_ids_map_->end()) { |
4582 | 429 | return STATUS_EC_FORMAT( |
4583 | 429 | NotFound, MasterError(MasterErrorPB::OBJECT_NOT_FOUND), |
4584 | 429 | "Table with identifier $0 not found", table_id); |
4585 | 429 | } |
4586 | 516k | return it->second; |
4587 | 516k | } |
4588 | | |
4589 | | Result<scoped_refptr<NamespaceInfo>> CatalogManager::FindNamespaceById( |
4590 | 678k | const NamespaceId& id) const { |
4591 | 678k | SharedLock lock(mutex_); |
4592 | 678k | return FindNamespaceByIdUnlocked(id); |
4593 | 678k | } |
4594 | | |
4595 | | Result<scoped_refptr<NamespaceInfo>> CatalogManager::FindNamespaceByIdUnlocked( |
4596 | 775k | const NamespaceId& id) const { |
4597 | 775k | auto it = namespace_ids_map_.find(id); |
4598 | 775k | if (it == namespace_ids_map_.end()) { |
4599 | 3 | VLOG_WITH_FUNC0 (4) << "Not found: " << id << "\n" << GetStackTrace()0 ; |
4600 | 3 | return STATUS(NotFound, "Keyspace identifier not found", id, |
4601 | 3 | MasterError(MasterErrorPB::NAMESPACE_NOT_FOUND)); |
4602 | 3 | } |
4603 | 775k | return it->second; |
4604 | 775k | } |
4605 | | |
4606 | | Result<scoped_refptr<NamespaceInfo>> CatalogManager::FindNamespaceUnlocked( |
4607 | 236k | const NamespaceIdentifierPB& ns_identifier) const { |
4608 | 236k | if (ns_identifier.has_id()) { |
4609 | 96.7k | return FindNamespaceByIdUnlocked(ns_identifier.id()); |
4610 | 96.7k | } |
4611 | | |
4612 | 139k | if (ns_identifier.has_name()) { |
4613 | 139k | auto db = GetDatabaseType(ns_identifier); |
4614 | 139k | auto it = namespace_names_mapper_[db].find(ns_identifier.name()); |
4615 | 139k | if (it == namespace_names_mapper_[db].end()) { |
4616 | 1.91k | return STATUS(NotFound, "Keyspace name not found", ns_identifier.name(), |
4617 | 1.91k | MasterError(MasterErrorPB::NAMESPACE_NOT_FOUND)); |
4618 | 1.91k | } |
4619 | 137k | return it->second; |
4620 | 139k | } |
4621 | | |
4622 | 26 | LOG(DFATAL) << __func__ << ": " << ns_identifier.ShortDebugString() << ", \n" << GetStackTrace(); |
4623 | 26 | return STATUS(NotFound, "Neither keyspace id nor keyspace name is specified", |
4624 | 139k | ns_identifier.ShortDebugString(), MasterError(MasterErrorPB::NAMESPACE_NOT_FOUND)); |
4625 | 139k | } |
4626 | | |
4627 | | Result<scoped_refptr<NamespaceInfo>> CatalogManager::FindNamespace( |
4628 | 36.3k | const NamespaceIdentifierPB& ns_identifier) const { |
4629 | 36.3k | SharedLock lock(mutex_); |
4630 | 36.3k | return FindNamespaceUnlocked(ns_identifier); |
4631 | 36.3k | } |
4632 | | |
4633 | | Result<TableDescription> CatalogManager::DescribeTable( |
4634 | 0 | const TableIdentifierPB& table_identifier, bool succeed_if_create_in_progress) { |
4635 | 0 | TRACE("Looking up table"); |
4636 | 0 | return DescribeTable(VERIFY_RESULT(FindTable(table_identifier)), succeed_if_create_in_progress); |
4637 | 0 | } |
4638 | | |
4639 | | Result<TableDescription> CatalogManager::DescribeTable( |
4640 | 45 | const TableInfoPtr& table_info, bool succeed_if_create_in_progress) { |
4641 | 45 | TableDescription result; |
4642 | 45 | result.table_info = table_info; |
4643 | 45 | NamespaceId namespace_id; |
4644 | 45 | { |
4645 | 45 | TRACE("Locking table"); |
4646 | 45 | auto l = table_info->LockForRead(); |
4647 | | |
4648 | 45 | if (!succeed_if_create_in_progress && table_info->IsCreateInProgress()14 ) { |
4649 | 0 | return STATUS(IllegalState, "Table creation is in progress", table_info->ToString(), |
4650 | 0 | MasterError(MasterErrorPB::TABLE_CREATION_IS_IN_PROGRESS)); |
4651 | 0 | } |
4652 | | |
4653 | 45 | result.tablet_infos = table_info->GetTablets(); |
4654 | | |
4655 | 45 | namespace_id = table_info->namespace_id(); |
4656 | 45 | } |
4657 | | |
4658 | 45 | TRACE("Looking up namespace"); |
4659 | 45 | result.namespace_info = VERIFY_RESULT(FindNamespaceById(namespace_id)); |
4660 | | |
4661 | 0 | return result; |
4662 | 45 | } |
4663 | | |
4664 | 0 | Result<string> CatalogManager::GetPgSchemaName(const TableInfoPtr& table_info) { |
4665 | 0 | RSTATUS_DCHECK_EQ(table_info->GetTableType(), PGSQL_TABLE_TYPE, InternalError, |
4666 | 0 | Format("Expected YSQL table, got: $0", table_info->GetTableType())); |
4667 | |
|
4668 | 0 | const uint32_t database_oid = VERIFY_RESULT(GetPgsqlDatabaseOid(table_info->namespace_id())); |
4669 | 0 | uint32_t table_oid = VERIFY_RESULT(GetPgsqlTableOid(table_info->id())); |
4670 | 0 | { |
4671 | 0 | if (matview_pg_table_ids_map_.find(table_info->id()) != matview_pg_table_ids_map_.end()) { |
4672 | 0 | table_oid = VERIFY_RESULT(GetPgsqlTableOid(matview_pg_table_ids_map_[table_info->id()])); |
4673 | 0 | } |
4674 | 0 | } |
4675 | 0 | const uint32_t relnamespace_oid = VERIFY_RESULT( |
4676 | 0 | sys_catalog_->ReadPgClassRelnamespace(database_oid, table_oid)); |
4677 | 0 | return sys_catalog_->ReadPgNamespaceNspname(database_oid, relnamespace_oid); |
4678 | 0 | } |
4679 | | |
4680 | | // Truncate a Table. |
4681 | | Status CatalogManager::TruncateTable(const TruncateTableRequestPB* req, |
4682 | | TruncateTableResponsePB* resp, |
4683 | 12.6k | rpc::RpcContext* rpc) { |
4684 | 12.6k | LOG(INFO) << "Servicing TruncateTable request from " << RequestorString(rpc) |
4685 | 12.6k | << ": " << req->ShortDebugString(); |
4686 | | |
4687 | 16.4k | for (int i = 0; i < req->table_ids_size(); i++3.79k ) { |
4688 | 3.79k | RETURN_NOT_OK(TruncateTable(req->table_ids(i), resp, rpc)); |
4689 | 3.79k | } |
4690 | | |
4691 | 12.6k | return Status::OK(); |
4692 | 12.6k | } |
4693 | | |
4694 | | Status CatalogManager::TruncateTable(const TableId& table_id, |
4695 | | TruncateTableResponsePB* resp, |
4696 | 7.17k | rpc::RpcContext* rpc) { |
4697 | | // Lookup the table and verify if it exists. |
4698 | 7.17k | TRACE(Substitute("Looking up object by id $0", table_id)); |
4699 | 7.17k | scoped_refptr<TableInfo> table; |
4700 | 7.17k | { |
4701 | 7.17k | SharedLock lock(mutex_); |
4702 | 7.17k | table = FindPtrOrNull(*table_ids_map_, table_id); |
4703 | 7.17k | if (table == nullptr) { |
4704 | 0 | Status s = STATUS_SUBSTITUTE(NotFound, "The object with id $0 does not exist", table_id); |
4705 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); |
4706 | 0 | } |
4707 | 7.17k | } |
4708 | | |
4709 | 7.17k | TRACE(Substitute("Locking object with id $0", table_id)); |
4710 | 7.17k | auto l = table->LockForRead(); |
4711 | 7.17k | RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp)); |
4712 | | |
4713 | | // Truncate on a colocated table should not hit master because it should be handled by a write |
4714 | | // DML that creates a table-level tombstone. |
4715 | 7.17k | LOG_IF(WARNING, table->IsColocatedUserTable()) << "cannot truncate a colocated table on master"0 ; |
4716 | | |
4717 | 7.17k | if (!FLAGS_enable_delete_truncate_xcluster_replicated_table && IsCdcEnabled(*table)) { |
4718 | 0 | return STATUS(NotSupported, |
4719 | 0 | "Cannot truncate a table in replication.", |
4720 | 0 | table_id, |
4721 | 0 | MasterError(MasterErrorPB::INVALID_REQUEST)); |
4722 | 0 | } |
4723 | | |
4724 | | // Send a Truncate() request to each tablet in the table. |
4725 | 7.17k | SendTruncateTableRequest(table); |
4726 | | |
4727 | 7.17k | LOG(INFO) << "Successfully initiated TRUNCATE for " << table->ToString() << " per request from " |
4728 | 7.17k | << RequestorString(rpc); |
4729 | 7.17k | background_tasks_->Wake(); |
4730 | | |
4731 | | // Truncate indexes also. |
4732 | | // Note: PG table does not have references to indexes in the base table, so associated indexes |
4733 | | // must be truncated from the PG code separately. |
4734 | 7.17k | const bool is_index = IsIndex(l->pb); |
4735 | 7.17k | DCHECK(!is_index || l->pb.indexes().empty()) << "indexes should be empty for index table"0 ; |
4736 | 7.17k | for (const auto& index_info : l->pb.indexes()) { |
4737 | 3.38k | RETURN_NOT_OK(TruncateTable(index_info.table_id(), resp, rpc)); |
4738 | 3.38k | } |
4739 | | |
4740 | 7.17k | return Status::OK(); |
4741 | 7.17k | } |
4742 | | |
4743 | 7.17k | void CatalogManager::SendTruncateTableRequest(const scoped_refptr<TableInfo>& table) { |
4744 | 57.2k | for (const auto& tablet : table->GetTablets()) { |
4745 | 57.2k | SendTruncateTabletRequest(tablet); |
4746 | 57.2k | } |
4747 | 7.17k | } |
4748 | | |
4749 | 57.2k | void CatalogManager::SendTruncateTabletRequest(const scoped_refptr<TabletInfo>& tablet) { |
4750 | 57.2k | LOG_WITH_PREFIX(INFO) << "Truncating tablet " << tablet->id(); |
4751 | 57.2k | auto call = std::make_shared<AsyncTruncate>(master_, AsyncTaskPool(), tablet); |
4752 | 57.2k | tablet->table()->AddTask(call); |
4753 | 57.2k | WARN_NOT_OK( |
4754 | 57.2k | ScheduleTask(call), |
4755 | 57.2k | Substitute("Failed to send truncate request for tablet $0", tablet->id())); |
4756 | 57.2k | } |
4757 | | |
4758 | | Status CatalogManager::IsTruncateTableDone(const IsTruncateTableDoneRequestPB* req, |
4759 | 10.5k | IsTruncateTableDoneResponsePB* resp) { |
4760 | 10.5k | LOG(INFO) << "Servicing IsTruncateTableDone request for table id " << req->table_id(); |
4761 | | |
4762 | | // Lookup the truncated table. |
4763 | 10.5k | TRACE("Looking up table $0", req->table_id()); |
4764 | 10.5k | scoped_refptr<TableInfo> table; |
4765 | 10.5k | { |
4766 | 10.5k | SharedLock lock(mutex_); |
4767 | 10.5k | table = FindPtrOrNull(*table_ids_map_, req->table_id()); |
4768 | 10.5k | } |
4769 | | |
4770 | 10.5k | if (table == nullptr) { |
4771 | 0 | Status s = STATUS(NotFound, "The object does not exist: table with id", req->table_id()); |
4772 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); |
4773 | 0 | } |
4774 | | |
4775 | 10.5k | TRACE("Locking table"); |
4776 | 10.5k | RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(table->LockForRead(), resp)); |
4777 | | |
4778 | 10.5k | resp->set_done(!table->HasTasks(MonitoredTask::Type::ASYNC_TRUNCATE_TABLET)); |
4779 | 10.5k | return Status::OK(); |
4780 | 10.5k | } |
4781 | | |
4782 | | // Note: only used by YSQL as of 2020-10-29. |
4783 | | Status CatalogManager::BackfillIndex( |
4784 | | const BackfillIndexRequestPB* req, |
4785 | | BackfillIndexResponsePB* resp, |
4786 | 540 | rpc::RpcContext* rpc) { |
4787 | 540 | const TableIdentifierPB& index_table_identifier = req->index_identifier(); |
4788 | | |
4789 | 540 | scoped_refptr<TableInfo> index_table = VERIFY_RESULT(FindTable(index_table_identifier)); |
4790 | | |
4791 | 540 | if (index_table->GetTableType() != PGSQL_TABLE_TYPE) { |
4792 | | // This request is only supported for YSQL for now. YCQL has its own mechanism. |
4793 | 0 | return STATUS( |
4794 | 0 | InvalidArgument, |
4795 | 0 | "Unexpected non-YSQL table", |
4796 | 0 | index_table_identifier.ShortDebugString()); |
4797 | 0 | } |
4798 | | |
4799 | | // Collect indexed_table. |
4800 | 540 | scoped_refptr<TableInfo> indexed_table; |
4801 | 540 | { |
4802 | 540 | auto l = index_table->LockForRead(); |
4803 | 540 | TableId indexed_table_id = GetIndexedTableId(l->pb); |
4804 | 540 | resp->mutable_table_identifier()->set_table_id(indexed_table_id); |
4805 | 540 | indexed_table = GetTableInfo(indexed_table_id); |
4806 | 540 | } |
4807 | | |
4808 | 540 | if (indexed_table == nullptr) { |
4809 | 0 | return STATUS(InvalidArgument, "Empty indexed table", |
4810 | 0 | index_table_identifier.ShortDebugString()); |
4811 | 0 | } |
4812 | | |
4813 | | // TODO(jason): when ready to use INDEX_PERM_DO_BACKFILL for resuming backfill across master |
4814 | | // leader changes, replace the following (issue #6218). |
4815 | | |
4816 | | // Collect index_info_pb. |
4817 | 540 | IndexInfoPB index_info_pb; |
4818 | 540 | indexed_table->GetIndexInfo(index_table->id()).ToPB(&index_info_pb); |
4819 | 540 | if (index_info_pb.index_permissions() != INDEX_PERM_WRITE_AND_DELETE) { |
4820 | 0 | return SetupError( |
4821 | 0 | resp->mutable_error(), |
4822 | 0 | MasterErrorPB::INVALID_SCHEMA, |
4823 | 0 | STATUS_FORMAT( |
4824 | 0 | InvalidArgument, |
4825 | 0 | "Expected WRITE_AND_DELETE perm, got $0", |
4826 | 0 | IndexPermissions_Name(index_info_pb.index_permissions()))); |
4827 | 0 | } |
4828 | | |
4829 | 540 | return MultiStageAlterTable::StartBackfillingData( |
4830 | 540 | this, indexed_table, {index_info_pb}, boost::none); |
4831 | 540 | } |
4832 | | |
4833 | | Status CatalogManager::GetBackfillJobs( |
4834 | | const GetBackfillJobsRequestPB* req, |
4835 | | GetBackfillJobsResponsePB* resp, |
4836 | 697 | rpc::RpcContext* rpc) { |
4837 | 697 | TableIdentifierPB table_id = req->table_identifier(); |
4838 | | |
4839 | 697 | scoped_refptr<TableInfo> indexed_table = VERIFY_RESULT(FindTable(table_id)); |
4840 | 697 | if (indexed_table == nullptr) { |
4841 | 0 | Status s = STATUS(NotFound, "Requested table $0 does not exist", table_id.ShortDebugString()); |
4842 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); |
4843 | 0 | } |
4844 | | |
4845 | 697 | { |
4846 | 697 | auto l = indexed_table->LockForRead(); |
4847 | 697 | resp->mutable_backfill_jobs()->CopyFrom(l->pb.backfill_jobs()); |
4848 | 697 | } |
4849 | 697 | return Status::OK(); |
4850 | 697 | } |
4851 | | |
4852 | | Status CatalogManager::LaunchBackfillIndexForTable( |
4853 | | const LaunchBackfillIndexForTableRequestPB* req, |
4854 | | LaunchBackfillIndexForTableResponsePB* resp, |
4855 | 1 | rpc::RpcContext* rpc) { |
4856 | 1 | const TableIdentifierPB& table_id = req->table_identifier(); |
4857 | | |
4858 | 1 | scoped_refptr<TableInfo> indexed_table = VERIFY_RESULT(FindTable(table_id)); |
4859 | 1 | if (indexed_table == nullptr) { |
4860 | 0 | Status s = STATUS(NotFound, "Requested table $0 does not exist", table_id.ShortDebugString()); |
4861 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); |
4862 | 0 | } |
4863 | 1 | if (indexed_table->GetTableType() != YQL_TABLE_TYPE) { |
4864 | | // This request is only supported for YCQL for now. YSQL has its own mechanism. |
4865 | 0 | return STATUS(InvalidArgument, "Unexpected non-YCQL table $0", table_id.ShortDebugString()); |
4866 | 0 | } |
4867 | | |
4868 | 1 | uint32_t current_version; |
4869 | 1 | { |
4870 | 1 | auto l = indexed_table->LockForRead(); |
4871 | 1 | if (l->pb.state() != SysTablesEntryPB::RUNNING) { |
4872 | 0 | Status s = STATUS(TryAgain, |
4873 | 0 | "The table is in state $0. An alter may already be in progress.", |
4874 | 0 | SysTablesEntryPB_State_Name(l->pb.state())); |
4875 | 0 | VLOG(2) << "Table " << indexed_table->ToString() << " is not running returning " << s; |
4876 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INTERNAL_ERROR, s); |
4877 | 0 | } |
4878 | 1 | current_version = l->pb.version(); |
4879 | 1 | } |
4880 | | |
4881 | 0 | auto s = MultiStageAlterTable::LaunchNextTableInfoVersionIfNecessary( |
4882 | 1 | this, indexed_table, current_version, /* respect deferrals for backfill */ false); |
4883 | 1 | if (!s.ok()) { |
4884 | 0 | VLOG(3) << __func__ << " Done failed " << s; |
4885 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::UNKNOWN_ERROR, s); |
4886 | 0 | } |
4887 | 1 | return Status::OK(); |
4888 | 1 | } |
4889 | | |
4890 | | Status CatalogManager::MarkIndexInfoFromTableForDeletion( |
4891 | | const TableId& indexed_table_id, const TableId& index_table_id, bool multi_stage, |
4892 | 916 | DeleteTableResponsePB* resp) { |
4893 | | // Lookup the indexed table and verify if it exists. |
4894 | 916 | scoped_refptr<TableInfo> indexed_table = GetTableInfo(indexed_table_id); |
4895 | 916 | if (indexed_table == nullptr) { |
4896 | 0 | LOG(WARNING) << "Indexed table " << indexed_table_id << " for index " |
4897 | 0 | << index_table_id << " not found"; |
4898 | 0 | return Status::OK(); |
4899 | 0 | } |
4900 | | |
4901 | 916 | if (resp) { |
4902 | 916 | auto ns_info = VERIFY_RESULT(master_->catalog_manager()->FindNamespaceById( |
4903 | 916 | indexed_table->namespace_id())); |
4904 | 0 | auto* resp_indexed_table = resp->mutable_indexed_table(); |
4905 | 916 | resp_indexed_table->mutable_namespace_()->set_name(ns_info->name()); |
4906 | 916 | resp_indexed_table->set_table_name(indexed_table->name()); |
4907 | 916 | resp_indexed_table->set_table_id(indexed_table_id); |
4908 | 916 | } |
4909 | 916 | if (multi_stage) { |
4910 | 105 | RETURN_NOT_OK(MultiStageAlterTable::UpdateIndexPermission( |
4911 | 105 | this, indexed_table, |
4912 | 105 | {{index_table_id, IndexPermissions::INDEX_PERM_WRITE_AND_DELETE_WHILE_REMOVING}})); |
4913 | 811 | } else { |
4914 | 811 | RETURN_NOT_OK(DeleteIndexInfoFromTable(indexed_table_id, index_table_id)); |
4915 | 811 | } |
4916 | | |
4917 | | // Actual Deletion of the index info will happen asynchronously after all the |
4918 | | // tablets move to the new IndexPermission of DELETE_ONLY_WHILE_REMOVING. |
4919 | 916 | RETURN_NOT_OK(SendAlterTableRequest(indexed_table)); |
4920 | 916 | return Status::OK(); |
4921 | 916 | } |
4922 | | |
4923 | | Status CatalogManager::DeleteIndexInfoFromTable( |
4924 | 811 | const TableId& indexed_table_id, const TableId& index_table_id) { |
4925 | 811 | scoped_refptr<TableInfo> indexed_table = GetTableInfo(indexed_table_id); |
4926 | 811 | if (indexed_table == nullptr) { |
4927 | 0 | LOG(WARNING) << "Indexed table " << indexed_table_id << " for index " << index_table_id |
4928 | 0 | << " not found"; |
4929 | 0 | return Status::OK(); |
4930 | 0 | } |
4931 | 811 | TRACE("Locking indexed table"); |
4932 | 811 | auto l = indexed_table->LockForWrite(); |
4933 | 811 | auto &indexed_table_data = *l.mutable_data(); |
4934 | | |
4935 | | // Heed issue #6233. |
4936 | 811 | if (!l->pb.has_fully_applied_schema()) { |
4937 | 701 | MultiStageAlterTable::CopySchemaDetailsToFullyApplied(&indexed_table_data.pb); |
4938 | 701 | } |
4939 | 811 | auto *indexes = indexed_table_data.pb.mutable_indexes(); |
4940 | 988 | for (int i = 0; i < indexes->size(); i++177 ) { |
4941 | 988 | if (indexes->Get(i).table_id() == index_table_id) { |
4942 | | |
4943 | 811 | indexes->DeleteSubrange(i, 1); |
4944 | | |
4945 | 811 | indexed_table_data.pb.set_version(indexed_table_data.pb.version() + 1); |
4946 | | // TODO(Amit) : Is this compatible with the previous version? |
4947 | 811 | indexed_table_data.pb.set_updates_only_index_permissions(false); |
4948 | 811 | indexed_table_data.set_state( |
4949 | 811 | SysTablesEntryPB::ALTERING, |
4950 | 811 | Format("Delete index info version=$0 ts=$1", |
4951 | 811 | indexed_table_data.pb.version(), LocalTimeAsString())); |
4952 | | |
4953 | | // Update sys-catalog with the deleted indexed table info. |
4954 | 811 | TRACE("Updating indexed table metadata on disk"); |
4955 | 811 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), indexed_table)); |
4956 | | |
4957 | | // Update the in-memory state. |
4958 | 811 | TRACE("Committing in-memory state"); |
4959 | 811 | l.Commit(); |
4960 | 811 | return Status::OK(); |
4961 | 811 | } |
4962 | 988 | } |
4963 | | |
4964 | 0 | LOG(WARNING) << "Index " << index_table_id << " not found in indexed table " << indexed_table_id; |
4965 | 0 | return Status::OK(); |
4966 | 811 | } |
4967 | | |
4968 | | Status CatalogManager::DeleteTable( |
4969 | 5.60k | const DeleteTableRequestPB* req, DeleteTableResponsePB* resp, rpc::RpcContext* rpc) { |
4970 | 5.60k | LOG(INFO) << "Servicing DeleteTable request from " << RequestorString(rpc) << ": " |
4971 | 5.60k | << req->ShortDebugString(); |
4972 | | |
4973 | 5.60k | scoped_refptr<TableInfo> table = VERIFY_RESULT5.58k (FindTable(req->table()));5.58k |
4974 | 0 | bool result = IsCdcEnabled(*table); |
4975 | 5.58k | if (!FLAGS_enable_delete_truncate_xcluster_replicated_table && result) { |
4976 | 1 | return STATUS(NotSupported, |
4977 | 1 | "Cannot delete a table in replication.", |
4978 | 1 | req->ShortDebugString(), |
4979 | 1 | MasterError(MasterErrorPB::INVALID_REQUEST)); |
4980 | 1 | } |
4981 | | |
4982 | 5.58k | if (req->is_index_table()) { |
4983 | 808 | TRACE("Looking up index"); |
4984 | 808 | TableId table_id = table->id(); |
4985 | 808 | resp->set_table_id(table_id); |
4986 | 808 | TableId indexed_table_id; |
4987 | 808 | { |
4988 | 808 | auto l = table->LockForRead(); |
4989 | 808 | indexed_table_id = GetIndexedTableId(l->pb); |
4990 | 808 | } |
4991 | 808 | scoped_refptr<TableInfo> indexed_table = GetTableInfo(indexed_table_id); |
4992 | 808 | const bool is_pg_table = indexed_table != nullptr && |
4993 | 808 | indexed_table->GetTableType() == PGSQL_TABLE_TYPE; |
4994 | 808 | bool is_transactional; |
4995 | 808 | { |
4996 | 808 | Schema index_schema; |
4997 | 808 | RETURN_NOT_OK(table->GetSchema(&index_schema)); |
4998 | 808 | is_transactional = index_schema.table_properties().is_transactional(); |
4999 | 808 | } |
5000 | 0 | const bool index_backfill_enabled = |
5001 | 808 | IsIndexBackfillEnabled(table->GetTableType(), is_transactional); |
5002 | 808 | if (!is_pg_table && index_backfill_enabled124 ) { |
5003 | 105 | return MarkIndexInfoFromTableForDeletion( |
5004 | 105 | indexed_table_id, table_id, /* multi_stage */ true, resp); |
5005 | 105 | } |
5006 | 808 | } |
5007 | | |
5008 | 5.48k | return DeleteTableInternal(req, resp, rpc); |
5009 | 5.58k | } |
5010 | | |
5011 | | // Delete a Table |
5012 | | // - Update the table state to "DELETING". |
5013 | | // - Issue DeleteTablet tasks to all said tablets. |
5014 | | // - Update all the underlying tablet states as "DELETED". |
5015 | | // |
5016 | | // This order of events can help us guarantee that: |
5017 | | // - If a table is DELETING/DELETED, we do not add further tasks to it. |
5018 | | // - A DeleteTable is done when a table is either DELETING or DELETED and has no running tasks. |
5019 | | // - If a table is DELETING and it has no tasks on it, then it is safe to mark DELETED. |
5020 | | // |
5021 | | // We are lazy about deletions. |
5022 | | // |
5023 | | // IMPORTANT: If modifying, consider updating DeleteYsqlDBTables(), the bulk deletion API. |
//
// Flow of this function, as implemented below:
//  1. Build the snapshot-schedule map and call DeleteTableInMemory(), which fills `tables` with
//     the requested table plus any indexes it cascaded to, each still holding its write lock.
//  2. Commit every write lock in `tables`, remembering the ids of system tables.
//  3. Delete the CDC streams of the table whose id is last in resp->deleted_table_ids().
//  4. Optionally sleep (when FLAGS_catalog_manager_inject_latency_in_delete_table_ms > 0).
//  5. Under mutex_, drop entries flagged remove_from_name_map from table_names_map_, the
//     partitions vtable cache and matview_pg_table_ids_map_, then commit table_ids_map_.
//  6. For each table, call DeleteTabletsAndSendRequests(); colocated user tables are also
//     detached from their tablegroup and get an AsyncRemoveTableFromTablet task.
//  7. Remove the permissions recorded under the table's canonical resource name.
//  8. For system tables, replicate a ChangeMetadata operation with remove_table_id set on the
//     sys catalog tablet.
//  9. Wake the background tasks.
//
// Params:
//   req  - identifies the table (req->table()) and whether it is an index (req->is_index_table()).
//   resp - receives deleted table ids; also handed to helpers that may record an error in it.
//   rpc  - forwarded to DeleteTableInMemory() and used for the requestor string in the final log.
//
// Returns the first non-OK status propagated by VERIFY_RESULT / RETURN_NOT_OK / RSTATUS_DCHECK_GE.
// A ScheduleTask() failure is only logged (WARN_NOT_OK). The write locks are committed in step 2,
// so a failure in a later step returns with the in-memory state change already applied.
5024 | | Status CatalogManager::DeleteTableInternal( |
5025 | 5.60k | const DeleteTableRequestPB* req, DeleteTableResponsePB* resp, rpc::RpcContext* rpc) { |
5026 | 5.60k | auto schedules_to_tables_map = VERIFY_RESULT( |
5027 | 5.60k | MakeSnapshotSchedulesToObjectIdsMap(SysRowEntryType::TABLE)); |
5028 | | |
5029 | 0 | vector<DeletingTableData> tables; |
5030 | 5.60k | RETURN_NOT_OK(DeleteTableInMemory(req->table(), req->is_index_table(), |
5031 | 5.60k | true /* update_indexed_table */, schedules_to_tables_map, |
5032 | 5.60k | &tables, resp, rpc)); |
5033 | | |
5034 | | // Update the in-memory state. |
5035 | 5.58k | TRACE("Committing in-memory state"); |
5036 | 5.58k | std::unordered_set<TableId> sys_table_ids; |
5037 | 5.88k | for (auto& table : tables) { |
5038 | 5.88k | if (IsSystemTable(*table.info)) { |
5039 | 1 | sys_table_ids.insert(table.info->id()); |
5040 | 1 | } |
5041 | 5.88k | table.write_lock.Commit(); |
5042 | 5.88k | } |
5043 | | |
5044 | | // Delete any CDC streams that are set up on this table, after releasing the Table lock. |
5045 | 5.58k | TRACE("Deleting CDC streams on table"); |
5046 | | // table_id for the requested table will be added to the end of the response. |
// NOTE(review): DeleteTableInMemory() appends a table's id to resp before it recurses into that
// table's indexes, so for a table with indexes the last id looks like an index id rather than
// the requested table - confirm which id DeleteCDCStreamsForTable() is meant to receive.
5047 | 5.58k | RSTATUS_DCHECK_GE(resp->deleted_table_ids_size(), 1, IllegalState, |
5048 | 5.58k | "DeleteTableInMemory expected to add the index id to resp"); |
5049 | 5.58k | RETURN_NOT_OK( |
5050 | 5.58k | DeleteCDCStreamsForTable(resp->deleted_table_ids(resp->deleted_table_ids_size() - 1))); |
5051 | | |
5052 | 5.58k | if (PREDICT_FALSE(FLAGS_catalog_manager_inject_latency_in_delete_table_ms > 0)) { |
5053 | 2 | LOG(INFO) << "Sleeping in CatalogManager::DeleteTable for " << |
5054 | 2 | FLAGS_catalog_manager_inject_latency_in_delete_table_ms << " ms"; |
5055 | 2 | SleepFor(MonoDelta::FromMilliseconds(FLAGS_catalog_manager_inject_latency_in_delete_table_ms)); |
5056 | 2 | } |
5057 | | |
5058 | | // Update the internal table maps. Exclude Postgres tables which are not in the name map. |
5059 | | // Also exclude hidden tables, that were already removed from this map. |
// mutex_ is only taken when at least one entry actually needs removal from the name map.
5060 | 5.58k | if (std::any_of(tables.begin(), tables.end(), [](auto& t) { return t.remove_from_name_map; }5.58k )) { |
5061 | 1.33k | TRACE("Removing tables from by-name map"); |
5062 | 1.33k | LockGuard lock(mutex_); |
5063 | 1.62k | for (const auto& table : tables) { |
5064 | 1.62k | if (table.remove_from_name_map) { |
5065 | 1.62k | TableInfoByNameMap::key_type key = {table.info->namespace_id(), table.info->name()}; |
5066 | 1.62k | if (table_names_map_.erase(key) != 1) { |
5067 | 0 | LOG(WARNING) << "Could not remove table from map: " << key.first << "." << key.second; |
5068 | 0 | } |
5069 | | |
5070 | | // Also remove from the system.partitions table. |
5071 | 1.62k | GetYqlPartitionsVtable().RemoveFromCache(table.info->id()); |
5072 | | |
5073 | | // Remove matviews from matview to pg table id map |
5074 | 1.62k | matview_pg_table_ids_map_.erase(table.info->id()); |
5075 | 1.62k | } |
5076 | 1.62k | } |
5077 | | // We commit another map to increment its version and reset cache. |
5078 | | // Since table_name_map_ does not have version. |
5079 | 1.33k | table_ids_map_.Commit(); |
5080 | 1.33k | } |
5081 | | |
5082 | 5.88k | for (const auto& table : tables) { |
5083 | 5.88k | LOG(INFO) << "Deleting table: " << table.info->name() << ", retained by: " |
5084 | 5.88k | << AsString(table.retained_by_snapshot_schedules, &Uuid::TryFullyDecode); |
5085 | | |
5086 | | // Send a DeleteTablet() request to each tablet replica in the table. |
5087 | 5.88k | RETURN_NOT_OK(DeleteTabletsAndSendRequests(table.info, table.retained_by_snapshot_schedules)); |
5088 | | // Send a RemoveTableFromTablet() request to each colocated parent tablet replica in the table. |
5089 | | // TODO(pitr) handle YSQL colocated tables. |
5090 | 5.88k | if (table.info->IsColocatedUserTable()) { |
// The inner scope limits how long mutex_ is held: only the tablegroup map updates run under it.
5091 | 81 | { |
5092 | 81 | LockGuard lock(mutex_); |
5093 | 81 | const auto it = table_tablegroup_ids_map_.find(table.info->id()); |
5094 | 81 | if (it != table_tablegroup_ids_map_.end()) { |
5095 | 68 | const TablegroupId& tablegroup_id = it->second; |
5096 | 68 | const auto& tablegroup = DCHECK_NOTNULL(tablegroup_ids_map_[tablegroup_id]); |
5097 | 68 | tablegroup->DeleteChildTable(table.info->id()); |
5098 | 68 | table_tablegroup_ids_map_.erase(table.info->id()); |
5099 | 68 | } |
5100 | 81 | } |
5101 | 81 | auto call = std::make_shared<AsyncRemoveTableFromTablet>( |
5102 | 81 | master_, AsyncTaskPool(), table.info->GetColocatedTablet(), table.info); |
5103 | 81 | table.info->AddTask(call); |
5104 | 81 | WARN_NOT_OK(ScheduleTask(call), "Failed to send RemoveTableFromTablet request"); |
5105 | 81 | } |
5106 | 5.88k | } |
5107 | | |
5108 | | // If there are any permissions granted on this table find them and delete them. This is necessary |
5109 | | // because we keep track of the permissions based on the canonical resource name which is a |
5110 | | // combination of the keyspace and table names, so if another table with the same name is created |
5111 | | // (in the same keyspace where the previous one existed), and the permissions were not deleted at |
5112 | | // the time of the previous table deletion, then the permissions that existed for the previous |
5113 | | // table will automatically be granted to the new table even though this wasn't the intention. |
// The canonical name is built from the names carried in the request, not from the TableInfo.
5114 | 5.58k | string canonical_resource = get_canonical_table(req->table().namespace_().name(), |
5115 | 5.58k | req->table().table_name()); |
5116 | 5.58k | RETURN_NOT_OK(permissions_manager_->RemoveAllPermissionsForResource(canonical_resource, resp)); |
5117 | | |
5118 | | // Remove the system tables from system catalog. |
5119 | 5.58k | if (!sys_table_ids.empty()) { |
5120 | | // We do not expect system tables deletion during initial snapshot forming. |
5121 | 1 | DCHECK(!initial_snapshot_writer_); |
5122 | | |
5123 | 1 | TRACE("Sending system table delete RPCs"); |
5124 | 1 | for (auto& table_id : sys_table_ids) { |
5125 | | // "sys_catalog_->DeleteYsqlSystemTable(table_id)" won't work here |
5126 | | // as it only acts on the leader. |
5127 | 1 | tablet::ChangeMetadataRequestPB change_req; |
5128 | 1 | change_req.set_tablet_id(kSysCatalogTabletId); |
5129 | 1 | change_req.set_remove_table_id(table_id); |
5130 | 1 | RETURN_NOT_OK(tablet::SyncReplicateChangeMetadataOperation( |
5131 | 1 | &change_req, sys_catalog_->tablet_peer().get(), leader_ready_term())); |
5132 | 1 | } |
5133 | 5.58k | } else { |
5134 | 5.58k | TRACE("No system tables to delete"); |
5135 | 5.58k | } |
5136 | | |
5137 | 5.58k | LOG(INFO) << "Successfully initiated deletion of " |
5138 | 5.58k | << (req->is_index_table() ? "index"811 : "table"4.77k ) << " with " |
5139 | 5.58k | << req->table().DebugString() << " per request from " << RequestorString(rpc); |
5140 | | // Asynchronously cleans up the final memory traces of the deleted database. |
5141 | 5.58k | background_tasks_->Wake(); |
5142 | 5.58k | return Status::OK(); |
5143 | 5.58k | } |
5144 | | |
// Marks a single table or index for deletion (or hiding) and queues it in `*tables`.
//
// What the code below does:
//  - Looks the object up with FindTable(). If it does not exist, a cascaded index delete
//    (is_index_table && !update_indexed_table) logs a warning and returns OK; any other call
//    returns the lookup status.
//  - Takes the table's write lock and appends its id to resp->deleted_table_ids().
//  - Records OBJECT_NOT_FOUND in resp when is_index_table == IsTable(pb) (presumably: the
//    requested kind does not match the object - confirm IsTable() semantics), or when the object
//    already started deleting (or started hiding, if it would only be hidden now). In the latter
//    case a cascaded index delete logs a warning and returns OK instead.
//  - Sets hide_state to HIDING when any snapshot schedule retains the table, otherwise sets the
//    state to DELETING, and persists the change together with a DDL log entry through
//    sys_catalog_->Upsert().
//  - For a non-index table, recurses into every index listed in its pb. For an index with
//    update_indexed_table set, calls MarkIndexInfoFromTableForDeletion() for the indexed table.
//  - Aborts the table's tasks unless it is only being hidden.
//  - Inserts the entry, still holding its uncommitted write lock, into `*tables`: non-index
//    tables at the front, indexes at the back.
//
// Params:
//   table_identifier        - the table or index to delete.
//   is_index_table          - whether the caller expects an index.
//   update_indexed_table    - for an index, whether to also update the indexed table's index info.
//   schedules_to_tables_map - used to find the snapshot schedules that retain the table.
//   tables                  - output list of locked entries for the caller to commit.
//   resp                    - receives deleted table ids and, on failure, the error.
//   rpc                     - only forwarded to the recursive calls.
//
// Returns OK on success and in the "skipped" cases above; otherwise a non-OK status.
5145 | | Status CatalogManager::DeleteTableInMemory( |
5146 | | const TableIdentifierPB& table_identifier, |
5147 | | const bool is_index_table, |
5148 | | const bool update_indexed_table, |
5149 | | const SnapshotSchedulesToObjectIdsMap& schedules_to_tables_map, |
5150 | | vector<DeletingTableData>* tables, |
5151 | | DeleteTableResponsePB* resp, |
5152 | 5.90k | rpc::RpcContext* rpc) { |
5153 | | // TODO(NIC): How to handle a DeleteTable request when the namespace is being deleted? |
5154 | 5.90k | const char* const object_type = is_index_table ? "index"1.12k : "table"4.78k ; |
// True when this call was reached by recursing from the indexed table's own deletion below.
5155 | 5.90k | const bool cascade_delete_index = is_index_table && !update_indexed_table1.12k ; |
5156 | | |
5157 | 5.90k | VLOG_WITH_PREFIX_AND_FUNC0 (1) << 0 YB_STRUCT_TO_STRING0 ( |
5158 | 0 | table_identifier, is_index_table, update_indexed_table) << "\n" << GetStackTrace(); |
5159 | | |
5160 | | // Lookup the table and verify if it exists. |
5161 | 5.90k | TRACE(Substitute("Looking up $0", object_type)); |
5162 | 5.90k | auto table_result = FindTable(table_identifier); |
5163 | 5.90k | if (!VERIFY_RESULT(DoesTableExist(table_result))) { |
5164 | 0 | if (cascade_delete_index) { |
5165 | 0 | LOG(WARNING) << "Index " << table_identifier.DebugString() << " not found"; |
5166 | 0 | return Status::OK(); |
5167 | 0 | } else { |
5168 | 0 | return table_result.status(); |
5169 | 0 | } |
5170 | 0 | } |
5171 | 5.90k | auto table = std::move(*table_result); |
5172 | | |
5173 | 5.90k | TRACE(Substitute("Locking $0", object_type)); |
5174 | 5.90k | auto data = DeletingTableData { |
5175 | 5.90k | .info = table, |
5176 | 5.90k | .write_lock = table->LockForWrite(), |
5177 | 5.90k | .retained_by_snapshot_schedules = RepeatedBytes(), |
5178 | 5.90k | .remove_from_name_map = false |
5179 | 5.90k | }; |
5180 | 5.90k | auto& l = data.write_lock; |
5181 | | // table_id for the requested table will be added to the end of the response. |
// Note: the id is appended here, before any validation below and before recursing into indexes.
5182 | 5.90k | *resp->add_deleted_table_ids() = table->id(); |
5183 | | |
5184 | 5.90k | if (is_index_table == IsTable(l->pb)) { |
5185 | 0 | Status s = STATUS(NotFound, "The object does not exist"); |
5186 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); |
5187 | 0 | } |
5188 | | |
5189 | 5.90k | FillRetainedBySnapshotSchedules( |
5190 | 5.90k | schedules_to_tables_map, table->id(), &data.retained_by_snapshot_schedules); |
// A table retained by at least one snapshot schedule is hidden instead of deleted.
5191 | 5.90k | bool hide_only = !data.retained_by_snapshot_schedules.empty(); |
5192 | | |
5193 | 5.90k | if (l->started_deleting() || (5.88k hide_only5.88k && l->started_hiding()4 )) { |
5194 | 16 | if (cascade_delete_index) { |
5195 | 0 | LOG(WARNING) << "Index " << table_identifier.ShortDebugString() << " was " |
5196 | 0 | << (l->started_deleting() ? "deleted" : "hidden"); |
5197 | 0 | return Status::OK(); |
5198 | 16 | } else { |
5199 | 16 | Status s = STATUS(NotFound, "The object was deleted", l->pb.state_msg()); |
5200 | 16 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); |
5201 | 16 | } |
5202 | 16 | } |
5203 | | |
5204 | | // Determine if we have to remove from the name map here before we change the table state. |
5205 | 5.89k | data.remove_from_name_map = l.data().table_type() != PGSQL_TABLE_TYPE && !l->started_hiding()1.63k ; |
5206 | | |
5207 | 5.89k | TRACE("Updating metadata on disk"); |
5208 | | // Update the metadata for the on-disk state. |
5209 | 5.89k | if (hide_only) { |
5210 | 4 | l.mutable_data()->pb.set_hide_state(SysTablesEntryPB::HIDING); |
5211 | 5.88k | } else { |
5212 | 5.88k | l.mutable_data()->set_state(SysTablesEntryPB::DELETING, |
5213 | 5.88k | Substitute("Started deleting at $0", LocalTimeAsString())); |
5214 | 5.88k | } |
5215 | | |
5216 | 5.89k | auto now = master_->clock()->Now(); |
5217 | 5.89k | DdlLogEntry ddl_log_entry(now, table->id(), l->pb, "Drop"); |
// For an index, the log entry is re-targeted at the indexed table when that table can be found;
// otherwise the plain "Drop" entry for the index itself is kept.
5218 | 5.89k | if (is_index_table) { |
5219 | 1.11k | const auto& indexed_table_id = GetIndexedTableId(l->pb); |
5220 | 1.11k | auto indexed_table = FindTableById(indexed_table_id); |
5221 | 1.11k | if (indexed_table.ok()) { |
5222 | 1.11k | auto lock = (**indexed_table).LockForRead(); |
5223 | 1.11k | ddl_log_entry = DdlLogEntry( |
5224 | 1.11k | now, indexed_table_id, lock->pb, Format("Drop index $0", l->name())); |
5225 | 1.11k | } |
5226 | 1.11k | } |
5227 | | |
5228 | | // Update sys-catalog with the removed table state. |
5229 | 5.89k | Status s = sys_catalog_->Upsert(leader_ready_term(), &ddl_log_entry, table); |
5230 | | |
// Test-only hook: returns OK right after the Upsert without checking `s` and without adding
// `data` to `*tables`.
5231 | 5.89k | if (PREDICT_FALSE(FLAGS_TEST_simulate_crash_after_table_marked_deleting)) { |
5232 | 1 | return Status::OK(); |
5233 | 1 | } |
5234 | | |
5235 | 5.89k | if (!s.ok()) { |
5236 | | // The mutation will be aborted when 'l' exits the scope on early return. |
5237 | 4 | s = s.CloneAndPrepend("An error occurred while updating sys tables"); |
5238 | 4 | LOG(WARNING) << s; |
5239 | 4 | return CheckIfNoLongerLeaderAndSetupError(s, resp); |
5240 | 4 | } |
5241 | | |
5242 | | // For regular (indexed) table, delete all its index tables if any. Else for index table, delete |
5243 | | // index info from the indexed table. |
5244 | 5.88k | if (!is_index_table) { |
5245 | 4.77k | TableIdentifierPB index_identifier; |
5246 | 4.77k | for (const auto& index : l->pb.indexes()) { |
5247 | 300 | index_identifier.set_table_id(index.table_id()); |
5248 | 300 | RETURN_NOT_OK(DeleteTableInMemory(index_identifier, true /* is_index_table */, |
5249 | 300 | false /* update_indexed_table */, schedules_to_tables_map, |
5250 | 300 | tables, resp, rpc)); |
5251 | 300 | } |
5252 | 4.77k | } else if (1.11k update_indexed_table1.11k ) { |
5253 | 811 | s = MarkIndexInfoFromTableForDeletion( |
5254 | 811 | GetIndexedTableId(l->pb), table->id(), /* multi_stage */ false, resp); |
5255 | 811 | if (!s.ok()) { |
// NOTE(review): `s` is both the status being prepended to and part of the prepended text, so
// the original message likely appears twice in the result - confirm whether that is intended.
5256 | 0 | s = s.CloneAndPrepend(Substitute("An error occurred while deleting index info: $0", |
5257 | 0 | s.ToString())); |
5258 | 0 | LOG(WARNING) << s.ToString(); |
5259 | 0 | return CheckIfNoLongerLeaderAndSetupError(s, resp); |
5260 | 0 | } |
5261 | 811 | } |
5262 | | |
5263 | 5.88k | if (!hide_only) { |
5264 | | // If table is being hidden we should not abort snapshot related tasks. |
5265 | 5.87k | table->AbortTasks(); |
5266 | 5.87k | } |
5267 | | |
5268 | | // For regular (indexed) table, insert table info and lock in the front of the list. Else for |
5269 | | // index table, append them to the end. We do so so that we will commit and delete the indexed |
5270 | | // table first before its indexes. |
5271 | 5.88k | tables->insert(is_index_table ? tables->end()1.11k : tables->begin()4.77k , std::move(data)); |
5272 | | |
5273 | 5.88k | return Status::OK(); |
5274 | 5.88k | } |
5275 | | |
// Decides whether `table` can move from DELETING to DELETED, or from HIDING to HIDDEN, and if so
// applies that change to the table's mutable data under a write lock.
//
// Returns the held write lock when a transition was made; the change is not committed here
// (CleanUpDeletedTables persists it and then commits the lock). Returns a default-constructed
// WriteLock when:
//  - `table` is null,
//  - the table still has tasks,
//  - the table is already deleted (its tablet maps are cleared in that case),
//  - the table is neither deleting nor hiding,
//  - not all tablets are done and the table is neither a system table nor a colocated user table,
//  - or neither is_hiding() nor is_deleting() holds once the write lock has been taken.
5276 | 15.9M | TableInfo::WriteLock CatalogManager::MaybeTransitionTableToDeleted(const TableInfoPtr& table) { |
5277 | 15.9M | if (!table) { |
5278 | 0 | LOG_WITH_PREFIX(INFO) << "Finished deleting an Orphaned tablet. " |
5279 | 0 | << "Table Information is null. Skipping updating its state to DELETED."; |
5280 | 0 | return TableInfo::WriteLock(); |
5281 | 0 | } |
5282 | 15.9M | if (table->HasTasks()) { |
5283 | 104k | VLOG_WITH_PREFIX_AND_FUNC0 (2) << table->ToString() << " has tasks"0 ; |
5284 | 104k | return TableInfo::WriteLock(); |
5285 | 104k | } |
// Set inside the read-locked scope below and used after that lock has been released.
5286 | 15.8M | bool hide_only; |
5287 | 15.8M | { |
5288 | 15.8M | auto lock = table->LockForRead(); |
5289 | | |
5290 | | // For any table in DELETING state, we will want to mark it as DELETED once all its respective |
5291 | | // tablets have been successfully removed from tservers. |
5292 | | // For any hiding table we will want to mark it as HIDDEN once all its respective |
5293 | | // tablets have been successfully hidden on tservers. |
5294 | 15.8M | if (lock->is_deleted()) { |
5295 | | // Clear the tablets_ and partitions_ maps if table has already been DELETED. |
5296 | | // Usually this would have been done except for tables that were hidden and are now deleted. |
5297 | | // Also, this is a catch all in case any other path misses clearing the maps. |
5298 | 1.00M | table->ClearTabletMaps(); |
5299 | 1.00M | return TableInfo::WriteLock(); |
5300 | 1.00M | } |
5301 | 14.8M | hide_only = !lock->is_deleting(); |
5302 | 14.8M | if (hide_only && !lock->is_hiding()14.7M ) { |
5303 | 14.7M | return TableInfo::WriteLock(); |
5304 | 14.7M | } |
5305 | 14.8M | } |
5306 | | // The current relevant order of operations during a DeleteTable is: |
5307 | | // 1) Mark the table as DELETING |
5308 | | // 2) Abort the current table tasks |
5309 | | // 3) Per tablet, send DeleteTable requests to all TS, then mark that tablet as DELETED |
5310 | | // |
5311 | | // This creates a race, wherein, after 2, HasTasks can be false, but we still have not |
5312 | | // gotten to point 3, which would add further tasks for the deletes. |
5313 | | // |
5314 | | // However, HasTasks is cheaper than AreAllTabletsDeletedOrHidden... |
5315 | 16.3k | auto all_tablets_done = hide_only ? table->AreAllTabletsHidden()4 : table->AreAllTabletsDeleted()16.3k ; |
5316 | 18.4E | VLOG_WITH_PREFIX_AND_FUNC(2) |
5317 | 18.4E | << table->ToString() << " hide only: " << hide_only << ", all tablets done: " |
5318 | 18.4E | << all_tablets_done; |
// System tables and colocated user tables proceed even when not all tablets are done.
5319 | 16.3k | if (!all_tablets_done && !IsSystemTable(*table)8.09k && !table->IsColocatedUserTable()203 ) { |
5320 | 115 | return TableInfo::WriteLock(); |
5321 | 115 | } |
5322 | | |
// The state is re-checked under the write lock; the read lock taken above was already released.
5323 | 16.2k | auto lock = table->LockForWrite(); |
5324 | 16.2k | if (lock->is_hiding()) { |
5325 | 4 | LOG(INFO) << "Marking table as HIDDEN: " << table->ToString(); |
5326 | 4 | lock.mutable_data()->pb.set_hide_state(SysTablesEntryPB::HIDDEN); |
5327 | | // Erase all the tablets from partitions_ structure. |
5328 | 4 | table->ClearTabletMaps(DeactivateOnly::kTrue); |
5329 | 4 | return lock; |
5330 | 4 | } |
5331 | 16.2k | if (lock->is_deleting()) { |
5332 | | // Update the metadata for the on-disk state. |
5333 | 13.8k | LOG(INFO) << "Marking table as DELETED: " << table->ToString(); |
5334 | 13.8k | lock.mutable_data()->set_state(SysTablesEntryPB::DELETED, |
5335 | 13.8k | Substitute("Deleted with tablets at $0", LocalTimeAsString())); |
5336 | | // Erase all the tablets from tablets_ and partitions_ structures. |
5337 | 13.8k | table->ClearTabletMaps(); |
5338 | 13.8k | return lock; |
5339 | 13.8k | } |
5340 | 2.43k | return TableInfo::WriteLock(); |
5341 | 16.2k | } |
5342 | | |
// Garbage-collection pass over all tables in table_ids_map_:
//  1. Copies the id map under mutex_ so the per-table work runs without holding that lock.
//  2. Calls MaybeTransitionTableToDeleted() on every table and keeps the returned lock for each
//     table for which a locked WriteLock came back.
//  3. Persists all such tables with a single sys_catalog_->Upsert() and, only if that succeeds,
//     commits their locks. On failure it logs a warning and returns without committing.
5343 | 56.6k | void CatalogManager::CleanUpDeletedTables() { |
5344 | | // TODO(bogdan): Cache tables being deleted to make this iterate only over those? |
// NOTE(review): tables_to_delete is not referenced anywhere else in this function.
5345 | 56.6k | vector<scoped_refptr<TableInfo>> tables_to_delete; |
5346 | | // Garbage collecting. |
5347 | | // Going through all tables under the global lock, copying them to not hold lock for too long. |
5348 | 56.6k | TableInfoMap copy_of_table_by_id_map; |
5349 | 56.6k | { |
5350 | 56.6k | LockGuard lock(mutex_); |
5351 | 56.6k | copy_of_table_by_id_map = *table_ids_map_; |
5352 | 56.6k | } |
5353 | | // Mark the tables as DELETED and remove them from the in-memory maps. |
// tables_to_update_on_disk and table_locks are filled in lockstep: entry i of one belongs to
// entry i of the other.
5354 | 56.6k | vector<TableInfo*> tables_to_update_on_disk; |
5355 | 56.6k | vector<TableInfo::WriteLock> table_locks; |
5356 | 15.7M | for (const auto& it : copy_of_table_by_id_map) { |
5357 | 15.7M | const auto& table = it.second; |
5358 | 15.7M | auto lock = MaybeTransitionTableToDeleted(table); |
5359 | 15.7M | if (lock.locked()) { |
5360 | 7.93k | table_locks.push_back(std::move(lock)); |
5361 | 7.93k | tables_to_update_on_disk.push_back(table.get()); |
5362 | 7.93k | } |
5363 | 15.7M | } |
5364 | 56.6k | if (tables_to_update_on_disk.size() > 0) { |
5365 | 87 | Status s = sys_catalog_->Upsert(leader_ready_term(), tables_to_update_on_disk); |
5366 | 87 | if (!s.ok()) { |
5367 | 0 | LOG(WARNING) << "Error marking tables as DELETED: " << s.ToString(); |
5368 | 0 | return; |
5369 | 0 | } |
// NOTE(review): no code in this function removes tables from any map, so the "removed them
// from the maps" wording below may be stale - confirm.
5370 | | // Update the table in-memory info as DELETED after we've removed them from the maps. |
5371 | 7.93k | for (auto& lock : table_locks)87 { |
5372 | 7.93k | lock.Commit(); |
5373 | 7.93k | } |
5374 | | // TODO: Check if we want to delete the totally deleted table from the sys_catalog here. |
5375 | | // TODO: SysCatalog::DeleteItem() if we've DELETED all user tables in a DELETING namespace. |
5376 | | // TODO: Also properly handle namespace_ids_map_.erase(table->namespace_id()) |
5377 | 87 | } |
5378 | 56.6k | } |
5379 | | |
// Reports whether the deletion (or hiding) of the table named by req->table_id() has finished.
//
// resp->done is set to:
//  - true  if the table id is not present in table_ids_map_,
//  - true  if the table is a tablegroup parent table (no wait for the DELETED state),
//  - true  if the table is deleted or hidden,
//  - false otherwise; in that case the pending tablet deletes of every tserver descriptor are
//    logged.
// If the table has neither started deleting nor started hiding, an IllegalState status is
// recorded in resp with code OBJECT_NOT_FOUND and the result of SetupError() is returned.
// Every other path returns Status::OK().
5380 | | Status CatalogManager::IsDeleteTableDone(const IsDeleteTableDoneRequestPB* req, |
5381 | 11.7k | IsDeleteTableDoneResponsePB* resp) { |
5382 | | // Lookup the deleted table. |
5383 | 11.7k | TRACE("Looking up table $0", req->table_id()); |
5384 | 11.7k | scoped_refptr<TableInfo> table; |
5385 | 11.7k | { |
5386 | 11.7k | SharedLock lock(mutex_); |
5387 | 11.7k | table = FindPtrOrNull(*table_ids_map_, req->table_id()); |
5388 | 11.7k | } |
5389 | | |
5390 | 11.7k | if (table == nullptr) { |
5391 | 40 | LOG(INFO) << "Servicing IsDeleteTableDone request for table id " |
5392 | 40 | << req->table_id() << ": deleted (not found)"; |
5393 | 40 | resp->set_done(true); |
5394 | 40 | return Status::OK(); |
5395 | 40 | } |
5396 | | |
5397 | 11.7k | TRACE("Locking table"); |
5398 | 11.7k | auto l = table->LockForRead(); |
5399 | | |
5400 | 11.7k | if (!l->started_deleting() && !l->started_hiding()118 ) { |
5401 | 106 | LOG(WARNING) << "Servicing IsDeleteTableDone request for table id " |
5402 | 106 | << req->table_id() << ": NOT deleted"; |
5403 | 106 | Status s = STATUS(IllegalState, "The object was NOT deleted", l->pb.state_msg()); |
5404 | 106 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); |
5405 | 106 | } |
5406 | | |
5407 | | // Temporary fix for github issue #5290. |
5408 | | // TODO: Wait till deletion completed for tablegroup parent table. |
5409 | 11.6k | if (table->IsTablegroupParentTable()) { |
5410 | 0 | LOG(INFO) << "Servicing IsDeleteTableDone request for tablegroup parent table id " |
5411 | 0 | << req->table_id() << ": deleting. Skipping wait for DELETED state."; |
5412 | 0 | resp->set_done(true); |
5413 | 0 | return Status::OK(); |
5414 | 0 | } |
5415 | | |
5416 | 11.6k | if (l->is_deleted() || l->is_hidden()5.96k ) { |
5417 | 5.66k | LOG(INFO) << "Servicing IsDeleteTableDone request for table id " |
5418 | 5.66k | << req->table_id() << ": totally " << (l->is_hidden() ? "hidden"4 : "deleted"5.66k ); |
5419 | 5.66k | resp->set_done(true); |
5420 | 5.96k | } else { |
5421 | 5.96k | LOG(INFO) << "Servicing IsDeleteTableDone request for table id " << req->table_id() |
5422 | 5.96k | << ((!table->IsColocatedUserTable()) ? ": deleting tablets"5.88k : ""79 ); |
5423 | | |
// Diagnostic only: the descriptors are logged and do not influence the response.
5424 | 5.96k | std::vector<std::shared_ptr<TSDescriptor>> descs; |
5425 | 5.96k | master_->ts_manager()->GetAllDescriptors(&descs); |
5426 | 17.8k | for (auto& ts_desc : descs) { |
5427 | 17.8k | LOG(INFO) << "Deleting on " << ts_desc->permanent_uuid() << ": " |
5428 | 17.8k | << ts_desc->PendingTabletDeleteToString(); |
5429 | 17.8k | } |
5430 | | |
5431 | 5.96k | resp->set_done(false); |
5432 | 5.96k | } |
5433 | | |
5434 | 11.6k | return Status::OK(); |
5435 | 11.6k | } |
5436 | | |
5437 | | namespace { |
5438 | | |
// Applies the schema steps and property changes of an AlterTable request on top of the table's
// current schema.
//
// Params:
//   clock           - supplies the timestamp recorded with each DDL log entry.
//   table_id        - id stored in each DDL log entry.
//   current_pb      - current table entry; source of the schema and of the next column id.
//   req             - request holding alter_schema_steps and optional alter_properties.
//   new_schema      - out: the schema built after all steps.
//   next_col_id     - out: the builder's next column id after all steps.
//   ddl_log_entries - out: one entry appended per applied step.
//
// Returns InvalidArgument when a step lacks its payload, an added column carries an id, a key
// column is dropped, or the step type is not ADD_COLUMN / DROP_COLUMN / RENAME_COLUMN; otherwise
// propagates any failure from SchemaFromPB() or the schema builder calls. new_schema and
// next_col_id are only written on success; ddl_log_entries may already hold entries for earlier
// steps when a later step fails.
5439 | | CHECKED_STATUS ApplyAlterSteps(server::Clock* clock, |
5440 | | const TableId& table_id, |
5441 | | const SysTablesEntryPB& current_pb, |
5442 | | const AlterTableRequestPB* req, |
5443 | | Schema* new_schema, |
5444 | | ColumnId* next_col_id, |
5445 | 580 | std::vector<DdlLogEntry>* ddl_log_entries) { |
5446 | 580 | const SchemaPB& current_schema_pb = current_pb.schema(); |
5447 | 580 | Schema cur_schema; |
5448 | 580 | RETURN_NOT_OK(SchemaFromPB(current_schema_pb, &cur_schema)); |
5449 | | |
5450 | 580 | SchemaBuilder builder(cur_schema); |
5451 | 580 | if (current_pb.has_next_column_id()) { |
5452 | 580 | builder.set_next_column_id(ColumnId(current_pb.next_column_id())); |
5453 | 580 | } |
5454 | 580 | if (current_pb.has_colocated() && current_pb.colocated()6 ) { |
5455 | 6 | if (current_schema_pb.table_properties().is_ysql_catalog_table()) { |
// The cotable id is parsed from the table id carried in the request.
5456 | 0 | Uuid cotable_id; |
5457 | 0 | RETURN_NOT_OK(cotable_id.FromHexString(req->table().table_id())); |
5458 | 0 | builder.set_cotable_id(cotable_id); |
5459 | 0 | } |
5460 | | // Colocation ID is set in schema and cannot be altered. |
5461 | 6 | } |
5462 | | |
5463 | 600 | for (const AlterTableRequestPB::Step& step : req->alter_schema_steps())580 { |
// One timestamp is taken per step and used for that step's DDL log entry.
5464 | 600 | auto time = clock->Now(); |
5465 | 600 | switch (step.type()) { |
5466 | 339 | case AlterTableRequestPB::ADD_COLUMN: { |
5467 | 339 | if (!step.has_add_column()) { |
5468 | 0 | return STATUS(InvalidArgument, "ADD_COLUMN missing column info"); |
5469 | 0 | } |
5470 | | |
// NOTE(review): the code under the comment below rejects a client-supplied column id; no
// encoding check is visible here - the comment may be stale.
5471 | | // Verify that encoding is appropriate for the new column's type. |
5472 | 339 | ColumnSchemaPB new_col_pb = step.add_column().schema(); |
5473 | 339 | if (new_col_pb.has_id()) { |
5474 | 0 | return STATUS_SUBSTITUTE(InvalidArgument, |
5475 | 0 | "column $0: client should not specify column id", new_col_pb.ShortDebugString()); |
5476 | 0 | } |
5477 | 339 | ColumnSchema new_col = ColumnSchemaFromPB(new_col_pb); |
5478 | | |
5479 | 339 | RETURN_NOT_OK(builder.AddColumn(new_col, false)); |
5480 | 339 | ddl_log_entries->emplace_back(time, table_id, current_pb, Format("Add column $0", new_col)); |
5481 | 339 | break; |
5482 | 339 | } |
5483 | | |
5484 | 232 | case AlterTableRequestPB::DROP_COLUMN: { |
5485 | 232 | if (!step.has_drop_column()) { |
5486 | 0 | return STATUS(InvalidArgument, "DROP_COLUMN missing column info"); |
5487 | 0 | } |
5488 | | |
// The key-column check consults cur_schema, i.e. the schema as it was before any step of this
// request was applied.
5489 | 232 | if (cur_schema.is_key_column(step.drop_column().name())) { |
5490 | 0 | return STATUS(InvalidArgument, "cannot remove a key column"); |
5491 | 0 | } |
5492 | | |
5493 | 232 | RETURN_NOT_OK(builder.RemoveColumn(step.drop_column().name())); |
5494 | 232 | ddl_log_entries->emplace_back( |
5495 | 232 | time, table_id, current_pb, Format("Drop column $0", step.drop_column().name())); |
5496 | 232 | break; |
5497 | 232 | } |
5498 | | |
5499 | 29 | case AlterTableRequestPB::RENAME_COLUMN: { |
5500 | 29 | if (!step.has_rename_column()) { |
5501 | 0 | return STATUS(InvalidArgument, "RENAME_COLUMN missing column info"); |
5502 | 0 | } |
5503 | | |
5504 | 29 | RETURN_NOT_OK(builder.RenameColumn( |
5505 | 29 | step.rename_column().old_name(), |
5506 | 29 | step.rename_column().new_name())); |
5507 | 29 | ddl_log_entries->emplace_back( |
5508 | 29 | time, table_id, current_pb, |
5509 | 29 | Format("Rename column $0 => $1", step.rename_column().old_name(), |
5510 | 29 | step.rename_column().new_name())); |
5511 | 29 | break; |
5512 | 29 | } |
5513 | | |
5514 | | // TODO: EDIT_COLUMN. |
5515 | | |
5516 | 0 | default: { |
5517 | 0 | return STATUS_SUBSTITUTE(InvalidArgument, "Invalid alter step type: $0", step.type()); |
5518 | 29 | } |
5519 | 600 | } |
5520 | 600 | } |
5521 | | |
// Property changes are applied after all schema steps.
5522 | 580 | if (req->has_alter_properties()) { |
5523 | 7 | RETURN_NOT_OK(builder.AlterProperties(req->alter_properties())); |
5524 | 7 | } |
5525 | | |
5526 | 580 | *new_schema = builder.Build(); |
5527 | 580 | *next_col_id = builder.next_column_id(); |
5528 | 580 | return Status::OK(); |
5529 | 580 | } |
5530 | | |
5531 | | } // namespace |
5532 | | |
5533 | | Status CatalogManager::AlterTable(const AlterTableRequestPB* req, |
5534 | | AlterTableResponsePB* resp, |
5535 | 5.89k | rpc::RpcContext* rpc) { |
5536 | 5.89k | LOG_WITH_PREFIX(INFO) << "Servicing " << __func__ << " request from " << RequestorString(rpc) |
5537 | 5.89k | << ": " << req->ShortDebugString(); |
5538 | | |
5539 | 5.89k | std::vector<DdlLogEntry> ddl_log_entries; |
5540 | | |
5541 | | // Lookup the table and verify if it exists. |
5542 | 5.89k | TRACE("Looking up table"); |
5543 | 5.89k | scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table())); |
5544 | | |
5545 | 0 | NamespaceId new_namespace_id; |
5546 | | |
5547 | 5.89k | if (req->has_new_namespace()) { |
5548 | | // Lookup the new namespace and verify if it exists. |
5549 | 119 | TRACE("Looking up new namespace"); |
5550 | 119 | scoped_refptr<NamespaceInfo> ns; |
5551 | 119 | NamespaceIdentifierPB namespace_identifier = req->new_namespace(); |
5552 | | // Use original namespace_id as new_namespace_id for YSQL tables. |
5553 | 119 | if (table->GetTableType() == PGSQL_TABLE_TYPE && !namespace_identifier.has_id()115 ) { |
5554 | 115 | namespace_identifier.set_id(table->namespace_id()); |
5555 | 115 | } |
5556 | 119 | ns = VERIFY_NAMESPACE_FOUND117 (117 FindNamespace(namespace_identifier), resp); |
5557 | | |
5558 | 117 | auto ns_lock = ns->LockForRead(); |
5559 | 117 | new_namespace_id = ns->id(); |
5560 | | // Don't use Namespaces that aren't running. |
5561 | 117 | if (ns->state() != SysNamespaceEntryPB::RUNNING) { |
5562 | 0 | Status s = STATUS_SUBSTITUTE(TryAgain, |
5563 | 0 | "Namespace not running (State=$0). Cannot create $1.$2", |
5564 | 0 | SysNamespaceEntryPB::State_Name(ns->state()), ns->name(), table->name() ); |
5565 | 0 | return SetupError(resp->mutable_error(), NamespaceMasterError(ns->state()), s); |
5566 | 0 | } |
5567 | 117 | } |
5568 | 5.89k | if (req->has_new_namespace() || req->has_new_table_name()5.77k ) { |
5569 | 117 | if (new_namespace_id.empty()) { |
5570 | 0 | const Status s = STATUS(InvalidArgument, "No namespace used"); |
5571 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::NO_NAMESPACE_USED, s); |
5572 | 0 | } |
5573 | 117 | } |
5574 | | |
5575 | 5.89k | TRACE("Locking table"); |
5576 | 5.89k | auto l = table->LockForWrite(); |
5577 | 5.89k | RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp)); |
5578 | | |
5579 | 5.89k | bool has_changes = false; |
5580 | 5.89k | auto& table_pb = l.mutable_data()->pb; |
5581 | 5.89k | const TableName table_name = l->name(); |
5582 | 5.89k | const NamespaceId namespace_id = l->namespace_id(); |
5583 | 5.89k | const TableName new_table_name = req->has_new_table_name() ? req->new_table_name()115 : table_name5.78k ; |
5584 | | |
5585 | | // Calculate new schema for the on-disk state, not persisted yet. |
5586 | 5.89k | Schema new_schema; |
5587 | 5.89k | ColumnId next_col_id = ColumnId(l->pb.next_column_id()); |
5588 | 5.89k | if (req->alter_schema_steps_size() || req->has_alter_properties()5.32k ) { |
5589 | 580 | TRACE("Apply alter schema"); |
5590 | 580 | Status s = ApplyAlterSteps( |
5591 | 580 | master_->clock(), table->id(), l->pb, req, &new_schema, &next_col_id, &ddl_log_entries); |
5592 | 580 | if (!s.ok()) { |
5593 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
5594 | 0 | } |
5595 | 580 | DCHECK_NE(next_col_id, 0); |
5596 | 580 | DCHECK_EQ(new_schema.find_column_by_id(next_col_id), |
5597 | 580 | static_cast<int>(Schema::kColumnNotFound)); |
5598 | 580 | has_changes = true; |
5599 | 580 | } |
5600 | | |
5601 | | // Try to acquire the new table name. |
5602 | 5.89k | if (req->has_new_namespace() || req->has_new_table_name()5.77k ) { |
5603 | | |
5604 | | // Postgres handles name uniqueness constraints in its own layer. |
5605 | 117 | if (l->table_type() != PGSQL_TABLE_TYPE) { |
5606 | 2 | LockGuard lock(mutex_); |
5607 | 2 | VLOG_WITH_FUNC0 (3) << "Acquired the catalog manager lock"0 ; |
5608 | | |
5609 | 2 | TRACE("Acquired catalog manager lock"); |
5610 | | |
5611 | | // Verify that the table does not exist. |
5612 | 2 | scoped_refptr<TableInfo> other_table = FindPtrOrNull( |
5613 | 2 | table_names_map_, {new_namespace_id, new_table_name}); |
5614 | 2 | if (other_table != nullptr) { |
5615 | 1 | Status s = STATUS_SUBSTITUTE(AlreadyPresent, |
5616 | 1 | "Object '$0.$1' already exists", |
5617 | 1 | GetNamespaceNameUnlocked(new_namespace_id), other_table->name()); |
5618 | 1 | LOG(WARNING) << "Found table: " << other_table->ToStringWithState() |
5619 | 1 | << ". Failed alterring table with error: " |
5620 | 1 | << s.ToString() << " Request:\n" << req->DebugString(); |
5621 | 1 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s); |
5622 | 1 | } |
5623 | | |
5624 | | // Acquire the new table name (now we have 2 names for the same table). |
5625 | 1 | table_names_map_[{new_namespace_id, new_table_name}] = table; |
5626 | 1 | } |
5627 | | |
5628 | 116 | table_pb.set_namespace_id(new_namespace_id); |
5629 | 116 | table_pb.set_name(new_table_name); |
5630 | | |
5631 | 116 | has_changes = true; |
5632 | 116 | } |
5633 | | |
5634 | | // Check if there has been any changes to the placement policies for this table. |
5635 | 5.89k | if (req->has_replication_info()) { |
5636 | | // If this is a colocated table, it does not make sense to set placement |
5637 | | // policy for this table, as the tablet associated with it is shared by |
5638 | | // multiple tables. |
5639 | 4 | if (table->colocated()) { |
5640 | 0 | const Status s = STATUS(InvalidArgument, |
5641 | 0 | "Placement policy cannot be altered for a colocated table"); |
5642 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s); |
5643 | 0 | } |
5644 | 4 | if (table->GetTableType() == PGSQL_TABLE_TYPE) { |
5645 | 0 | const Status s = STATUS(InvalidArgument, |
5646 | 0 | "Placement policy cannot be altered for YSQL tables, use Tablespaces"); |
5647 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s); |
5648 | 0 | } |
5649 | | // Validate table replication info. |
5650 | 4 | RETURN_NOT_OK(ValidateTableReplicationInfo(req->replication_info())); |
5651 | 4 | table_pb.mutable_replication_info()->CopyFrom(req->replication_info()); |
5652 | 4 | has_changes = true; |
5653 | 4 | } |
5654 | | |
5655 | | // TODO(hector): Simplify the AlterSchema workflow to avoid doing the same checks on every layer |
5656 | | // this request goes through: https://github.com/YugaByte/yugabyte-db/issues/1882. |
5657 | 5.89k | if (req->has_wal_retention_secs()) { |
5658 | 5.19k | if (has_changes) { |
5659 | 0 | const Status s = STATUS(InvalidArgument, |
5660 | 0 | "wal_retention_secs cannot be altered concurrently with other properties"); |
5661 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s); |
5662 | 0 | } |
5663 | | // TODO(hector): Handle co-partitioned tables: |
5664 | | // https://github.com/YugaByte/yugabyte-db/issues/1905. |
5665 | 5.19k | table_pb.set_wal_retention_secs(req->wal_retention_secs()); |
5666 | 5.19k | has_changes = true; |
5667 | 5.19k | } |
5668 | | |
5669 | 5.89k | if (!has_changes) { |
5670 | 0 | if (req->has_force_send_alter_request() && req->force_send_alter_request()) { |
5671 | 0 | RETURN_NOT_OK(SendAlterTableRequest(table, req)); |
5672 | 0 | } |
5673 | | // Skip empty requests... |
5674 | 0 | return Status::OK(); |
5675 | 0 | } |
5676 | | |
5677 | | // Serialize the schema Increment the version number. |
5678 | 5.89k | if (new_schema.initialized()) { |
5679 | 580 | if (!l->pb.has_fully_applied_schema()) { |
5680 | | // The idea here is that if we are in the middle of updating the schema |
5681 | | // from one state to another, then YBClients will be given the older |
5682 | | // version until the schema is updated on all the tablets. |
5683 | | // As of Dec 2019, this may lead to some rejected operations/retries during |
5684 | | // the index backfill. See #3284 for possible optimizations. |
5685 | 580 | MultiStageAlterTable::CopySchemaDetailsToFullyApplied(&table_pb); |
5686 | 580 | } |
5687 | 580 | SchemaToPB(new_schema, table_pb.mutable_schema()); |
5688 | 580 | } |
5689 | | |
5690 | | // Only increment the version number if it is a schema change (AddTable change goes through a |
5691 | | // different path and it's not processed here). |
5692 | 5.89k | if (!req->has_wal_retention_secs()) { |
5693 | 700 | table_pb.set_version(table_pb.version() + 1); |
5694 | 700 | table_pb.set_updates_only_index_permissions(false); |
5695 | 700 | } |
5696 | 5.89k | table_pb.set_next_column_id(next_col_id); |
5697 | 5.89k | l.mutable_data()->set_state( |
5698 | 5.89k | SysTablesEntryPB::ALTERING, |
5699 | 5.89k | Substitute("Alter table version=$0 ts=$1", table_pb.version(), LocalTimeAsString())); |
5700 | | |
5701 | | // Update sys-catalog with the new table schema. |
5702 | 5.89k | TRACE("Updating metadata on disk"); |
5703 | 5.89k | std::vector<const DdlLogEntry*> ddl_log_entry_pointers; |
5704 | 5.89k | ddl_log_entry_pointers.reserve(ddl_log_entries.size()); |
5705 | 5.89k | for (const auto& entry : ddl_log_entries) { |
5706 | 600 | ddl_log_entry_pointers.push_back(&entry); |
5707 | 600 | } |
5708 | 5.89k | Status s = sys_catalog_->Upsert(leader_ready_term(), ddl_log_entry_pointers, table); |
5709 | 5.89k | if (!s.ok()) { |
5710 | 2 | s = s.CloneAndPrepend( |
5711 | 2 | Substitute("An error occurred while updating sys-catalog tables entry: $0", |
5712 | 2 | s.ToString())); |
5713 | 2 | LOG(WARNING) << s.ToString(); |
5714 | 2 | if (table->GetTableType() != PGSQL_TABLE_TYPE && |
5715 | 2 | (0 req->has_new_namespace()0 || req->has_new_table_name()0 )) { |
5716 | 0 | LockGuard lock(mutex_); |
5717 | 0 | VLOG_WITH_FUNC(3) << "Acquired the catalog manager lock"; |
5718 | 0 | CHECK_EQ(table_names_map_.erase({new_namespace_id, new_table_name}), 1); |
5719 | 0 | } |
5720 | | // TableMetadaLock follows RAII paradigm: when it leaves scope, |
5721 | | // 'l' will be unlocked, and the mutation will be aborted. |
5722 | 2 | return CheckIfNoLongerLeaderAndSetupError(s, resp); |
5723 | 2 | } |
5724 | | |
5725 | | // Remove the old name. Not present if PGSQL. |
5726 | 5.89k | if (table->GetTableType() != PGSQL_TABLE_TYPE && |
5727 | 5.89k | (183 req->has_new_namespace()183 || req->has_new_table_name()182 )) { |
5728 | 1 | TRACE("Removing (namespace, table) combination ($0, $1) from by-name map", |
5729 | 1 | namespace_id, table_name); |
5730 | 1 | LockGuard lock(mutex_); |
5731 | 1 | table_names_map_.erase({namespace_id, table_name}); |
5732 | 1 | } |
5733 | | |
5734 | | // Update the in-memory state. |
5735 | 5.89k | TRACE("Committing in-memory state"); |
5736 | 5.89k | l.Commit(); |
5737 | | |
5738 | 5.89k | RETURN_NOT_OK(SendAlterTableRequest(table, req)); |
5739 | | |
5740 | | // Increment transaction status version if needed. |
5741 | 5.89k | if (table->GetTableType() == TableType::TRANSACTION_STATUS_TABLE_TYPE) { |
5742 | 0 | RETURN_NOT_OK(IncrementTransactionTablesVersion()); |
5743 | 0 | } |
5744 | | |
5745 | 5.89k | LOG(INFO) << "Successfully initiated ALTER TABLE (pending tablet schema updates) for " |
5746 | 5.89k | << table->ToString() << " per request from " << RequestorString(rpc); |
5747 | 5.89k | return Status::OK(); |
5748 | 5.89k | } |
5749 | | |
5750 | | Status CatalogManager::IsAlterTableDone(const IsAlterTableDoneRequestPB* req, |
5751 | 1.39k | IsAlterTableDoneResponsePB* resp) { |
5752 | | // 1. Lookup the table and verify if it exists. |
5753 | 1.39k | TRACE("Looking up table"); |
5754 | 1.39k | scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table())); |
5755 | | |
5756 | 1.39k | TRACE("Locking table"); |
5757 | 1.39k | auto l = table->LockForRead(); |
5758 | 1.39k | RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp)); |
5759 | | |
5760 | | // 2. Verify if the alter is in-progress. |
5761 | 1.39k | TRACE("Verify if there is an alter operation in progress for $0", table->ToString()); |
5762 | 1.39k | resp->set_schema_version(l->pb.version()); |
5763 | 1.39k | resp->set_done(l->pb.state() != SysTablesEntryPB::ALTERING); |
5764 | | |
5765 | 1.39k | return Status::OK(); |
5766 | 1.39k | } |
5767 | | |
5768 | | Result<TabletInfoPtr> CatalogManager::RegisterNewTabletForSplit( |
5769 | | TabletInfo* source_tablet_info, const PartitionPB& partition, |
5770 | 88 | TableInfo::WriteLock* table_write_lock, TabletInfo::WriteLock* tablet_write_lock) { |
5771 | 88 | const auto tablet_lock = source_tablet_info->LockForRead(); |
5772 | | |
5773 | 88 | auto table = source_tablet_info->table(); |
5774 | 88 | TabletInfoPtr new_tablet; |
5775 | 88 | { |
5776 | 88 | LockGuard lock(mutex_); |
5777 | 88 | new_tablet = CreateTabletInfo(table.get(), partition); |
5778 | 88 | } |
5779 | 88 | const auto& source_tablet_meta = tablet_lock->pb; |
5780 | | |
5781 | 88 | auto& new_tablet_meta = new_tablet->mutable_metadata()->mutable_dirty()->pb; |
5782 | 88 | new_tablet_meta.set_state(SysTabletsEntryPB::CREATING); |
5783 | 88 | new_tablet_meta.mutable_committed_consensus_state()->CopyFrom( |
5784 | 88 | source_tablet_meta.committed_consensus_state()); |
5785 | 88 | new_tablet_meta.set_split_depth(source_tablet_meta.split_depth() + 1); |
5786 | 88 | new_tablet_meta.set_split_parent_tablet_id(source_tablet_info->tablet_id()); |
5787 | | // TODO(tsplit): consider and handle failure scenarios, for example: |
5788 | | // - Crash or leader failover before sending out the split tasks. |
5789 | | // - Long enough partition while trying to send out the splits so that they timeout and |
5790 | | // not get executed. |
5791 | 88 | int new_partition_list_version; |
5792 | 88 | { |
5793 | 88 | LockGuard lock(mutex_); |
5794 | | |
5795 | 88 | auto& table_pb = table_write_lock->mutable_data()->pb; |
5796 | 88 | new_partition_list_version = table_pb.partition_list_version() + 1; |
5797 | 88 | table_pb.set_partition_list_version(new_partition_list_version); |
5798 | | |
5799 | 88 | tablet_write_lock->mutable_data()->pb.add_split_tablet_ids(new_tablet->id()); |
5800 | 88 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), table, new_tablet, source_tablet_info)); |
5801 | | |
5802 | 88 | MAYBE_FAULT(FLAGS_TEST_crash_after_creating_single_split_tablet); |
5803 | | |
5804 | 88 | table->AddTablet(new_tablet); |
5805 | | // TODO: We use this pattern in other places, but what if concurrent thread accesses not yet |
5806 | | // committed TabletInfo from the `table` ? |
5807 | 88 | new_tablet->mutable_metadata()->CommitMutation(); |
5808 | | |
5809 | 88 | auto tablet_map_checkout = tablet_map_.CheckOut(); |
5810 | 88 | (*tablet_map_checkout)[new_tablet->id()] = new_tablet; |
5811 | 88 | } |
5812 | 0 | LOG(INFO) << "Registered new tablet " << new_tablet->tablet_id() |
5813 | 88 | << " (" << AsString(partition) << ") to split the tablet " |
5814 | 88 | << source_tablet_info->tablet_id() |
5815 | 88 | << " (" << AsString(source_tablet_meta.partition()) |
5816 | 88 | << ") for table " << table->ToString() |
5817 | 88 | << ", new partition_list_version: " << new_partition_list_version; |
5818 | | |
5819 | 88 | return new_tablet; |
5820 | 88 | } |
5821 | | |
5822 | | Status CatalogManager::GetTableSchema(const GetTableSchemaRequestPB* req, |
5823 | 171k | GetTableSchemaResponsePB* resp) { |
5824 | 171k | VLOG(1) << "Servicing GetTableSchema request for " << req->ShortDebugString()3 ; |
5825 | | |
5826 | | // Lookup the table and verify if it exists. |
5827 | 171k | TRACE("Looking up table"); |
5828 | 171k | scoped_refptr<TableInfo> table = VERIFY_RESULT168k (FindTable(req->table()));168k |
5829 | | |
5830 | | // Due to differences in the way proxies handle version mismatch (pull for yql vs push for sql). |
5831 | | // For YQL tables, we will return the "set of indexes" being applied instead of the ones |
5832 | | // that are fully completed. |
5833 | | // For PGSQL (and other) tables we want to return the fully applied schema. |
5834 | 0 | const bool get_fully_applied_indexes = table->GetTableType() != TableType::YQL_TABLE_TYPE; |
5835 | 168k | return GetTableSchemaInternal(req, resp, get_fully_applied_indexes); |
5836 | 171k | } |
5837 | | |
5838 | | Status CatalogManager::GetTableSchemaInternal(const GetTableSchemaRequestPB* req, |
5839 | | GetTableSchemaResponsePB* resp, |
5840 | 170k | bool get_fully_applied_indexes) { |
5841 | 170k | VLOG(1) << "Servicing GetTableSchema request for " << req->ShortDebugString()10 ; |
5842 | | |
5843 | | // Lookup the table and verify if it exists. |
5844 | 170k | TRACE("Looking up table"); |
5845 | 170k | scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table())); |
5846 | | |
5847 | 170k | TRACE("Locking table"); |
5848 | 170k | auto l = table->LockForRead(); |
5849 | 170k | RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp)); |
5850 | | |
5851 | 169k | if (l->pb.has_fully_applied_schema()) { |
5852 | | // An AlterTable is in progress; fully_applied_schema is the last |
5853 | | // schema that has reached every TS. |
5854 | 1.20k | DCHECK(l->pb.state() == SysTablesEntryPB::ALTERING); |
5855 | 1.20k | resp->mutable_schema()->CopyFrom(l->pb.fully_applied_schema()); |
5856 | 168k | } else { |
5857 | | // There's no AlterTable, the regular schema is "fully applied". |
5858 | 168k | resp->mutable_schema()->CopyFrom(l->pb.schema()); |
5859 | 168k | } |
5860 | | |
5861 | 169k | if (get_fully_applied_indexes && l->pb.has_fully_applied_schema()93.9k ) { |
5862 | 167 | resp->set_version(l->pb.fully_applied_schema_version()); |
5863 | 167 | resp->mutable_indexes()->CopyFrom(l->pb.fully_applied_indexes()); |
5864 | 167 | if (l->pb.has_fully_applied_index_info()) { |
5865 | 0 | resp->set_obsolete_indexed_table_id(GetIndexedTableId(l->pb)); |
5866 | 0 | *resp->mutable_index_info() = l->pb.fully_applied_index_info(); |
5867 | 0 | } |
5868 | 167 | VLOG(1) << "Returning" |
5869 | 0 | << "\nfully_applied_schema with version " |
5870 | 0 | << l->pb.fully_applied_schema_version() |
5871 | 0 | << ":\n" |
5872 | 0 | << yb::ToString(l->pb.fully_applied_indexes()) |
5873 | 0 | << "\ninstead of schema with version " |
5874 | 0 | << l->pb.version() |
5875 | 0 | << ":\n" |
5876 | 0 | << yb::ToString(l->pb.indexes()); |
5877 | 169k | } else { |
5878 | 169k | resp->set_version(l->pb.version()); |
5879 | 169k | resp->mutable_indexes()->CopyFrom(l->pb.indexes()); |
5880 | 169k | if (l->pb.has_index_info()) { |
5881 | 36.8k | resp->set_obsolete_indexed_table_id(GetIndexedTableId(l->pb)); |
5882 | 36.8k | *resp->mutable_index_info() = l->pb.index_info(); |
5883 | 36.8k | } |
5884 | 169k | VLOG(3) << "Returning" |
5885 | 1 | << "\nschema with version " |
5886 | 1 | << l->pb.version() |
5887 | 1 | << ":\n" |
5888 | 1 | << yb::ToString(l->pb.indexes()); |
5889 | 169k | } |
5890 | 169k | resp->set_is_compatible_with_previous_version(l->pb.updates_only_index_permissions()); |
5891 | 169k | resp->mutable_partition_schema()->CopyFrom(l->pb.partition_schema()); |
5892 | 169k | if (IsReplicationInfoSet(l->pb.replication_info())) { |
5893 | 2 | resp->mutable_replication_info()->CopyFrom(l->pb.replication_info()); |
5894 | 2 | } |
5895 | 169k | resp->set_create_table_done(!table->IsCreateInProgress()); |
5896 | 169k | resp->set_table_type(table->metadata().state().pb.table_type()); |
5897 | 169k | resp->mutable_identifier()->set_table_name(l->pb.name()); |
5898 | 169k | resp->mutable_identifier()->set_table_id(table->id()); |
5899 | 169k | resp->mutable_identifier()->mutable_namespace_()->set_id(table->namespace_id()); |
5900 | 169k | auto nsinfo = FindNamespaceById(table->namespace_id()); |
5901 | 169k | if (nsinfo.ok()169k ) { |
5902 | 169k | resp->mutable_identifier()->mutable_namespace_()->set_name((**nsinfo).name()); |
5903 | 169k | } |
5904 | | |
5905 | 169k | if (l->pb.has_wal_retention_secs()) { |
5906 | 5.03k | resp->set_wal_retention_secs(l->pb.wal_retention_secs()); |
5907 | 5.03k | } |
5908 | | |
5909 | | // Get namespace name by id. |
5910 | 169k | SharedLock lock(mutex_); |
5911 | 169k | TRACE("Looking up namespace"); |
5912 | 169k | const scoped_refptr<NamespaceInfo> ns = FindPtrOrNull(namespace_ids_map_, table->namespace_id()); |
5913 | | |
5914 | 169k | if (ns == nullptr) { |
5915 | 0 | Status s = STATUS_SUBSTITUTE( |
5916 | 0 | NotFound, "Could not find namespace by namespace id $0 for request $1.", |
5917 | 0 | table->namespace_id(), req->DebugString()); |
5918 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s); |
5919 | 0 | } |
5920 | | |
5921 | 169k | resp->mutable_identifier()->mutable_namespace_()->set_name(ns->name()); |
5922 | | |
5923 | 169k | resp->set_colocated(table->colocated()); |
5924 | | |
5925 | 18.4E | VLOG(1) << "Serviced GetTableSchema request for " << req->ShortDebugString() << " with " |
5926 | 18.4E | << yb::ToString(*resp); |
5927 | 169k | return Status::OK(); |
5928 | 169k | } |
5929 | | |
5930 | | Status CatalogManager::GetTablegroupSchema(const GetTablegroupSchemaRequestPB* req, |
5931 | 0 | GetTablegroupSchemaResponsePB* resp) { |
5932 | 0 | VLOG(1) << "Servicing GetTablegroupSchema request for " << req->ShortDebugString(); |
5933 | 0 | if (!req->parent_tablegroup().has_id()) { |
5934 | 0 | Status s = STATUS(InvalidArgument, "Invalid get tablegroup request (missing fields)"); |
5935 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
5936 | 0 | } |
5937 | | |
5938 | 0 | const std::string& tablegroupId = req->parent_tablegroup().id(); |
5939 | 0 | if (!IsTablegroupParentTableId(tablegroupId)) { |
5940 | 0 | Status s = STATUS(InvalidArgument, "Received a non tablegroup ID"); |
5941 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
5942 | 0 | } |
5943 | | |
5944 | | // Strip the suffix from the tablegroup ID request (since tablegroup_ids_map_ |
5945 | | // only accepts the plain ID). |
5946 | 0 | DCHECK(boost::algorithm::ends_with(tablegroupId, master::kTablegroupParentTableIdSuffix)); |
5947 | 0 | size_t tgid_len = tablegroupId.size() - strlen(master::kTablegroupParentTableIdSuffix); |
5948 | 0 | TablegroupId tgid = tablegroupId.substr(0, tgid_len); |
5949 | | |
5950 | | // Lookup the tablegroup. |
5951 | 0 | std::unordered_set<TableId> tablesInTablegroup; |
5952 | 0 | { |
5953 | 0 | SharedLock lock(mutex_); |
5954 | |
|
5955 | 0 | if (tablegroup_ids_map_.find(tgid) == tablegroup_ids_map_.end()) { |
5956 | 0 | return STATUS(NotFound, Substitute("Tablegroup not found for tablegroup id: $0", |
5957 | 0 | req->parent_tablegroup().id())); |
5958 | 0 | } |
5959 | 0 | scoped_refptr<TablegroupInfo> tginfo = tablegroup_ids_map_[tgid]; |
5960 | 0 | tablesInTablegroup = tginfo->ChildTables(); |
5961 | 0 | } |
5962 | | |
5963 | 0 | for (const auto& t : tablesInTablegroup) { |
5964 | 0 | TRACE("Looking up table"); |
5965 | 0 | GetTableSchemaRequestPB schemaReq; |
5966 | 0 | GetTableSchemaResponsePB schemaResp; |
5967 | 0 | schemaReq.mutable_table()->set_table_id(t); |
5968 | 0 | Status s = GetTableSchema(&schemaReq, &schemaResp); |
5969 | 0 | if (!s.ok() || schemaResp.has_error()) { |
5970 | 0 | LOG(ERROR) << "Error while getting table schema: " << s; |
5971 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); |
5972 | 0 | } |
5973 | 0 | resp->add_get_table_schema_response_pbs()->Swap(&schemaResp); |
5974 | 0 | } |
5975 | | |
5976 | 0 | return Status::OK(); |
5977 | 0 | } |
5978 | | |
5979 | | Status CatalogManager::GetColocatedTabletSchema(const GetColocatedTabletSchemaRequestPB* req, |
5980 | 0 | GetColocatedTabletSchemaResponsePB* resp) { |
5981 | 0 | VLOG(1) << "Servicing GetColocatedTabletSchema request for " << req->ShortDebugString(); |
5982 | | |
5983 | | // Lookup the given parent colocated table and verify if it exists. |
5984 | 0 | TRACE("Looking up table"); |
5985 | 0 | auto parent_colocated_table = VERIFY_RESULT(FindTable(req->parent_colocated_table())); |
5986 | 0 | { |
5987 | 0 | TRACE("Locking table"); |
5988 | 0 | auto l = parent_colocated_table->LockForRead(); |
5989 | 0 | RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp)); |
5990 | 0 | } |
5991 | | |
5992 | 0 | if (!parent_colocated_table->colocated() || !parent_colocated_table->IsColocatedParentTable()) { |
5993 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_TABLE_TYPE, |
5994 | 0 | STATUS(InvalidArgument, "Table provided is not a parent colocated table")); |
5995 | 0 | } |
5996 | | |
5997 | | // Next get all the user tables that are in the database. |
5998 | 0 | ListTablesRequestPB listTablesReq; |
5999 | 0 | ListTablesResponsePB ListTablesResp; |
6000 | |
|
6001 | 0 | listTablesReq.mutable_namespace_()->set_id(parent_colocated_table->namespace_id()); |
6002 | 0 | listTablesReq.mutable_namespace_()->set_database_type(YQL_DATABASE_PGSQL); |
6003 | 0 | listTablesReq.set_exclude_system_tables(true); |
6004 | 0 | Status status = ListTables(&listTablesReq, &ListTablesResp); |
6005 | 0 | if (!status.ok() || ListTablesResp.has_error()) { |
6006 | 0 | LOG(ERROR) << "Error while listing tables: " << status; |
6007 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, status); |
6008 | 0 | } |
6009 | | |
6010 | | // Get the table schema for each colocated table. |
6011 | 0 | for (const auto& t : ListTablesResp.tables()) { |
6012 | | // Need to check if this table is colocated first. |
6013 | 0 | TRACE("Looking up table"); |
6014 | 0 | scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTableById(t.id())); |
6015 | | |
6016 | 0 | if (table->colocated()) { |
6017 | | // Now we can get the schema for this table. |
6018 | 0 | GetTableSchemaRequestPB schemaReq; |
6019 | 0 | GetTableSchemaResponsePB schemaResp; |
6020 | 0 | schemaReq.mutable_table()->set_table_id(t.id()); |
6021 | 0 | status = GetTableSchema(&schemaReq, &schemaResp); |
6022 | 0 | if (!status.ok() || schemaResp.has_error()) { |
6023 | 0 | LOG(ERROR) << "Error while getting table schema: " << status; |
6024 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, status); |
6025 | 0 | } |
6026 | 0 | resp->add_get_table_schema_response_pbs()->Swap(&schemaResp); |
6027 | 0 | } |
6028 | 0 | } |
6029 | | |
6030 | 0 | return Status::OK(); |
6031 | 0 | } |
6032 | | |
6033 | | Status CatalogManager::ListTables(const ListTablesRequestPB* req, |
6034 | 2.93k | ListTablesResponsePB* resp) { |
6035 | 2.93k | NamespaceId namespace_id; |
6036 | | |
6037 | | // Validate namespace. |
6038 | 2.93k | if (req->has_namespace_()) { |
6039 | | // Lookup the namespace and verify if it exists. |
6040 | 672 | auto ns = VERIFY_NAMESPACE_FOUND(FindNamespace(req->namespace_()), resp); |
6041 | | |
6042 | 329 | auto ns_lock = ns->LockForRead(); |
6043 | 329 | namespace_id = ns->id(); |
6044 | | |
6045 | | // Don't list tables with a namespace that isn't running. |
6046 | 329 | if (ns->state() != SysNamespaceEntryPB::RUNNING) { |
6047 | 0 | LOG(INFO) << "ListTables request for a Namespace not running (State=" |
6048 | 0 | << SysNamespaceEntryPB::State_Name(ns->state()) << ")"; |
6049 | 0 | return Status::OK(); |
6050 | 0 | } |
6051 | 329 | } |
6052 | | |
6053 | 2.59k | bool has_rel_filter = req->relation_type_filter_size() > 0; |
6054 | 2.59k | bool include_user_table = has_rel_filter ? false310 : true2.28k ; |
6055 | 2.59k | bool include_user_index = has_rel_filter ? false310 : true2.28k ; |
6056 | 2.59k | bool include_system_table = req->exclude_system_tables() ? false368 |
6057 | 2.59k | : (2.22k has_rel_filter2.22k ? false310 : true1.91k ); |
6058 | | |
6059 | 2.59k | for (const auto &relation : req->relation_type_filter()) { |
6060 | 311 | if (relation == SYSTEM_TABLE_RELATION) { |
6061 | 2 | include_system_table = true; |
6062 | 309 | } else if (relation == USER_TABLE_RELATION) { |
6063 | 308 | include_user_table = true; |
6064 | 308 | } else if (1 relation == INDEX_TABLE_RELATION1 ) { |
6065 | 1 | include_user_index = true; |
6066 | 1 | } |
6067 | 311 | } |
6068 | | |
6069 | 2.59k | SharedLock lock(mutex_); |
6070 | 2.59k | RelationType relation_type; |
6071 | | |
6072 | 720k | for (const auto& entry : *table_ids_map_) { |
6073 | 720k | auto& table_info = *entry.second; |
6074 | 720k | auto ltm = table_info.LockForRead(); |
6075 | | |
6076 | 720k | if (!ltm->visible_to_client() && !req->include_not_running()3.00k ) { |
6077 | 3.00k | continue; |
6078 | 3.00k | } |
6079 | | |
6080 | 717k | if (!namespace_id.empty() && namespace_id != table_info.namespace_id()182k ) { |
6081 | 144k | continue; // Skip tables from other namespaces. |
6082 | 144k | } |
6083 | | |
6084 | 572k | if (req->has_name_filter()) { |
6085 | 279k | size_t found = ltm->name().find(req->name_filter()); |
6086 | 279k | if (found == string::npos) { |
6087 | 279k | continue; |
6088 | 279k | } |
6089 | 279k | } |
6090 | | |
6091 | 293k | if (IsUserIndexUnlocked(table_info)) { |
6092 | 164 | if (!include_user_index) { |
6093 | 0 | continue; |
6094 | 0 | } |
6095 | 164 | relation_type = INDEX_TABLE_RELATION; |
6096 | 293k | } else if (IsUserTableUnlocked(table_info)) { |
6097 | 11.8k | if (!include_user_table) { |
6098 | 4 | continue; |
6099 | 4 | } |
6100 | 11.8k | relation_type = USER_TABLE_RELATION; |
6101 | 281k | } else { |
6102 | 281k | if (!include_system_table) { |
6103 | 225k | continue; |
6104 | 225k | } |
6105 | 56.3k | relation_type = SYSTEM_TABLE_RELATION; |
6106 | 56.3k | } |
6107 | | |
6108 | 68.4k | NamespaceIdentifierPB ns_identifier; |
6109 | 68.4k | ns_identifier.set_id(ltm->namespace_id()); |
6110 | 68.4k | auto ns = FindNamespaceUnlocked(ns_identifier); |
6111 | 68.4k | if (!ns.ok() || (**ns).state() != SysNamespaceEntryPB::RUNNING68.4k ) { |
6112 | 2 | if (PREDICT_FALSE(FLAGS_TEST_return_error_if_namespace_not_found)) { |
6113 | 1 | VERIFY_NAMESPACE_FOUND0 (0 std::move(ns), resp); |
6114 | 0 | } |
6115 | 1 | LOG(ERROR) << "Unable to find namespace with id " << ltm->namespace_id() |
6116 | 1 | << " for table " << ltm->name(); |
6117 | 1 | continue; |
6118 | 2 | } |
6119 | | |
6120 | 68.4k | ListTablesResponsePB::TableInfo *table = resp->add_tables(); |
6121 | 68.4k | { |
6122 | 68.4k | auto namespace_lock = (**ns).LockForRead(); |
6123 | 68.4k | table->mutable_namespace_()->set_id((**ns).id()); |
6124 | 68.4k | table->mutable_namespace_()->set_name(namespace_lock->name()); |
6125 | 68.4k | table->mutable_namespace_()->set_database_type(namespace_lock->pb.database_type()); |
6126 | 68.4k | } |
6127 | 68.4k | table->set_id(entry.second->id()); |
6128 | 68.4k | table->set_name(ltm->name()); |
6129 | 68.4k | table->set_table_type(ltm->table_type()); |
6130 | 68.4k | table->set_relation_type(relation_type); |
6131 | 68.4k | table->set_state(ltm->pb.state()); |
6132 | 68.4k | table->set_pgschema_name(ltm->schema().pgschema_name()); |
6133 | 68.4k | } |
6134 | 2.59k | return Status::OK(); |
6135 | 2.59k | } |
6136 | | |
6137 | 0 | boost::optional<TablegroupId> CatalogManager::FindTablegroupByTableId(const TableId& table_id) { |
6138 | 0 | SharedLock lock(mutex_); |
6139 | |
|
6140 | 0 | for (const auto& tablegroup : tablegroup_ids_map_) { |
6141 | 0 | const auto& tgid = tablegroup.first; |
6142 | 0 | const auto& tginfo = tablegroup.second; |
6143 | 0 | for (const auto& t : tginfo->ChildTables()) { |
6144 | 0 | if (table_id == t) { |
6145 | 0 | return boost::optional<TablegroupId>(tgid + kTablegroupParentTableIdSuffix); |
6146 | 0 | } |
6147 | 0 | } |
6148 | 0 | } |
6149 | | |
6150 | 0 | return boost::none; |
6151 | 0 | } |
6152 | | |
6153 | 4.85M | scoped_refptr<TableInfo> CatalogManager::GetTableInfo(const TableId& table_id) { |
6154 | 4.85M | SharedLock lock(mutex_); |
6155 | 4.85M | return FindPtrOrNull(*table_ids_map_, table_id); |
6156 | 4.85M | } |
6157 | | |
6158 | | scoped_refptr<TableInfo> CatalogManager::GetTableInfoFromNamespaceNameAndTableName( |
6159 | 0 | YQLDatabase db_type, const NamespaceName& namespace_name, const TableName& table_name) { |
6160 | 0 | if (db_type == YQL_DATABASE_PGSQL) |
6161 | 0 | return nullptr; |
6162 | 0 | SharedLock lock(mutex_); |
6163 | 0 | const auto ns = FindPtrOrNull(namespace_names_mapper_[db_type], namespace_name); |
6164 | 0 | return ns |
6165 | 0 | ? FindPtrOrNull(table_names_map_, {ns->id(), table_name}) |
6166 | 0 | : nullptr; |
6167 | 0 | } |
6168 | | |
6169 | 1.78M | scoped_refptr<TableInfo> CatalogManager::GetTableInfoUnlocked(const TableId& table_id) { |
6170 | 1.78M | return FindPtrOrNull(*table_ids_map_, table_id); |
6171 | 1.78M | } |
6172 | | |
6173 | 47.5k | std::vector<TableInfoPtr> CatalogManager::GetTables(GetTablesMode mode) { |
6174 | 47.5k | std::vector<TableInfoPtr> result; |
6175 | 47.5k | { |
6176 | 47.5k | SharedLock lock(mutex_); |
6177 | 47.5k | result.reserve(table_ids_map_->size()); |
6178 | 1.35M | for (const auto& e : *table_ids_map_) { |
6179 | 1.35M | result.push_back(e.second); |
6180 | 1.35M | } |
6181 | 47.5k | } |
6182 | 47.5k | switch (mode) { |
6183 | 2 | case GetTablesMode::kAll: |
6184 | 2 | return result; |
6185 | 160 | case GetTablesMode::kRunning: { |
6186 | 3.44k | auto filter = [](const TableInfoPtr& table_info) { return !table_info->is_running(); }; |
6187 | 160 | EraseIf(filter, &result); |
6188 | 160 | return result; |
6189 | 0 | } |
6190 | 47.4k | case GetTablesMode::kVisibleToClient: { |
6191 | 1.35M | auto filter = [](const TableInfoPtr& table_info) { |
6192 | 1.35M | return !table_info->LockForRead()->visible_to_client(); |
6193 | 1.35M | }; |
6194 | 47.4k | EraseIf(filter, &result); |
6195 | 47.4k | return result; |
6196 | 0 | } |
6197 | 47.5k | } |
6198 | 0 | FATAL_INVALID_ENUM_VALUE(GetTablesMode, mode); |
6199 | 0 | } |
6200 | | |
6201 | | void CatalogManager::GetAllNamespaces(std::vector<scoped_refptr<NamespaceInfo>>* namespaces, |
6202 | 13.6k | bool includeOnlyRunningNamespaces) { |
6203 | 13.6k | namespaces->clear(); |
6204 | 13.6k | SharedLock lock(mutex_); |
6205 | 56.0k | for (const NamespaceInfoMap::value_type& e : namespace_ids_map_) { |
6206 | 56.0k | if (includeOnlyRunningNamespaces && e.second->state() != SysNamespaceEntryPB::RUNNING55.8k ) { |
6207 | 7 | continue; |
6208 | 7 | } |
6209 | 56.0k | namespaces->push_back(e.second); |
6210 | 56.0k | } |
6211 | 13.6k | } |
6212 | | |
6213 | 13.9k | void CatalogManager::GetAllUDTypes(std::vector<scoped_refptr<UDTypeInfo>>* types) { |
6214 | 13.9k | types->clear(); |
6215 | 13.9k | SharedLock lock(mutex_); |
6216 | 13.9k | for (const UDTypeInfoMap::value_type& e : udtype_ids_map_) { |
6217 | 227 | types->push_back(e.second); |
6218 | 227 | } |
6219 | 13.9k | } |
6220 | | |
6221 | 3 | std::vector<std::shared_ptr<MonitoredTask>> CatalogManager::GetRecentTasks() { |
6222 | 3 | return tasks_tracker_->GetTasks(); |
6223 | 3 | } |
6224 | | |
6225 | 0 | std::vector<std::shared_ptr<MonitoredTask>> CatalogManager::GetRecentJobs() { |
6226 | 0 | return jobs_tracker_->GetTasks(); |
6227 | 0 | } |
6228 | | |
6229 | 25.2k | NamespaceName CatalogManager::GetNamespaceNameUnlocked(const NamespaceId& id) const { |
6230 | 25.2k | const scoped_refptr<NamespaceInfo> ns = FindPtrOrNull(namespace_ids_map_, id); |
6231 | 25.2k | return ns == nullptr ? NamespaceName()4 : ns->name()25.2k ; |
6232 | 25.2k | } |
6233 | | |
6234 | 37 | NamespaceName CatalogManager::GetNamespaceName(const NamespaceId& id) const { |
6235 | 37 | TRACE("Acquired catalog manager lock"); |
6236 | 37 | SharedLock lock(mutex_); |
6237 | 37 | return GetNamespaceNameUnlocked(id); |
6238 | 37 | } |
6239 | | |
6240 | | NamespaceName CatalogManager::GetNamespaceNameUnlocked( |
6241 | 0 | const scoped_refptr<TableInfo>& table) const { |
6242 | 0 | return GetNamespaceNameUnlocked(table->namespace_id()); |
6243 | 0 | } |
6244 | | |
6245 | 0 | NamespaceName CatalogManager::GetNamespaceName(const scoped_refptr<TableInfo>& table) const { |
6246 | 0 | return GetNamespaceName(table->namespace_id()); |
6247 | 0 | } |
6248 | | |
6249 | 239M | bool CatalogManager::IsSystemTable(const TableInfo& table) const { |
6250 | 239M | return table.is_system(); |
6251 | 239M | } |
6252 | | |
6253 | | // True if table is created by user. |
6254 | | // Table can be regular table or index in this case. |
6255 | 965 | bool CatalogManager::IsUserCreatedTable(const TableInfo& table) const { |
6256 | 965 | SharedLock lock(mutex_); |
6257 | 965 | return IsUserCreatedTableUnlocked(table); |
6258 | 965 | } |
6259 | | |
6260 | 588k | bool CatalogManager::IsUserCreatedTableUnlocked(const TableInfo& table) const { |
6261 | 588k | if (table.GetTableType() == PGSQL_TABLE_TYPE || table.GetTableType() == YQL_TABLE_TYPE59.8k ) { |
6262 | 586k | if (!IsSystemTable(table) && !IsSequencesSystemTable(table)25.2k && |
6263 | 586k | GetNamespaceNameUnlocked(table.namespace_id()) != kSystemNamespaceName25.2k && |
6264 | 586k | !table.IsColocatedParentTable()24.7k && |
6265 | 586k | !table.IsTablegroupParentTable()24.7k ) { |
6266 | 24.7k | return true; |
6267 | 24.7k | } |
6268 | 586k | } |
6269 | 563k | return false; |
6270 | 588k | } |
6271 | | |
6272 | 216 | bool CatalogManager::IsUserTable(const TableInfo& table) const { |
6273 | 216 | SharedLock lock(mutex_); |
6274 | 216 | return IsUserTableUnlocked(table); |
6275 | 216 | } |
6276 | | |
6277 | 293k | bool CatalogManager::IsUserTableUnlocked(const TableInfo& table) const { |
6278 | 293k | return IsUserCreatedTableUnlocked(table) && table.indexed_table_id().empty()11.9k ; |
6279 | 293k | } |
6280 | | |
6281 | 36 | bool CatalogManager::IsUserIndex(const TableInfo& table) const { |
6282 | 36 | SharedLock lock(mutex_); |
6283 | 36 | return IsUserIndexUnlocked(table); |
6284 | 36 | } |
6285 | | |
6286 | 293k | bool CatalogManager::IsUserIndexUnlocked(const TableInfo& table) const { |
6287 | 293k | return IsUserCreatedTableUnlocked(table) && !table.indexed_table_id().empty()12.0k ; |
6288 | 293k | } |
6289 | | |
6290 | 5 | bool CatalogManager::IsTablegroupParentTableId(const TableId& table_id) const { |
6291 | 5 | return table_id.find(kTablegroupParentTableIdSuffix) != std::string::npos; |
6292 | 5 | } |
6293 | | |
6294 | 5 | bool CatalogManager::IsColocatedParentTableId(const TableId& table_id) const { |
6295 | 5 | return table_id.find(kColocatedParentTableIdSuffix) != std::string::npos; |
6296 | 5 | } |
6297 | | |
6298 | 25.2k | bool CatalogManager::IsSequencesSystemTable(const TableInfo& table) const { |
6299 | 25.2k | if (table.GetTableType() == PGSQL_TABLE_TYPE && !table.IsColocatedParentTable()22.3k |
6300 | 25.2k | && !table.IsTablegroupParentTable()22.3k ) { |
6301 | | // This case commonly occurs during unit testing. Avoid unnecessary assert within Get(). |
6302 | 22.3k | if (!IsPgsqlId(table.namespace_id()) || !IsPgsqlId(table.id())22.3k ) { |
6303 | 4 | LOG(WARNING) << "Not PGSQL IDs " << table.namespace_id() << ", " << table.id(); |
6304 | 4 | return false; |
6305 | 4 | } |
6306 | 22.3k | Result<uint32_t> database_oid = GetPgsqlDatabaseOid(table.namespace_id()); |
6307 | 22.3k | if (!database_oid.ok()) { |
6308 | 0 | LOG(WARNING) << "Invalid Namespace ID " << table.namespace_id(); |
6309 | 0 | return false; |
6310 | 0 | } |
6311 | 22.3k | Result<uint32_t> table_oid = GetPgsqlTableOid(table.id()); |
6312 | 22.3k | if (!table_oid.ok()) { |
6313 | 0 | LOG(WARNING) << "Invalid Table ID " << table.id(); |
6314 | 0 | return false; |
6315 | 0 | } |
6316 | 22.3k | if (*database_oid == kPgSequencesDataDatabaseOid && *table_oid == kPgSequencesDataTableOid0 ) { |
6317 | 0 | return true; |
6318 | 0 | } |
6319 | 22.3k | } |
6320 | 25.2k | return false; |
6321 | 25.2k | } |
6322 | | |
6323 | | void CatalogManager::NotifyTabletDeleteFinished(const TabletServerId& tserver_uuid, |
6324 | | const TabletId& tablet_id, |
6325 | 75.8k | const TableInfoPtr& table) { |
6326 | 75.8k | shared_ptr<TSDescriptor> ts_desc; |
6327 | 75.8k | if (!master_->ts_manager()->LookupTSByUUID(tserver_uuid, &ts_desc)) { |
6328 | 1 | LOG(WARNING) << "Unable to find tablet server " << tserver_uuid; |
6329 | 75.8k | } else if (!ts_desc->IsTabletDeletePending(tablet_id)) { |
6330 | 1.86k | LOG(WARNING) << "Pending delete for tablet " << tablet_id << " in ts " |
6331 | 1.86k | << tserver_uuid << " doesn't exist"; |
6332 | 73.9k | } else { |
6333 | 73.9k | LOG(INFO) << "Clearing pending delete for tablet " << tablet_id << " in ts " << tserver_uuid; |
6334 | 73.9k | ts_desc->ClearPendingTabletDelete(tablet_id); |
6335 | 73.9k | } |
6336 | 75.8k | CheckTableDeleted(table); |
6337 | 75.8k | } |
6338 | | |
6339 | | bool CatalogManager::ReplicaMapDiffersFromConsensusState(const scoped_refptr<TabletInfo>& tablet, |
6340 | 373k | const ConsensusStatePB& cstate) { |
6341 | 373k | auto locs = tablet->GetReplicaLocations(); |
6342 | 373k | if (locs->size() != implicit_cast<size_t>(cstate.config().peers_size())) { |
6343 | 48.8k | return true; |
6344 | 48.8k | } |
6345 | 1.32M | for (auto iter = cstate.config().peers().begin(); 324k iter != cstate.config().peers().end(); iter++1.00M ) { |
6346 | 1.00M | if (locs->find(iter->permanent_uuid()) == locs->end()) { |
6347 | 0 | return true; |
6348 | 0 | } |
6349 | 1.00M | } |
6350 | 324k | return false; |
6351 | 324k | } |
6352 | | |
6353 | | namespace { |
6354 | | |
6355 | 899k | int64_t GetCommittedConsensusStateOpIdIndex(const ReportedTabletPB& report) { |
6356 | 899k | if (!report.has_committed_consensus_state() || |
6357 | 899k | !report.committed_consensus_state().config().has_opid_index()896k ) { |
6358 | 3.54k | return consensus::kInvalidOpIdIndex; |
6359 | 3.54k | } |
6360 | | |
6361 | 896k | return report.committed_consensus_state().config().opid_index(); |
6362 | 899k | } |
6363 | | |
6364 | | } // namespace |
6365 | | |
6366 | | bool CatalogManager::ProcessCommittedConsensusState( |
6367 | | TSDescriptor* ts_desc, |
6368 | | bool is_incremental, |
6369 | | const ReportedTabletPB& report, |
6370 | | const TableInfo::WriteLock& table_lock, |
6371 | | const TabletInfoPtr& tablet, |
6372 | | const TabletInfo::WriteLock& tablet_lock, |
6373 | 450k | std::vector<RetryingTSRpcTaskPtr>* rpcs) { |
6374 | 450k | const ConsensusStatePB& prev_cstate = tablet_lock->pb.committed_consensus_state(); |
6375 | 450k | ConsensusStatePB cstate = report.committed_consensus_state(); |
6376 | 450k | bool tablet_was_mutated = false; |
6377 | | |
6378 | | // 6a. The master only processes reports for replicas with committed |
6379 | | // consensus configurations since it needs the committed index to only |
6380 | | // cache the most up-to-date config. Since it's possible for TOMBSTONED |
6381 | | // replicas with no ConsensusMetadata on disk to be reported as having no |
6382 | | // committed config opid_index, we skip over those replicas. |
6383 | 450k | if (!cstate.config().has_opid_index()) { |
6384 | 0 | LOG(WARNING) << "Missing opid_index in reported config: " << report.ShortDebugString(); |
6385 | 0 | return false; |
6386 | 0 | } |
6387 | 450k | if (PREDICT_TRUE(FLAGS_master_ignore_stale_cstate) && |
6388 | 450k | (cstate.current_term() < prev_cstate.current_term() || |
6389 | 450k | GetCommittedConsensusStateOpIdIndex(report) < prev_cstate.config().opid_index()445k )) { |
6390 | 12.9k | LOG(WARNING) << "Stale heartbeat for Tablet " << tablet->ToString() |
6391 | 12.9k | << " on TS " << ts_desc->permanent_uuid() |
6392 | 12.9k | << "cstate=" << cstate.ShortDebugString() |
6393 | 12.9k | << ", prev_cstate=" << prev_cstate.ShortDebugString(); |
6394 | 12.9k | return false; |
6395 | 12.9k | } |
6396 | | |
6397 | | // 6b. Disregard the leader state if the reported leader is not a member |
6398 | | // of the committed config. |
6399 | 437k | if (cstate.leader_uuid().empty() || |
6400 | 437k | !IsRaftConfigMember(cstate.leader_uuid(), cstate.config())276k ) { |
6401 | 160k | cstate.clear_leader_uuid(); |
6402 | 160k | tablet_was_mutated = true; |
6403 | 160k | } |
6404 | | |
6405 | | // 6c. Mark the tablet as RUNNING if it makes sense to do so. |
6406 | | // |
6407 | | // We need to wait for a leader before marking a tablet as RUNNING, or |
6408 | | // else we could incorrectly consider a tablet created when only a |
6409 | | // minority of its replicas were successful. In that case, the tablet |
6410 | | // would be stuck in this bad state forever. |
6411 | | // - FLAG added to avoid waiting during mock tests. |
6412 | 437k | if (!tablet_lock->is_running() && |
6413 | 437k | report.state() == tablet::RUNNING205k && |
6414 | 437k | (205k cstate.has_leader_uuid()205k || |
6415 | 205k | !FLAGS_catalog_manager_wait_for_new_tablets_to_elect_leader157k )) { |
6416 | 47.9k | DCHECK_EQ(SysTabletsEntryPB::CREATING, tablet_lock->pb.state()) |
6417 | 0 | << "Tablet in unexpected state: " << tablet->ToString() |
6418 | 0 | << ": " << tablet_lock->pb.ShortDebugString(); |
6419 | 47.9k | VLOG(1) << "Tablet " << tablet->ToString() << " is now online"0 ; |
6420 | 47.9k | tablet_lock.mutable_data()->set_state(SysTabletsEntryPB::RUNNING, |
6421 | 47.9k | "Tablet reported with an active leader"); |
6422 | 47.9k | tablet_was_mutated = true; |
6423 | 47.9k | } |
6424 | | |
6425 | | // 6d. Update the consensus state if: |
6426 | | // - A config change operation was committed (reflected by a change to |
6427 | | // the committed config's opid_index). |
6428 | | // - The new cstate has a leader, and either the old cstate didn't, or |
6429 | | // there was a term change. |
6430 | 437k | if (cstate.config().opid_index() > prev_cstate.config().opid_index() || |
6431 | 437k | (432k cstate.has_leader_uuid()432k && |
6432 | 432k | (271k !prev_cstate.has_leader_uuid()271k || |
6433 | 271k | cstate.current_term() > prev_cstate.current_term()223k ))) { |
6434 | | |
6435 | | // 6d(i). Retain knowledge of the leader even if it wasn't reported in |
6436 | | // the latest config. |
6437 | | // |
6438 | | // When a config change is reported to the master, it may not include the |
6439 | | // leader because the follower doing the reporting may not know who the |
6440 | | // leader is yet (it may have just started up). It is safe to reuse |
6441 | | // the previous leader if the reported cstate has the same term as the |
6442 | | // previous cstate, and the leader was known for that term. |
6443 | 64.5k | if (cstate.current_term() == prev_cstate.current_term()) { |
6444 | 5.64k | if (!cstate.has_leader_uuid() && prev_cstate.has_leader_uuid()2 ) { |
6445 | 2 | cstate.set_leader_uuid(prev_cstate.leader_uuid()); |
6446 | | // Sanity check to detect consensus divergence bugs. |
6447 | 5.64k | } else if (cstate.has_leader_uuid() && prev_cstate.has_leader_uuid() && |
6448 | 5.64k | cstate.leader_uuid() != prev_cstate.leader_uuid()5.64k ) { |
6449 | 0 | string msg = Substitute("Previously reported cstate for tablet $0 gave " |
6450 | 0 | "a different leader for term $1 than the current cstate. " |
6451 | 0 | "Previous cstate: $2. Current cstate: $3.", |
6452 | 0 | tablet->ToString(), cstate.current_term(), |
6453 | 0 | prev_cstate.ShortDebugString(), cstate.ShortDebugString()); |
6454 | 0 | LOG(DFATAL) << msg; |
6455 | 0 | return false; |
6456 | 0 | } |
6457 | 5.64k | } |
6458 | | |
6459 | | // 6d(ii). Delete any replicas from the previous config that are not in the new one. |
6460 | 64.5k | if (64.5k FLAGS_master_tombstone_evicted_tablet_replicas64.5k ) { |
6461 | 64.5k | std::unordered_set<string> current_member_uuids; |
6462 | 194k | for (const consensus::RaftPeerPB &peer : cstate.config().peers()) { |
6463 | 194k | InsertOrDie(¤t_member_uuids, peer.permanent_uuid()); |
6464 | 194k | } |
6465 | 194k | for (const consensus::RaftPeerPB &prev_peer : prev_cstate.config().peers()) { |
6466 | 194k | const string& peer_uuid = prev_peer.permanent_uuid(); |
6467 | 194k | if (!ContainsKey(current_member_uuids, peer_uuid)) { |
6468 | | // Don't delete a tablet server that hasn't reported in yet (Bootstrapping). |
6469 | 1.72k | shared_ptr<TSDescriptor> dummy_ts_desc; |
6470 | 1.72k | if (!master_->ts_manager()->LookupTSByUUID(peer_uuid, &dummy_ts_desc)) { |
6471 | 12 | continue; |
6472 | 12 | } |
6473 | | // Otherwise, the TabletServer needs to remove this peer. |
6474 | 1.71k | rpcs->push_back(std::make_shared<AsyncDeleteReplica>( |
6475 | 1.71k | master_, AsyncTaskPool(), peer_uuid, tablet->table(), tablet->tablet_id(), |
6476 | 1.71k | TABLET_DATA_TOMBSTONED, prev_cstate.config().opid_index(), |
6477 | 1.71k | Substitute("TS $0 not found in new config with opid_index $1", |
6478 | 1.71k | peer_uuid, cstate.config().opid_index()))); |
6479 | 1.71k | } |
6480 | 194k | } |
6481 | 64.5k | } |
6482 | | // 6d(iii). Update the in-memory ReplicaLocations for this tablet using the new config. |
6483 | 64.5k | VLOG(2) << "Updating replicas for tablet " << tablet->tablet_id() |
6484 | 4 | << " using config reported by " << ts_desc->permanent_uuid() |
6485 | 4 | << " to that committed in log index " << cstate.config().opid_index() |
6486 | 4 | << " with leader state from term " << cstate.current_term(); |
6487 | 64.5k | ReconcileTabletReplicasInLocalMemoryWithReport( |
6488 | 64.5k | tablet, ts_desc->permanent_uuid(), cstate, report); |
6489 | | |
6490 | | // 6d(iv). Update the consensus state. Don't use 'prev_cstate' after this. |
6491 | 64.5k | LOG(INFO) << "Tablet: " << tablet->tablet_id() << " reported consensus state change." |
6492 | 64.5k | << " New consensus state: " << cstate.ShortDebugString() |
6493 | 64.5k | << " from " << ts_desc->permanent_uuid(); |
6494 | 64.5k | *tablet_lock.mutable_data()->pb.mutable_committed_consensus_state() = cstate; |
6495 | 64.5k | tablet_was_mutated = true; |
6496 | 373k | } else { |
6497 | | // Report opid_index is equal to the previous opid_index. If some |
6498 | | // replica is reporting the same consensus configuration we already know about, but we |
6499 | | // haven't yet heard from all the tservers in the config, update the in-memory |
6500 | | // ReplicaLocations. |
6501 | 373k | LOG(INFO) << "Peer " << ts_desc->permanent_uuid() << " sent " |
6502 | 373k | << (is_incremental ? "incremental"370k : "full tablet"2.47k ) |
6503 | 373k | << " report for " << tablet->tablet_id() |
6504 | 373k | << ", prev state op id: " << prev_cstate.config().opid_index() |
6505 | 373k | << ", prev state term: " << prev_cstate.current_term() |
6506 | 373k | << ", prev state has_leader_uuid: " << prev_cstate.has_leader_uuid() |
6507 | 373k | << ". Consensus state: " << cstate.ShortDebugString(); |
6508 | 373k | if (GetAtomicFlag(&FLAGS_enable_register_ts_from_raft) && |
6509 | 373k | ReplicaMapDiffersFromConsensusState(tablet, cstate)373k ) { |
6510 | 48.8k | ReconcileTabletReplicasInLocalMemoryWithReport( |
6511 | 48.8k | tablet, ts_desc->permanent_uuid(), cstate, report); |
6512 | 324k | } else { |
6513 | 324k | UpdateTabletReplicaInLocalMemory(ts_desc, &cstate, report, tablet); |
6514 | 324k | } |
6515 | 373k | } |
6516 | | |
6517 | 437k | if (FLAGS_use_create_table_leader_hint && |
6518 | 437k | !cstate.has_leader_uuid()436k && cstate.current_term() == 0160k ) { |
6519 | 153k | StartElectionIfReady(cstate, tablet.get()); |
6520 | 153k | } |
6521 | | |
6522 | | // 7. Send an AlterSchema RPC if the tablet has an old schema version. |
6523 | 437k | if (report.has_schema_version() && |
6524 | 437k | report.schema_version() != table_lock->pb.version()437k ) { |
6525 | 344 | if (report.schema_version() > table_lock->pb.version()) { |
6526 | 0 | LOG(ERROR) << "TS " << ts_desc->permanent_uuid() |
6527 | 0 | << " has reported a schema version greater than the current one " |
6528 | 0 | << " for tablet " << tablet->ToString() |
6529 | 0 | << ". Expected version " << table_lock->pb.version() |
6530 | 0 | << " got " << report.schema_version() |
6531 | 0 | << " (corruption)"; |
6532 | 344 | } else { |
6533 | | // TODO: For Alter (rolling apply to tablets), this is an expected transitory state. |
6534 | 344 | LOG(INFO) << "TS " << ts_desc->permanent_uuid() |
6535 | 344 | << " does not have the latest schema for tablet " << tablet->ToString() |
6536 | 344 | << ". Expected version " << table_lock->pb.version() |
6537 | 344 | << " got " << report.schema_version(); |
6538 | 344 | } |
6539 | | // It's possible that the tablet being reported is a laggy replica, and in fact |
6540 | | // the leader has already received an AlterTable RPC. That's OK, though -- |
6541 | | // it'll safely ignore it if we send another. |
6542 | 344 | TransactionId txn_id = TransactionId::Nil(); |
6543 | 344 | if (table_lock->pb.has_transaction() && |
6544 | 344 | table_lock->pb.transaction().has_transaction_id()39 ) { |
6545 | 39 | LOG(INFO) << "Parsing transaction ID for tablet ID " << tablet->tablet_id(); |
6546 | 39 | auto txn_id_res = FullyDecodeTransactionId(table_lock->pb.transaction().transaction_id()); |
6547 | 39 | if (!txn_id_res.ok()) { |
6548 | 0 | LOG(WARNING) << "Parsing transaction ID failed for tablet ID " << tablet->tablet_id(); |
6549 | 0 | return false; |
6550 | 0 | } |
6551 | 39 | txn_id = txn_id_res.get(); |
6552 | 39 | } |
6553 | 344 | LOG(INFO) << "Triggering AlterTable with transaction ID " << txn_id |
6554 | 344 | << " due to heartbeat delay for tablet ID " << tablet->tablet_id(); |
6555 | 344 | rpcs->push_back(std::make_shared<AsyncAlterTable>( |
6556 | 344 | master_, AsyncTaskPool(), tablet, tablet->table(), txn_id)); |
6557 | 344 | } |
6558 | | |
6559 | 437k | return tablet_was_mutated; |
6560 | 437k | } |
6561 | | |
6562 | | Status CatalogManager::ProcessTabletReportBatch( |
6563 | | TSDescriptor* ts_desc, |
6564 | | bool is_incremental, |
6565 | | ReportedTablets::const_iterator begin, |
6566 | | ReportedTablets::const_iterator end, |
6567 | | TabletReportUpdatesPB* full_report_update, |
6568 | 453k | std::vector<RetryingTSRpcTaskPtr>* rpcs) { |
6569 | | // 1. First Pass. Iterate in TabletId Order to discover all Table locks we'll need. Even though |
6570 | | // read locks are sufficient here, take write locks since we'll be writing to the tablet while |
6571 | | // holding this. |
6572 | | // Need to acquire both types of locks in Id order to prevent deadlock. |
6573 | 453k | std::map<TableId, TableInfo::WriteLock> table_write_locks; |
6574 | 906k | for (auto it = begin; it != end; ++it452k ) { |
6575 | 452k | auto& lock = table_write_locks[it->info->table()->id()]; |
6576 | 452k | if (!lock.locked()) { |
6577 | 451k | lock = it->info->table()->LockForWrite(); |
6578 | 451k | } |
6579 | 452k | } |
6580 | | |
6581 | 453k | map<TabletId, TabletInfo::WriteLock> tablet_write_locks; // used for unlock. |
6582 | | // 2. Second Pass. Process each tablet. This may not be in the order that the tablets |
6583 | | // appear in 'full_report', but that has no bearing on correctness. |
6584 | 453k | vector<TabletInfo*> mutated_tablets; // refcount protected by 'tablet_infos' |
6585 | 908k | for (auto it = begin; it != end; ++it454k ) { |
6586 | 454k | const auto& tablet_id = it->tablet_id; |
6587 | 454k | const TabletInfoPtr& tablet = it->info; |
6588 | 454k | const ReportedTabletPB& report = *it->report; |
6589 | 454k | const TableInfoPtr& table = tablet->table(); |
6590 | | |
6591 | | // Prepare an heartbeat response entry for this tablet, now that we're going to process it. |
6592 | | // Every tablet in the report that is processed gets one, even if there are no changes to it. |
6593 | 454k | ReportedTabletUpdatesPB* update = full_report_update->add_tablets(); |
6594 | 454k | update->set_tablet_id(tablet_id); |
6595 | | |
6596 | | // Get tablet lock on demand. This works in the batch case because the loop is ordered. |
6597 | 454k | tablet_write_locks[tablet_id] = tablet->LockForWrite(); |
6598 | 454k | auto& table_lock = table_write_locks[table->id()]; |
6599 | 454k | auto& tablet_lock = tablet_write_locks[tablet_id]; |
6600 | | |
6601 | 454k | TRACE_EVENT1("master", "HandleReportedTablet", "tablet_id", report.tablet_id()); |
6602 | 454k | RETURN_NOT_OK_PREPEND(CheckIsLeaderAndReady(), |
6603 | 454k | Substitute("This master is no longer the leader, unable to handle report for tablet $0", |
6604 | 454k | tablet_id)); |
6605 | | |
6606 | 18.4E | VLOG(3) << "tablet report: " << report.ShortDebugString(); |
6607 | | |
6608 | | // 3. Delete the tablet if it (or its table) have been deleted. |
6609 | 454k | if (tablet_lock->is_deleted() || |
6610 | 454k | table_lock->started_deleting()) { |
6611 | 98 | const string msg = tablet_lock->pb.state_msg(); |
6612 | 98 | update->set_state_msg(msg); |
6613 | 98 | LOG(INFO) << "Got report from deleted tablet " << tablet->ToString() |
6614 | 98 | << " (" << msg << "): Sending delete request for this tablet"; |
6615 | | // TODO(unknown): Cancel tablet creation, instead of deleting, in cases |
6616 | | // where that might be possible (tablet creation timeout & replacement). |
6617 | 98 | rpcs->push_back(std::make_shared<AsyncDeleteReplica>( |
6618 | 98 | master_, AsyncTaskPool(), ts_desc->permanent_uuid(), table, tablet_id, |
6619 | 98 | TABLET_DATA_DELETED, boost::none, msg)); |
6620 | 98 | ts_desc->AddPendingTabletDelete(tablet_id); |
6621 | 98 | continue; |
6622 | 98 | } |
6623 | | |
6624 | 454k | if (!table_lock->is_running()) { |
6625 | 0 | const string msg = tablet_lock->pb.state_msg(); |
6626 | 0 | LOG(INFO) << "Got report from tablet " << tablet->tablet_id() |
6627 | 0 | << " for non-running table " << table->ToString() << ": " << msg; |
6628 | 0 | update->set_state_msg(msg); |
6629 | 0 | continue; |
6630 | 0 | } |
6631 | | |
6632 | | // 3. Tombstone a replica that is no longer part of the Raft config (and |
6633 | | // not already tombstoned or deleted outright). |
6634 | | // |
6635 | | // If the report includes a committed raft config, we only tombstone if |
6636 | | // the opid_index is strictly less than the latest reported committed |
6637 | | // config. This prevents us from spuriously deleting replicas that have |
6638 | | // just been added to the committed config and are in the process of copying. |
6639 | 454k | const ConsensusStatePB& prev_cstate = tablet_lock->pb.committed_consensus_state(); |
6640 | 454k | const int64_t prev_opid_index = prev_cstate.config().opid_index(); |
6641 | 454k | const int64_t report_opid_index = GetCommittedConsensusStateOpIdIndex(report); |
6642 | 454k | if (FLAGS_master_tombstone_evicted_tablet_replicas && |
6643 | 454k | report.tablet_data_state() != TABLET_DATA_TOMBSTONED && |
6644 | 454k | report.tablet_data_state() != TABLET_DATA_DELETED && |
6645 | 454k | report_opid_index < prev_opid_index && |
6646 | 454k | !IsRaftConfigMember(ts_desc->permanent_uuid(), prev_cstate.config())11.0k ) { |
6647 | 194 | const string delete_msg = (report_opid_index == consensus::kInvalidOpIdIndex) ? |
6648 | 29 | "Replica has no consensus available" : |
6649 | 194 | Substitute("Replica with old config index $0", report_opid_index)165 ; |
6650 | 194 | rpcs->push_back(std::make_shared<AsyncDeleteReplica>( |
6651 | 194 | master_, AsyncTaskPool(), ts_desc->permanent_uuid(), table, tablet_id, |
6652 | 194 | TABLET_DATA_TOMBSTONED, prev_opid_index, |
6653 | 194 | Substitute("$0 (current committed config index is $1)", |
6654 | 194 | delete_msg, prev_opid_index))); |
6655 | 194 | ts_desc->AddPendingTabletDelete(tablet_id); |
6656 | 194 | continue; |
6657 | 194 | } |
6658 | | |
6659 | | // 4. Skip a non-deleted tablet which reports an error. |
6660 | 454k | if (report.has_error()) { |
6661 | 0 | Status s = StatusFromPB(report.error()); |
6662 | 0 | DCHECK(!s.ok()); |
6663 | 0 | DCHECK_EQ(report.state(), tablet::FAILED); |
6664 | 0 | LOG(WARNING) << "Tablet " << tablet->ToString() << " has failed on TS " |
6665 | 0 | << ts_desc->permanent_uuid() << ": " << s.ToString(); |
6666 | 0 | continue; |
6667 | 0 | } |
6668 | | |
6669 | | // Hide the tablet if it (or its table) has been hidden and the tablet hasn't. |
6670 | 454k | if ((tablet_lock->is_hidden() || |
6671 | 454k | table_lock->started_hiding()) && |
6672 | 454k | report.has_is_hidden()0 && |
6673 | 454k | !report.is_hidden()0 ) { |
6674 | 0 | const string msg = tablet_lock->pb.state_msg(); |
6675 | 0 | LOG(INFO) << "Got report from hidden tablet " << tablet->ToString() |
6676 | 0 | << " (" << msg << "): Sending hide request for this tablet"; |
6677 | 0 | auto task = std::make_shared<AsyncDeleteReplica>( |
6678 | 0 | master_, AsyncTaskPool(), ts_desc->permanent_uuid(), table, tablet_id, |
6679 | 0 | TABLET_DATA_DELETED, boost::none, msg); |
6680 | 0 | task->set_hide_only(true); |
6681 | 0 | ts_desc->AddPendingTabletDelete(tablet_id); |
6682 | 0 | rpcs->push_back(task); |
6683 | 0 | } |
6684 | | |
6685 | | // 5. Process the report's consensus state. |
6686 | | // The report will not have a committed_consensus_state if it is in the |
6687 | | // middle of starting up, such as during tablet bootstrap. |
6688 | | // If we received an incremental report, and the tablet is starting up, we will update the |
6689 | | // replica so that the balancer knows how many tablets are in the middle of remote bootstrap. |
6690 | 454k | if (report.has_committed_consensus_state()) { |
6691 | 450k | if (ProcessCommittedConsensusState( |
6692 | 450k | ts_desc, is_incremental, report, table_lock, tablet, tablet_lock, rpcs)) { |
6693 | | // 6. If the tablet was mutated, add it to the tablets to be re-persisted. |
6694 | | // |
6695 | | // Done here and not on a per-mutation basis to avoid duplicate entries. |
6696 | 225k | mutated_tablets.push_back(tablet.get()); |
6697 | 225k | } |
6698 | 450k | } else if (3.40k is_incremental3.40k && |
6699 | 3.41k | (report.state() == tablet::NOT_STARTED || report.state() == tablet::BOOTSTRAPPING613 )) { |
6700 | | // When a tablet server is restarted, it sends a full tablet report with all of its tablets |
6701 | | // in the NOT_STARTED state, so this would make the load balancer think that all the |
6702 | | // tablets are being remote bootstrapped at once, so only process incremental reports here. |
6703 | 3.41k | UpdateTabletReplicaInLocalMemory(ts_desc, nullptr /* consensus */, report, tablet); |
6704 | 3.41k | } |
6705 | 454k | } // Finished one round of batch processing. |
6706 | | |
6707 | | // 7. Unlock the tables; we no longer need to access their state. |
6708 | 454k | for (auto& l : table_write_locks)453k { |
6709 | 454k | l.second.Unlock(); |
6710 | 454k | } |
6711 | 453k | table_write_locks.clear(); |
6712 | | |
6713 | | // 8. Write all tablet mutations to the catalog table. |
6714 | | // |
6715 | | // SysCatalogTable::Write will short-circuit the case where the data has not |
6716 | | // in fact changed since the previous version and avoid any unnecessary mutations. |
6717 | 453k | if (!mutated_tablets.empty()) { |
6718 | 225k | Status s = sys_catalog_->Upsert(leader_ready_term(), mutated_tablets); |
6719 | 225k | if (!s.ok()) { |
6720 | 1 | LOG(WARNING) << "Error updating tablets: " << s; |
6721 | 1 | return s; |
6722 | 1 | } |
6723 | 225k | } |
6724 | | // Filter the mutated tablets to find which tablets were modified. Need to actually commit the |
6725 | | // state of the tablets before updating the system.partitions table, so get this first. |
6726 | 453k | vector<TabletInfoPtr> yql_partitions_mutated_tablets = |
6727 | 453k | VERIFY_RESULT(GetYqlPartitionsVtable().FilterRelevantTablets(mutated_tablets)); |
6728 | | |
6729 | | // 9. Publish the in-memory tablet mutations and release the locks. |
6730 | 454k | for (auto& l : tablet_write_locks) { |
6731 | 454k | l.second.Commit(); |
6732 | 454k | } |
6733 | 453k | tablet_write_locks.clear(); |
6734 | | |
6735 | | // Update the relevant tablet entries in system.partitions. |
6736 | 453k | if (!yql_partitions_mutated_tablets.empty()) { |
6737 | 21.1k | Status s = GetYqlPartitionsVtable() |
6738 | 21.1k | .ProcessMutatedTablets(yql_partitions_mutated_tablets, tablet_write_locks); |
6739 | 21.1k | } |
6740 | | |
6741 | | // 10. Third Pass. Process all tablet schema version changes. |
6742 | | // (This is separate from tablet state mutations because only table on-disk state is changed.) |
6743 | 908k | for (auto it = begin; it != end; ++it454k ) { |
6744 | 454k | const ReportedTabletPB& report = *it->report; |
6745 | 454k | if (!report.has_schema_version()) { |
6746 | 0 | continue; |
6747 | 0 | } |
6748 | 454k | const TabletInfoPtr& tablet = it->info; |
6749 | 454k | auto leader = tablet->GetLeader(); |
6750 | 454k | if (leader.ok() && leader.get()->permanent_uuid() == ts_desc->permanent_uuid()296k ) { |
6751 | 65.6k | RETURN_NOT_OK(HandleTabletSchemaVersionReport(tablet.get(), report.schema_version())); |
6752 | 65.6k | } |
6753 | 454k | } |
6754 | | |
6755 | 453k | return Status::OK(); |
6756 | 453k | } |
6757 | | |
6758 | | Status CatalogManager::ProcessTabletReport(TSDescriptor* ts_desc, |
6759 | | const TabletReportPB& full_report, |
6760 | | TabletReportUpdatesPB* full_report_update, |
6761 | 4.80M | RpcContext* rpc) { |
6762 | 4.80M | int num_tablets = full_report.updated_tablets_size(); |
6763 | 4.80M | TRACE_EVENT2("master", "ProcessTabletReport", |
6764 | 4.80M | "requestor", rpc->requestor_string(), |
6765 | 4.80M | "num_tablets", num_tablets); |
6766 | | |
6767 | 4.80M | VLOG_WITH_PREFIX2.50k (2) << "Received tablet report from " << RequestorString(rpc) << "(" |
6768 | 2.50k | << ts_desc->permanent_uuid() << "): " << full_report.DebugString(); |
6769 | | |
6770 | 4.80M | if (!ts_desc->has_tablet_report() && full_report.is_incremental()16.4k ) { |
6771 | 8.02k | LOG_WITH_PREFIX(WARNING) |
6772 | 8.02k | << "Invalid tablet report from " << ts_desc->permanent_uuid() |
6773 | 8.02k | << ": Received an incremental tablet report when a full one was needed"; |
6774 | | // We should respond with success in order to send reply that we need full report. |
6775 | 8.02k | return Status::OK(); |
6776 | 8.02k | } |
6777 | | |
6778 | | // TODO: on a full tablet report, we may want to iterate over the tablets we think |
6779 | | // the server should have, compare vs the ones being reported, and somehow mark |
6780 | | // any that have been "lost" (eg somehow the tablet metadata got corrupted or something). |
6781 | | |
6782 | 4.80M | ReportedTablets reported_tablets; |
6783 | | |
6784 | | // Tablet Deletes to process after the catalog lock below. |
6785 | 4.80M | set<TabletId> tablets_to_delete; |
6786 | | |
6787 | 4.80M | { |
6788 | | // Lock the catalog to iterate over tablet_ids_map_ & table_ids_map_. |
6789 | 4.80M | SharedLock lock(mutex_); |
6790 | | |
6791 | | // Fill the above variables before processing |
6792 | 4.80M | full_report_update->mutable_tablets()->Reserve(num_tablets); |
6793 | 4.80M | for (const ReportedTabletPB& report : full_report.updated_tablets()) { |
6794 | 456k | const string& tablet_id = report.tablet_id(); |
6795 | | |
6796 | | // 1a. Find the tablet, deleting/skipping it if it can't be found. |
6797 | 456k | scoped_refptr<TabletInfo> tablet = FindPtrOrNull(*tablet_map_, tablet_id); |
6798 | 456k | if (!tablet) { |
6799 | | // If a TS reported an unknown tablet, send a delete tablet rpc to the TS. |
6800 | 0 | LOG(INFO) << "Null tablet reported, possibly the TS was not around when the" |
6801 | 0 | " table was being deleted. Sending Delete tablet RPC to this TS."; |
6802 | 0 | tablets_to_delete.insert(tablet_id); |
6803 | | // Every tablet in the report that is processed gets a heartbeat response entry. |
6804 | 0 | ReportedTabletUpdatesPB* update = full_report_update->add_tablets(); |
6805 | 0 | update->set_tablet_id(tablet_id); |
6806 | 0 | continue; |
6807 | 0 | } |
6808 | 456k | if (!tablet->table() || FindOrNull(*table_ids_map_, tablet->table()->id()) == nullptr454k ) { |
6809 | 0 | auto table_id = tablet->table() == nullptr ? "(null)" : tablet->table()->id(); |
6810 | 0 | LOG(INFO) << "Got report from an orphaned tablet " << tablet_id << " on table " << table_id; |
6811 | 0 | tablets_to_delete.insert(tablet_id); |
6812 | | // Every tablet in the report that is processed gets a heartbeat response entry. |
6813 | 0 | ReportedTabletUpdatesPB* update = full_report_update->add_tablets(); |
6814 | 0 | update->set_tablet_id(tablet_id); |
6815 | 0 | continue; |
6816 | 0 | } |
6817 | | |
6818 | | // 1b. Found the tablet, update local state. |
6819 | 456k | reported_tablets.push_back(ReportedTablet { |
6820 | 456k | .tablet_id = tablet_id, |
6821 | 456k | .info = tablet, |
6822 | 456k | .report = &report, |
6823 | 456k | }); |
6824 | 456k | } |
6825 | 4.80M | } |
6826 | | |
6827 | 4.80M | std::sort(reported_tablets.begin(), reported_tablets.end(), [](const auto& lhs, const auto& rhs) { |
6828 | 206k | return lhs.tablet_id < rhs.tablet_id; |
6829 | 206k | }); |
6830 | | |
6831 | | // Process any delete requests from orphaned tablets, identified above. |
6832 | 4.80M | for (auto tablet_id : tablets_to_delete) { |
6833 | 0 | SendDeleteTabletRequest(tablet_id, TABLET_DATA_DELETED, boost::none, nullptr, ts_desc, |
6834 | 0 | "Report from an orphaned tablet"); |
6835 | 0 | } |
6836 | | |
6837 | | // Calculate the deadline for this expensive loop coming up. |
6838 | 4.80M | const auto safe_deadline = rpc->GetClientDeadline() - |
6839 | 4.80M | (FLAGS_heartbeat_rpc_timeout_ms * 1ms * FLAGS_heartbeat_safe_deadline_ratio); |
6840 | | |
6841 | | // Process tablets by batches. |
6842 | 5.25M | for (auto tablet_iter = reported_tablets.begin(); tablet_iter != reported_tablets.end();) { |
6843 | 453k | auto batch_begin = tablet_iter; |
6844 | 453k | tablet_iter += std::min<size_t>( |
6845 | 453k | reported_tablets.end() - tablet_iter, FLAGS_catalog_manager_report_batch_size); |
6846 | | |
6847 | | // Keeps track of all RPCs that should be sent when we're done with a single batch. |
6848 | 453k | std::vector<RetryingTSRpcTaskPtr> rpcs; |
6849 | 453k | auto status = ProcessTabletReportBatch( |
6850 | 453k | ts_desc, full_report.is_incremental(), batch_begin, tablet_iter, full_report_update, &rpcs); |
6851 | 453k | if (!status.ok()) { |
6852 | 2 | for (auto& rpc : rpcs) { |
6853 | 0 | rpc->AbortAndReturnPrevState(status); |
6854 | 0 | } |
6855 | 2 | return status; |
6856 | 2 | } |
6857 | | |
6858 | | // 13. Send all queued RPCs. |
6859 | 453k | for (auto& rpc : rpcs) { |
6860 | 2.34k | DCHECK(rpc->table()); |
6861 | 2.34k | rpc->table()->AddTask(rpc); |
6862 | 2.34k | WARN_NOT_OK(ScheduleTask(rpc), Substitute("Failed to send $0", rpc->description())); |
6863 | 2.34k | } |
6864 | 453k | rpcs.clear(); |
6865 | | |
6866 | | // 14. Check deadline. Need to exit before processing all batches if we're close to timing out. |
6867 | 453k | if (ts_desc->HasCapability(CAPABILITY_TabletReportLimit) && |
6868 | 454k | tablet_iter != reported_tablets.end()) { |
6869 | | // [TESTING] Inject latency before processing a batch to test deadline. |
6870 | 153k | if (PREDICT_FALSE(FLAGS_TEST_inject_latency_during_tablet_report_ms > 0)) { |
6871 | 0 | LOG(INFO) << "Sleeping in CatalogManager::ProcessTabletReport for " |
6872 | 0 | << FLAGS_TEST_inject_latency_during_tablet_report_ms << " ms"; |
6873 | 0 | SleepFor(MonoDelta::FromMilliseconds(FLAGS_TEST_inject_latency_during_tablet_report_ms)); |
6874 | 0 | } |
6875 | | |
6876 | | // Return from here at configured safe heartbeat deadline to give the response packet time. |
6877 | 153k | if (safe_deadline < CoarseMonoClock::Now()) { |
6878 | 919 | LOG(INFO) << "Reached Heartbeat deadline. Returning early after processing " |
6879 | 919 | << full_report_update->tablets_size() << " tablets"; |
6880 | 919 | full_report_update->set_processing_truncated(true); |
6881 | 919 | return Status::OK(); |
6882 | 919 | } |
6883 | 153k | } |
6884 | 453k | } // Loop to process the next batch until fully iterated. |
6885 | | |
6886 | 4.80M | if (!full_report.is_incremental()) { |
6887 | | // A full report may take multiple heartbeats. |
6888 | | // The TS communicates how much is left to process for the full report beyond this specific HB. |
6889 | 8.17k | bool completed_full_report = !full_report.has_remaining_tablet_count() |
6890 | 8.17k | || full_report.remaining_tablet_count() == 0; |
6891 | 8.17k | if (full_report.updated_tablets_size() == 0) { |
6892 | 7.97k | LOG(INFO) << ts_desc->permanent_uuid() << " sent full tablet report with 0 tablets."; |
6893 | 7.97k | } else if (206 !ts_desc->has_tablet_report()206 ) { |
6894 | 205 | LOG(INFO) << ts_desc->permanent_uuid() |
6895 | 205 | << (completed_full_report ? " finished" : " receiving"0 ) << " first full report: " |
6896 | 205 | << full_report.updated_tablets_size() << " tablets."; |
6897 | 205 | } |
6898 | | // We have a tablet report only once we're done processing all the chunks of the initial report. |
6899 | 8.17k | ts_desc->set_has_tablet_report(completed_full_report); |
6900 | 8.17k | } |
6901 | | |
6902 | | // 15. Queue background processing if we had updates. |
6903 | 4.80M | if (full_report.updated_tablets_size() > 0) { |
6904 | 301k | background_tasks_->WakeIfHasPendingUpdates(); |
6905 | 301k | } |
6906 | | |
6907 | 4.80M | return Status::OK(); |
6908 | 4.80M | } |
6909 | | |
6910 | | Status CatalogManager::CreateTablegroup(const CreateTablegroupRequestPB* req, |
6911 | | CreateTablegroupResponsePB* resp, |
6912 | 56 | rpc::RpcContext* rpc) { |
6913 | | |
6914 | 56 | CreateTableRequestPB ctreq; |
6915 | 56 | CreateTableResponsePB ctresp; |
6916 | | |
6917 | | // Sanity check for PB fields. |
6918 | 56 | if (!req->has_id() || !req->has_namespace_id() || !req->has_namespace_name()) { |
6919 | 0 | Status s = STATUS(InvalidArgument, "Improper CREATE TABLEGROUP request (missing fields)."); |
6920 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
6921 | 0 | } |
6922 | | |
6923 | | // Use the tablegroup id as the prefix for the parent table id. |
6924 | 56 | const auto parent_table_id = req->id() + kTablegroupParentTableIdSuffix; |
6925 | 56 | const auto parent_table_name = req->id() + kTablegroupParentTableNameSuffix; |
6926 | 56 | ctreq.set_name(parent_table_name); |
6927 | 56 | ctreq.set_table_id(parent_table_id); |
6928 | 56 | ctreq.mutable_namespace_()->set_name(req->namespace_name()); |
6929 | 56 | ctreq.mutable_namespace_()->set_id(req->namespace_id()); |
6930 | 56 | ctreq.set_table_type(PGSQL_TABLE_TYPE); |
6931 | 56 | ctreq.set_tablegroup_id(req->id()); |
6932 | 56 | ctreq.set_tablespace_id(req->tablespace_id()); |
6933 | | |
6934 | 56 | YBSchemaBuilder schemaBuilder; |
6935 | 56 | schemaBuilder.AddColumn("parent_column")->Type(BINARY)->PrimaryKey()->NotNull(); |
6936 | 56 | YBSchema ybschema; |
6937 | 56 | CHECK_OK(schemaBuilder.Build(&ybschema)); |
6938 | 56 | auto schema = yb::client::internal::GetSchema(ybschema); |
6939 | 56 | SchemaToPB(schema, ctreq.mutable_schema()); |
6940 | 56 | if (!FLAGS_TEST_tablegroup_master_only) { |
6941 | 55 | ctreq.mutable_schema()->mutable_table_properties()->set_is_transactional(true); |
6942 | 55 | } |
6943 | | |
6944 | | // Create a parent table, which will create the tablet. |
6945 | 56 | Status s = CreateTable(&ctreq, &ctresp, rpc); |
6946 | 56 | resp->set_parent_table_id(ctresp.table_id()); |
6947 | 56 | resp->set_parent_table_name(parent_table_name); |
6948 | | |
6949 | | // Carry over error. |
6950 | 56 | if (ctresp.has_error()) { |
6951 | 0 | resp->mutable_error()->Swap(ctresp.mutable_error()); |
6952 | 0 | } |
6953 | | |
6954 | | // We do not lock here so it is technically possible that the table was already created. |
6955 | | // If so, there is nothing to do so we just ignore the "AlreadyPresent" error. |
6956 | 56 | if (!s.ok() && !s.IsAlreadyPresent()2 ) { |
6957 | 2 | LOG(WARNING) << "Tablegroup creation failed: " << s.ToString(); |
6958 | 2 | return s; |
6959 | 2 | } |
6960 | | |
6961 | | // Update catalog manager maps |
6962 | 54 | LockGuard lock(mutex_); |
6963 | 54 | TRACE("Acquired catalog manager lock"); |
6964 | 54 | TablegroupInfo *tg = new TablegroupInfo(req->id(), req->namespace_id()); |
6965 | 54 | tablegroup_ids_map_[req->id()] = tg; |
6966 | 54 | table_tablegroup_ids_map_[parent_table_id] = tg->id(); |
6967 | | |
6968 | 54 | return s; |
6969 | 56 | } |
6970 | | |
6971 | | Status CatalogManager::DeleteTablegroup(const DeleteTablegroupRequestPB* req, |
6972 | | DeleteTablegroupResponsePB* resp, |
6973 | 40 | rpc::RpcContext* rpc) { |
6974 | 40 | LOG(INFO) << "Servicing DeleteTablegroup request from " << RequestorString(rpc) << ": " |
6975 | 40 | << req->ShortDebugString(); |
6976 | 40 | DeleteTableRequestPB dtreq; |
6977 | 40 | DeleteTableResponsePB dtresp; |
6978 | | |
6979 | | // Sanity check for PB fields |
6980 | 40 | if (!req->has_id() || !req->has_namespace_id()) { |
6981 | 0 | Status s = STATUS(InvalidArgument, "Improper DELETE TABLEGROUP request (missing fields)."); |
6982 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
6983 | 0 | } |
6984 | | |
6985 | | // Use the tablegroup id as the prefix for the parent table id. |
6986 | 40 | const auto parent_table_id = req->id() + kTablegroupParentTableIdSuffix; |
6987 | 40 | const auto parent_table_name = req->id() + kTablegroupParentTableNameSuffix; |
6988 | | |
6989 | 40 | dtreq.mutable_table()->set_table_name(parent_table_name); |
6990 | 40 | dtreq.mutable_table()->set_table_id(parent_table_id); |
6991 | 40 | dtreq.set_is_index_table(false); |
6992 | | |
6993 | 40 | { |
6994 | 40 | SharedLock lock(mutex_); |
6995 | 40 | const auto& tablegroup = tablegroup_ids_map_[req->id()]; |
6996 | | // Tablegroup should be empty. In practice that means it would contain only the |
6997 | | // dummy parent table. |
6998 | | // TODO(alex): Rework tablegroup internals to track real tables. |
6999 | 40 | if (tablegroup->NumChildTables() > 1) { |
7000 | 0 | return SetupError( |
7001 | 0 | resp->mutable_error(), |
7002 | 0 | MasterErrorPB::INVALID_REQUEST, |
7003 | 0 | STATUS_FORMAT(InvalidArgument, |
7004 | 0 | "Cannot delete tablegroup, it still has $0 tables in it", |
7005 | 0 | tablegroup->NumChildTables() - 1)); |
7006 | 0 | } |
7007 | 40 | } |
7008 | | |
7009 | 40 | Status s = DeleteTable(&dtreq, &dtresp, rpc); |
7010 | 40 | resp->set_parent_table_id(dtresp.table_id()); |
7011 | | |
7012 | | // Carry over error. |
7013 | 40 | if (dtresp.has_error()) { |
7014 | 0 | resp->mutable_error()->Swap(dtresp.mutable_error()); |
7015 | 0 | return s; |
7016 | 0 | } |
7017 | | |
7018 | | // Perform map updates. |
7019 | 40 | LockGuard lock(mutex_); |
7020 | 40 | TRACE("Acquired catalog manager lock"); |
7021 | 40 | tablegroup_ids_map_.erase(req->id()); |
7022 | 40 | tablegroup_tablet_ids_map_[req->namespace_id()].erase(req->id()); |
7023 | 40 | table_tablegroup_ids_map_.erase(parent_table_id); |
7024 | | |
7025 | 40 | LOG(INFO) << "Deleted tablegroup " << req->id(); |
7026 | 40 | return s; |
7027 | 40 | } |
7028 | | |
7029 | | Status CatalogManager::ListTablegroups(const ListTablegroupsRequestPB* req, |
7030 | | ListTablegroupsResponsePB* resp, |
7031 | 7 | rpc::RpcContext* rpc) { |
7032 | 7 | SharedLock lock(mutex_); |
7033 | | |
7034 | 7 | if (!req->has_namespace_id()) { |
7035 | 0 | Status s = STATUS(InvalidArgument, "Improper ListTablegroups request (missing fields)."); |
7036 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
7037 | 0 | } |
7038 | | |
7039 | 7 | if (tablegroup_tablet_ids_map_.find(req->namespace_id()) == tablegroup_tablet_ids_map_.end()) { |
7040 | 0 | return STATUS(NotFound, "Tablegroups not found for namespace id: ", req->namespace_id()); |
7041 | 0 | } |
7042 | | |
7043 | 9 | for (const auto& entry : tablegroup_tablet_ids_map_[req->namespace_id()])7 { |
7044 | 9 | const TablegroupId tgid = entry.first; |
7045 | 9 | if (tablegroup_ids_map_.find(tgid) == tablegroup_ids_map_.end()) { |
7046 | 0 | LOG(WARNING) << "Tablegroup info in " << req->namespace_id() |
7047 | 0 | << " not found for tablegroup id: " << tgid; |
7048 | 0 | continue; |
7049 | 0 | } |
7050 | 9 | scoped_refptr<TablegroupInfo> tginfo = tablegroup_ids_map_[tgid]; |
7051 | | |
7052 | 9 | TablegroupIdentifierPB *tg = resp->add_tablegroups(); |
7053 | 9 | tg->set_id(tginfo->id()); |
7054 | 9 | tg->set_namespace_id(tginfo->namespace_id()); |
7055 | 9 | } |
7056 | 7 | return Status::OK(); |
7057 | 7 | } |
7058 | | |
7059 | 2 | bool CatalogManager::HasTablegroups() { |
7060 | 2 | SharedLock lock(mutex_); |
7061 | 2 | return !tablegroup_ids_map_.empty(); |
7062 | 2 | } |
7063 | | |
7064 | | Status CatalogManager::CreateNamespace(const CreateNamespaceRequestPB* req, |
7065 | | CreateNamespaceResponsePB* resp, |
7066 | 2.65k | rpc::RpcContext* rpc) { |
7067 | 2.65k | Status return_status; |
7068 | | |
7069 | | // Copy the request, so we can fill in some defaults. |
7070 | 2.65k | LOG(INFO) << "CreateNamespace from " << RequestorString(rpc) |
7071 | 2.65k | << ": " << req->DebugString(); |
7072 | | |
7073 | 2.65k | scoped_refptr<NamespaceInfo> ns; |
7074 | 2.65k | std::vector<scoped_refptr<TableInfo>> pgsql_tables; |
7075 | 2.65k | TransactionMetadata txn; |
7076 | 2.65k | const auto db_type = GetDatabaseType(*req); |
7077 | 2.65k | { |
7078 | 2.65k | LockGuard lock(mutex_); |
7079 | 2.65k | TRACE("Acquired catalog manager lock"); |
7080 | | |
7081 | | // Validate the user request. |
7082 | | |
7083 | | // Verify that the namespace does not already exist. |
7084 | 2.65k | ns = FindPtrOrNull(namespace_ids_map_, req->namespace_id()); // Same ID. |
7085 | 2.65k | if (ns == nullptr && db_type != YQL_DATABASE_PGSQL) { |
7086 | | // PGSQL databases have name uniqueness handled at a different layer, so ignore overlaps. |
7087 | 2.39k | ns = FindPtrOrNull(namespace_names_mapper_[db_type], req->name()); |
7088 | 2.39k | } |
7089 | 2.65k | if (ns != nullptr) { |
7090 | 6 | resp->set_id(ns->id()); |
7091 | 6 | return_status = STATUS_SUBSTITUTE(AlreadyPresent, "Keyspace '$0' already exists", |
7092 | 6 | req->name()); |
7093 | 6 | LOG(WARNING) << "Found keyspace: " << ns->id() << ". Failed creating keyspace with error: " |
7094 | 6 | << return_status.ToString() << " Request:\n" << req->DebugString(); |
7095 | 6 | return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_ALREADY_PRESENT, |
7096 | 6 | return_status); |
7097 | 6 | } |
7098 | | |
7099 | | // Add the new namespace. |
7100 | | |
7101 | | // Create unique id for this new namespace. |
7102 | 2.64k | NamespaceId new_id = !req->namespace_id().empty() |
7103 | 2.64k | ? req->namespace_id()229 : GenerateIdUnlocked(SysRowEntryType::NAMESPACE)2.41k ; |
7104 | 2.64k | ns = new NamespaceInfo(new_id); |
7105 | 2.64k | ns->mutable_metadata()->StartMutation(); |
7106 | 2.64k | SysNamespaceEntryPB *metadata = &ns->mutable_metadata()->mutable_dirty()->pb; |
7107 | 2.64k | metadata->set_name(req->name()); |
7108 | 2.64k | metadata->set_database_type(db_type); |
7109 | 2.64k | metadata->set_colocated(req->colocated()); |
7110 | 2.64k | metadata->set_state(SysNamespaceEntryPB::PREPARING); |
7111 | | |
7112 | | // For namespace created for a Postgres database, save the list of tables and indexes for |
7113 | | // for the database that need to be copied. |
7114 | 2.64k | if (db_type == YQL_DATABASE_PGSQL) { |
7115 | 260 | if (req->source_namespace_id().empty()) { |
7116 | 137 | metadata->set_next_pg_oid(req->next_pg_oid()); |
7117 | 137 | } else { |
7118 | 123 | const auto source_oid = GetPgsqlDatabaseOid(req->source_namespace_id()); |
7119 | 123 | if (!source_oid.ok()) { |
7120 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, |
7121 | 0 | source_oid.status()); |
7122 | 0 | } |
7123 | 77.6k | for (const auto& iter : *table_ids_map_)123 { |
7124 | 77.6k | const auto& table_id = iter.first; |
7125 | 77.6k | const auto& table = iter.second; |
7126 | 77.6k | if (IsPgsqlId(table_id) && CHECK_RESULT75.2k (GetPgsqlDatabaseOid(table_id)) == *source_oid75.2k ) { |
7127 | | // Since indexes have dependencies on the base tables, put the tables in the front. |
7128 | 15.5k | const bool is_table = table->indexed_table_id().empty(); |
7129 | 15.5k | pgsql_tables.insert(is_table ? pgsql_tables.begin()8.75k : pgsql_tables.end()6.81k , table); |
7130 | 15.5k | } |
7131 | 77.6k | } |
7132 | | |
7133 | 123 | scoped_refptr<NamespaceInfo> source_ns = FindPtrOrNull(namespace_ids_map_, |
7134 | 123 | req->source_namespace_id()); |
7135 | 123 | if (!source_ns) { |
7136 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, |
7137 | 0 | STATUS(NotFound, "Source keyspace not found", |
7138 | 0 | req->source_namespace_id())); |
7139 | 0 | } |
7140 | 123 | auto source_ns_lock = source_ns->LockForRead(); |
7141 | 123 | metadata->set_next_pg_oid(source_ns_lock->pb.next_pg_oid()); |
7142 | 123 | } |
7143 | 260 | } |
7144 | | |
7145 | | // NS with a Transaction should be rolled back if the transaction does not get Committed. |
7146 | | // Store this on the NS for now and use it later. |
7147 | 2.64k | if (req->has_transaction() && PREDICT_TRUE114 (FLAGS_enable_transactional_ddl_gc)) { |
7148 | 92 | metadata->mutable_transaction()->CopyFrom(req->transaction()); |
7149 | 92 | txn = VERIFY_RESULT(TransactionMetadata::FromPB(req->transaction())); |
7150 | 92 | RSTATUS_DCHECK(!txn.status_tablet.empty(), Corruption, "Given incomplete Transaction"); |
7151 | 92 | } |
7152 | | |
7153 | | // Add the namespace to the in-memory map for the assignment. |
7154 | 2.64k | namespace_ids_map_[ns->id()] = ns; |
7155 | 2.64k | namespace_names_mapper_[db_type][req->name()] = ns; |
7156 | | |
7157 | 2.64k | resp->set_id(ns->id()); |
7158 | 2.64k | } |
7159 | 2.64k | TRACE("Inserted new keyspace info into CatalogManager maps"); |
7160 | | |
7161 | | // Update the on-disk system catalog. |
7162 | 2.64k | return_status = sys_catalog_->Upsert(leader_ready_term(), ns); |
7163 | 2.64k | if (!return_status.ok()) { |
7164 | 6 | LOG(WARNING) << "Keyspace creation failed:" << return_status.ToString(); |
7165 | 6 | { |
7166 | 6 | LockGuard lock(mutex_); |
7167 | 6 | namespace_ids_map_.erase(ns->id()); |
7168 | 6 | namespace_names_mapper_[db_type].erase(req->name()); |
7169 | 6 | } |
7170 | 6 | ns->mutable_metadata()->AbortMutation(); |
7171 | 6 | return CheckIfNoLongerLeaderAndSetupError(return_status, resp); |
7172 | 6 | } |
7173 | 2.63k | TRACE("Wrote keyspace to sys-catalog"); |
7174 | | // Commit the namespace in-memory state. |
7175 | 2.63k | ns->mutable_metadata()->CommitMutation(); |
7176 | | |
7177 | 2.63k | LOG(INFO) << "Created keyspace " << ns->ToString(); |
7178 | | |
7179 | 2.63k | if (req->has_creator_role_name()) { |
7180 | 904 | RETURN_NOT_OK(permissions_manager_->GrantPermissions( |
7181 | 904 | req->creator_role_name(), |
7182 | 904 | get_canonical_keyspace(req->name()), |
7183 | 904 | req->name() /* resource name */, |
7184 | 904 | req->name() /* keyspace name */, |
7185 | 904 | all_permissions_for_resource(ResourceType::KEYSPACE), |
7186 | 904 | ResourceType::KEYSPACE, |
7187 | 904 | resp)); |
7188 | 904 | } |
7189 | | |
7190 | | // Colocated databases need to create a parent tablet to serve as the base storage location. |
7191 | 2.63k | if (req->colocated()) { |
7192 | 17 | CreateTableRequestPB req; |
7193 | 17 | CreateTableResponsePB resp; |
7194 | 17 | const auto parent_table_id = ns->id() + kColocatedParentTableIdSuffix; |
7195 | 17 | const auto parent_table_name = ns->id() + kColocatedParentTableNameSuffix; |
7196 | 17 | req.set_name(parent_table_name); |
7197 | 17 | req.set_table_id(parent_table_id); |
7198 | 17 | req.mutable_namespace_()->set_name(ns->name()); |
7199 | 17 | req.mutable_namespace_()->set_id(ns->id()); |
7200 | 17 | req.set_table_type(GetTableTypeForDatabase(ns->database_type())); |
7201 | 17 | req.set_colocated(true); |
7202 | | |
7203 | 17 | YBSchemaBuilder schemaBuilder; |
7204 | 17 | schemaBuilder.AddColumn("parent_column")->Type(BINARY)->PrimaryKey()->NotNull(); |
7205 | 17 | YBSchema ybschema; |
7206 | 17 | CHECK_OK(schemaBuilder.Build(&ybschema)); |
7207 | 17 | auto schema = yb::client::internal::GetSchema(ybschema); |
7208 | 17 | SchemaToPB(schema, req.mutable_schema()); |
7209 | 17 | req.mutable_schema()->mutable_table_properties()->set_is_transactional(true); |
7210 | | |
7211 | | // create a parent table, which will create the tablet. |
7212 | 17 | Status s = CreateTable(&req, &resp, rpc); |
7213 | | // We do not lock here so it is technically possible that the table was already created. |
7214 | | // If so, there is nothing to do so we just ignore the "AlreadyPresent" error. |
7215 | 17 | if (!s.ok() && !s.IsAlreadyPresent()0 ) { |
7216 | 0 | LOG(WARNING) << "Keyspace creation failed:" << s.ToString(); |
7217 | | // TODO: We should verify this behavior works end-to-end. |
7218 | | // Diverging in-memory state from disk so the user can issue a delete if no new leader. |
7219 | 0 | auto l = ns->LockForWrite(); |
7220 | 0 | SysNamespaceEntryPB& metadata = ns->mutable_metadata()->mutable_dirty()->pb; |
7221 | 0 | metadata.set_state(SysNamespaceEntryPB::FAILED); |
7222 | 0 | l.Commit(); |
7223 | 0 | return s; |
7224 | 0 | } |
7225 | 17 | } |
7226 | | |
7227 | 2.63k | if ((db_type == YQL_DATABASE_PGSQL && !pgsql_tables.empty()254 ) || |
7228 | 2.63k | PREDICT_FALSE2.51k (GetAtomicFlag(&FLAGS_TEST_hang_on_namespace_transition))) { |
7229 | | // Process the subsequent work in the background thread (normally PGSQL). |
7230 | 125 | LOG(INFO) << "Keyspace create enqueued for later processing: " << ns->ToString(); |
7231 | 125 | RETURN_NOT_OK(background_tasks_thread_pool_->SubmitFunc( |
7232 | 125 | std::bind(&CatalogManager::ProcessPendingNamespace, this, ns->id(), pgsql_tables, txn))); |
7233 | 125 | return Status::OK(); |
7234 | 2.51k | } else { |
7235 | | // All work is done, it's now safe to online the namespace (normally YQL). |
7236 | 2.51k | auto l = ns->LockForWrite(); |
7237 | 2.51k | SysNamespaceEntryPB& metadata = ns->mutable_metadata()->mutable_dirty()->pb; |
7238 | 2.51k | if (metadata.state() == SysNamespaceEntryPB::PREPARING) { |
7239 | 2.51k | metadata.set_state(SysNamespaceEntryPB::RUNNING); |
7240 | 2.51k | return_status = sys_catalog_->Upsert(leader_ready_term(), ns); |
7241 | 2.51k | if (!return_status.ok()) { |
7242 | | // Diverging in-memory state from disk so the user can issue a delete if no new leader. |
7243 | 4 | LOG(WARNING) << "Keyspace creation failed:" << return_status.ToString(); |
7244 | 4 | metadata.set_state(SysNamespaceEntryPB::FAILED); |
7245 | 4 | return_status = CheckIfNoLongerLeaderAndSetupError(return_status, resp); |
7246 | 2.51k | } else { |
7247 | 2.51k | TRACE("Activated keyspace in sys-catalog"); |
7248 | 2.51k | LOG(INFO) << "Activated keyspace: " << ns->ToString(); |
7249 | 2.51k | } |
7250 | | // Commit the namespace in-memory state. |
7251 | 2.51k | l.Commit(); |
7252 | 2.51k | } else { |
7253 | 0 | LOG(WARNING) << "Keyspace has invalid state (" << metadata.state() << "), aborting create"; |
7254 | 0 | } |
7255 | 2.51k | } |
7256 | 2.51k | return return_status; |
7257 | 2.63k | } |
7258 | | |
7259 | | void CatalogManager::ProcessPendingNamespace( |
7260 | | NamespaceId id, |
7261 | | std::vector<scoped_refptr<TableInfo>> template_tables, |
7262 | 126 | TransactionMetadata txn) { |
7263 | 126 | LOG(INFO) << "ProcessPendingNamespace started for " << id; |
7264 | | |
7265 | | // Ensure that we are currently the Leader before handling DDL operations. |
7266 | 126 | { |
7267 | 126 | SCOPED_LEADER_SHARED_LOCK(l, this); |
7268 | 126 | if (!l.catalog_status().ok()) { |
7269 | 0 | LOG(WARNING) << "Catalog status failure: " << l.catalog_status().ToString(); |
7270 | | // Don't try again, we have to reset in-memory state after losing leader election. |
7271 | 0 | return; |
7272 | 0 | } |
7273 | 126 | if (!l.leader_status().ok()) { |
7274 | 0 | LOG(WARNING) << "Leader status failure: " << l.leader_status().ToString(); |
7275 | | // Don't try again, we have to reset in-memory state after losing leader election. |
7276 | 0 | return; |
7277 | 0 | } |
7278 | 126 | } |
7279 | | |
7280 | 126 | if (PREDICT_FALSE(GetAtomicFlag(&FLAGS_TEST_hang_on_namespace_transition))) { |
7281 | 2 | LOG(INFO) << "Artificially waiting (" << FLAGS_catalog_manager_bg_task_wait_ms |
7282 | 2 | << "ms) on namespace creation for " << id; |
7283 | 2 | SleepFor(MonoDelta::FromMilliseconds(FLAGS_catalog_manager_bg_task_wait_ms)); |
7284 | 2 | WARN_NOT_OK(background_tasks_thread_pool_->SubmitFunc( |
7285 | 2 | std::bind(&CatalogManager::ProcessPendingNamespace, this, id, template_tables, txn)), |
7286 | 2 | "Could not submit ProcessPendingNamespaces to thread pool"); |
7287 | 2 | return; |
7288 | 2 | } |
7289 | | |
7290 | 124 | scoped_refptr<NamespaceInfo> ns; |
7291 | 124 | { |
7292 | 124 | LockGuard lock(mutex_); |
7293 | 124 | ns = FindPtrOrNull(namespace_ids_map_, id);; |
7294 | 124 | } |
7295 | 124 | if (ns == nullptr) { |
7296 | 0 | LOG(WARNING) << "Pending Namespace not found to finish creation: " << id; |
7297 | 0 | return; |
7298 | 0 | } |
7299 | | |
7300 | | // Copy the system tables necessary to create this namespace. This can be time-intensive. |
7301 | 124 | bool success = true; |
7302 | 124 | if (!template_tables.empty()) { |
7303 | 123 | auto s = CopyPgsqlSysTables(ns->id(), template_tables); |
7304 | 123 | WARN_NOT_OK(s, "Error Copying PGSQL System Tables for Pending Namespace"); |
7305 | 123 | success = s.ok(); |
7306 | 123 | } |
7307 | | |
7308 | | // All work is done, change the namespace state regardless of success or failure. |
7309 | 124 | { |
7310 | 124 | auto l = ns->LockForWrite(); |
7311 | 124 | SysNamespaceEntryPB& metadata = ns->mutable_metadata()->mutable_dirty()->pb; |
7312 | 124 | if (metadata.state() == SysNamespaceEntryPB::PREPARING) { |
7313 | 124 | metadata.set_state(success ? SysNamespaceEntryPB::RUNNING123 : SysNamespaceEntryPB::FAILED1 ); |
7314 | 124 | auto s = sys_catalog_->Upsert(leader_ready_term(), ns); |
7315 | 124 | if (s.ok()) { |
7316 | 123 | TRACE("Done processing keyspace"); |
7317 | 123 | LOG(INFO) << (success ? "Processed" : "Failed"0 ) << " keyspace: " << ns->ToString(); |
7318 | | |
7319 | | // Verify Transaction gets committed, which occurs after namespace create finishes. |
7320 | 123 | if (success && metadata.has_transaction()) { |
7321 | 91 | LOG(INFO) << "Enqueuing keyspace for Transaction Verification: " << ns->ToString(); |
7322 | 91 | std::function<Status(bool)> when_done = |
7323 | 91 | std::bind(&CatalogManager::VerifyNamespacePgLayer, this, ns, _1); |
7324 | 91 | WARN_NOT_OK(background_tasks_thread_pool_->SubmitFunc( |
7325 | 91 | std::bind(&YsqlTransactionDdl::VerifyTransaction, ysql_transaction_.get(), |
7326 | 91 | txn, when_done)), |
7327 | 91 | "Could not submit VerifyTransaction to thread pool"); |
7328 | 91 | } |
7329 | 123 | } else { |
7330 | 1 | metadata.set_state(SysNamespaceEntryPB::FAILED); |
7331 | 1 | if (s.IsIllegalState() || s.IsAborted()) { |
7332 | 0 | s = STATUS(ServiceUnavailable, |
7333 | 0 | "operation requested can only be executed on a leader master, but this" |
7334 | 0 | " master is no longer the leader", s.ToString()); |
7335 | 1 | } else { |
7336 | 1 | s = s.CloneAndPrepend(Substitute( |
7337 | 1 | "An error occurred while modifying keyspace to $0 in sys-catalog: $1", |
7338 | 1 | metadata.state(), s.ToString())); |
7339 | 1 | } |
7340 | 1 | LOG(WARNING) << s.ToString(); |
7341 | 1 | } |
7342 | | // Commit the namespace in-memory state. |
7343 | 124 | l.Commit(); |
7344 | 124 | } else { |
7345 | 0 | LOG(WARNING) << "Bad keyspace state (" << metadata.state() |
7346 | 0 | << "), abandoning creation work for " << ns->ToString(); |
7347 | 0 | } |
7348 | 124 | } |
7349 | 124 | } |
7350 | | |
7351 | | Status CatalogManager::VerifyNamespacePgLayer( |
7352 | 90 | scoped_refptr<NamespaceInfo> ns, bool rpc_success) { |
7353 | | // Upon Transaction completion, check pg system table using OID to ensure SUCCESS. |
7354 | 90 | const auto pg_table_id = GetPgsqlTableId(atoi(kSystemNamespaceId), kPgDatabaseTableOid); |
7355 | 90 | auto entry_exists = VERIFY_RESULT( |
7356 | 90 | ysql_transaction_->PgEntryExists(pg_table_id, GetPgsqlDatabaseOid(ns->id()))); |
7357 | 0 | auto l = ns->LockForWrite(); |
7358 | 90 | SysNamespaceEntryPB& metadata = ns->mutable_metadata()->mutable_dirty()->pb; |
7359 | | |
7360 | | // #5981: Mark un-retryable rpc failures as pass to avoid infinite retry of GC'd txns. |
7361 | 90 | bool txn_check_passed = entry_exists || !rpc_success2 ; |
7362 | | |
7363 | 90 | if (txn_check_passed) { |
7364 | | // Passed checks. Remove the transaction from the entry since we're done processing it. |
7365 | 88 | SCHECK_EQ(metadata.state(), SysNamespaceEntryPB::RUNNING, Aborted, |
7366 | 85 | Substitute("Invalid Namespace state ($0), abandoning transaction GC work for $1", |
7367 | 85 | SysNamespaceEntryPB_State_Name(metadata.state()), ns->ToString())); |
7368 | 85 | metadata.clear_transaction(); |
7369 | 85 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), ns)); |
7370 | 85 | if (entry_exists) { |
7371 | 85 | LOG(INFO) << "Namespace transaction succeeded: " << ns->ToString(); |
7372 | 85 | } else { |
7373 | 0 | LOG(WARNING) << "Unknown RPC Failure, removing transaction on namespace: " << ns->ToString(); |
7374 | 0 | } |
7375 | | // Commit the namespace in-memory state. |
7376 | 85 | l.Commit(); |
7377 | 85 | } else { |
7378 | | // Transaction failed. We need to delete this Database now. |
7379 | 2 | SCHECK(metadata.state() == SysNamespaceEntryPB::RUNNING || |
7380 | 2 | metadata.state() == SysNamespaceEntryPB::FAILED, Aborted, |
7381 | 2 | Substitute("Invalid Namespace state ($0), aborting delete.", |
7382 | 2 | SysNamespaceEntryPB_State_Name(metadata.state()), ns->ToString())); |
7383 | 2 | LOG(INFO) << "Namespace transaction failed, deleting: " << ns->ToString(); |
7384 | 2 | metadata.set_state(SysNamespaceEntryPB::DELETING); |
7385 | 2 | metadata.clear_transaction(); |
7386 | 2 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), ns)); |
7387 | | // Commit the namespace in-memory state. |
7388 | 2 | l.Commit(); |
7389 | | // Async enqueue delete. |
7390 | 2 | RETURN_NOT_OK(background_tasks_thread_pool_->SubmitFunc( |
7391 | 2 | std::bind(&CatalogManager::DeleteYsqlDatabaseAsync, this, ns))); |
7392 | 2 | } |
7393 | 87 | return Status::OK(); |
7394 | 90 | } |
7395 | | |
7396 | | // Get the information about an in-progress create operation. |
7397 | | Status CatalogManager::IsCreateNamespaceDone(const IsCreateNamespaceDoneRequestPB* req, |
7398 | 3.56k | IsCreateNamespaceDoneResponsePB* resp) { |
7399 | 3.56k | auto ns_pb = req->namespace_(); |
7400 | | |
7401 | | // 1. Lookup the namespace and verify it exists. |
7402 | 3.56k | TRACE("Looking up keyspace"); |
7403 | 3.56k | auto ns = VERIFY_NAMESPACE_FOUND(FindNamespace(ns_pb), resp); |
7404 | | |
7405 | 3.56k | TRACE("Locking keyspace"); |
7406 | 3.56k | auto l = ns->LockForRead(); |
7407 | 3.56k | auto metadata = l->pb; |
7408 | | |
7409 | 3.56k | switch (metadata.state()) { |
7410 | | // Success cases. Done and working. |
7411 | 2.17k | case SysNamespaceEntryPB::RUNNING: |
7412 | 2.17k | if (!ns->colocated()) { |
7413 | 2.15k | resp->set_done(true); |
7414 | 2.15k | } else { |
7415 | | // Verify system table created as well, if colocated. |
7416 | 22 | IsCreateTableDoneRequestPB table_req; |
7417 | 22 | IsCreateTableDoneResponsePB table_resp; |
7418 | 22 | const auto parent_table_id = ns->id() + kColocatedParentTableIdSuffix; |
7419 | 22 | table_req.mutable_table()->set_table_id(parent_table_id); |
7420 | 22 | auto s = IsCreateTableDone(&table_req, &table_resp); |
7421 | 22 | resp->set_done(table_resp.done()); |
7422 | 22 | if (!s.ok()) { |
7423 | 0 | if (table_resp.has_error()) { |
7424 | 0 | resp->mutable_error()->Swap(table_resp.mutable_error()); |
7425 | 0 | } |
7426 | 0 | return s; |
7427 | 0 | } |
7428 | 22 | } |
7429 | 2.17k | break; |
7430 | | // These states indicate that a create completed but a subsequent remove was requested. |
7431 | 2.17k | case SysNamespaceEntryPB::DELETING: |
7432 | 0 | case SysNamespaceEntryPB::DELETED: |
7433 | 0 | resp->set_done(true); |
7434 | 0 | break; |
7435 | | // Pending cases. NOT DONE |
7436 | 1.39k | case SysNamespaceEntryPB::PREPARING: |
7437 | 1.39k | resp->set_done(false); |
7438 | 1.39k | break; |
7439 | | // Failure cases. Done, but we need to give the user an error message. |
7440 | 1 | case SysNamespaceEntryPB::FAILED: |
7441 | 1 | resp->set_done(true); |
7442 | 1 | return SetupError(resp->mutable_error(), MasterErrorPB::UNKNOWN_ERROR, STATUS(InternalError, |
7443 | 1 | "Namespace Create Failed: not onlined.")); |
7444 | 0 | default: |
7445 | 0 | Status s = STATUS_SUBSTITUTE(IllegalState, "IsCreateNamespaceDone failure: state=$0", |
7446 | 0 | SysNamespaceEntryPB_State_Name(metadata.state())); |
7447 | 0 | LOG(WARNING) << s.ToString(); |
7448 | 0 | resp->set_done(true); |
7449 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::UNKNOWN_ERROR, s); |
7450 | 3.56k | } |
7451 | | |
7452 | 3.56k | return Status::OK(); |
7453 | 3.56k | } |
7454 | | |
7455 | | Status CatalogManager::DeleteNamespace(const DeleteNamespaceRequestPB* req, |
7456 | | DeleteNamespaceResponsePB* resp, |
7457 | 1.64k | rpc::RpcContext* rpc) { |
7458 | 1.64k | auto status = DoDeleteNamespace(req, resp, rpc); |
7459 | 1.64k | if (!status.ok()) { |
7460 | 11 | return SetupError(resp->mutable_error(), status); |
7461 | 11 | } |
7462 | 1.63k | return status; |
7463 | 1.64k | } |
7464 | | |
7465 | | Status CatalogManager::DoDeleteNamespace(const DeleteNamespaceRequestPB* req, |
7466 | | DeleteNamespaceResponsePB* resp, |
7467 | 1.64k | rpc::RpcContext* rpc) { |
7468 | 1.64k | LOG(INFO) << "Servicing DeleteNamespace request from " << RequestorString(rpc) |
7469 | 1.64k | << ": " << req->ShortDebugString(); |
7470 | | |
7471 | | // Lookup the namespace and verify if it exists. |
7472 | 1.64k | TRACE("Looking up keyspace"); |
7473 | 1.64k | auto ns = VERIFY_RESULT1.64k (FindNamespace(req->namespace_()));1.64k |
7474 | | |
7475 | 1.64k | if (req->has_database_type() && req->database_type() != ns->database_type()74 ) { |
7476 | | // Could not find the right database to delete. |
7477 | 0 | return STATUS(NotFound, "Keyspace not found", ns->name(), |
7478 | 0 | MasterError(MasterErrorPB::NAMESPACE_NOT_FOUND)); |
7479 | 0 | } |
7480 | 1.64k | { |
7481 | | // Don't allow deletion if the namespace is in a transient state. |
7482 | 1.64k | auto cur_state = ns->state(); |
7483 | 1.64k | if (cur_state != SysNamespaceEntryPB::RUNNING && cur_state != SysNamespaceEntryPB::FAILED11 ) { |
7484 | 2 | if (cur_state == SysNamespaceEntryPB::DELETED) { |
7485 | 1 | return STATUS(NotFound, "Keyspace already deleted", ns->name(), |
7486 | 1 | MasterError(MasterErrorPB::NAMESPACE_NOT_FOUND)); |
7487 | 1 | } else { |
7488 | 1 | return STATUS_EC_FORMAT( |
7489 | 1 | TryAgain, MasterError(MasterErrorPB::IN_TRANSITION_CAN_RETRY), |
7490 | 1 | "Namespace deletion not allowed when State = $0", |
7491 | 1 | SysNamespaceEntryPB::State_Name(cur_state)); |
7492 | 1 | } |
7493 | 2 | } |
7494 | 1.64k | } |
7495 | | |
7496 | | // PGSQL has a completely forked implementation because it allows non-empty namespaces on delete. |
7497 | 1.64k | if (ns->database_type() == YQL_DATABASE_PGSQL) { |
7498 | 101 | return DeleteYsqlDatabase(req, resp, rpc); |
7499 | 101 | } |
7500 | | |
7501 | 1.54k | TRACE("Locking keyspace"); |
7502 | 1.54k | auto l = ns->LockForWrite(); |
7503 | | |
7504 | | // Only empty namespace can be deleted. |
7505 | 1.54k | TRACE("Looking for tables in the keyspace"); |
7506 | 1.54k | { |
7507 | 1.54k | SharedLock lock(mutex_); |
7508 | 1.54k | VLOG_WITH_FUNC0 (3) << "Acquired the catalog manager lock"0 ; |
7509 | | |
7510 | 32.5k | for (const TableInfoMap::value_type& entry : *table_ids_map_) { |
7511 | 32.5k | auto ltm = entry.second->LockForRead(); |
7512 | | |
7513 | 32.5k | if (!ltm->started_deleting() && ltm->namespace_id() == ns->id()29.9k ) { |
7514 | 3 | return STATUS_EC_FORMAT( |
7515 | 3 | InvalidArgument, MasterError(MasterErrorPB::NAMESPACE_IS_NOT_EMPTY), |
7516 | 3 | "Cannot delete keyspace which has $0: $1 [id=$2], request: $3", |
7517 | 3 | IsTable(ltm->pb) ? "table" : "index", ltm->name(), entry.second->id(), |
7518 | 3 | req->ShortDebugString()); |
7519 | 3 | } |
7520 | 32.5k | } |
7521 | | |
7522 | | // Only empty namespace can be deleted. |
7523 | 1.53k | TRACE("Looking for types in the keyspace"); |
7524 | | |
7525 | 1.53k | for (const UDTypeInfoMap::value_type& entry : udtype_ids_map_) { |
7526 | 0 | auto ltm = entry.second->LockForRead(); |
7527 | |
|
7528 | 0 | if (ltm->namespace_id() == ns->id()) { |
7529 | 0 | return STATUS_EC_FORMAT( |
7530 | 0 | InvalidArgument, MasterError(MasterErrorPB::NAMESPACE_IS_NOT_EMPTY), |
7531 | 0 | "Cannot delete keyspace which has type: $0 [id=$1], request: $2", |
7532 | 0 | ltm->name(), entry.second->id(), req->ShortDebugString()); |
7533 | 0 | } |
7534 | 0 | } |
7535 | 1.53k | } |
7536 | | |
7537 | | // Disallow deleting namespaces with snapshot schedules. |
7538 | 1.53k | auto map = VERIFY_RESULT(MakeSnapshotSchedulesToObjectIdsMap(SysRowEntryType::NAMESPACE)); |
7539 | 0 | for (const auto& schedule_and_objects : map) { |
7540 | 0 | for (const auto& id : schedule_and_objects.second) { |
7541 | 0 | if (id == ns->id()) { |
7542 | 0 | return STATUS_EC_FORMAT( |
7543 | 0 | InvalidArgument, MasterError(MasterErrorPB::NAMESPACE_IS_NOT_EMPTY), |
7544 | 0 | "Cannot delete keyspace which has schedule: $0, request: $1", |
7545 | 0 | schedule_and_objects.first, req->ShortDebugString()); |
7546 | 0 | } |
7547 | 0 | } |
7548 | 0 | } |
7549 | | |
7550 | | // [Delete]. Skip the DELETING->DELETED state, since no tables are present in this namespace. |
7551 | 1.53k | TRACE("Updating metadata on disk"); |
7552 | | // Update sys-catalog. |
7553 | 1.53k | Status s = sys_catalog_->Delete(leader_ready_term(), ns); |
7554 | 1.53k | if (!s.ok()) { |
7555 | | // The mutation will be aborted when 'l' exits the scope on early return. |
7556 | 0 | s = s.CloneAndPrepend("An error occurred while updating sys-catalog"); |
7557 | 0 | LOG(WARNING) << s; |
7558 | 0 | return CheckIfNoLongerLeader(s); |
7559 | 0 | } |
7560 | | |
7561 | | // Update the in-memory state. |
7562 | 1.53k | TRACE("Committing in-memory state"); |
7563 | 1.53k | l.Commit(); |
7564 | | |
7565 | | // Remove the namespace from all CatalogManager mappings. |
7566 | 1.53k | { |
7567 | 1.53k | LockGuard lock(mutex_); |
7568 | 1.53k | if (namespace_names_mapper_[ns->database_type()].erase(ns->name()) < 1) { |
7569 | 0 | LOG(WARNING) << Format("Could not remove namespace from names map, id=$1", ns->id()); |
7570 | 0 | } |
7571 | 1.53k | if (namespace_ids_map_.erase(ns->id()) < 1) { |
7572 | 0 | LOG(WARNING) << Format("Could not remove namespace from ids map, id=$1", ns->id()); |
7573 | 0 | } |
7574 | 1.53k | } |
7575 | | |
7576 | | // Delete any permissions granted on this keyspace to any role. See comment in DeleteTable() for |
7577 | | // more details. |
7578 | 1.53k | string canonical_resource = get_canonical_keyspace(req->namespace_().name()); |
7579 | 1.53k | RETURN_NOT_OK(permissions_manager_->RemoveAllPermissionsForResource(canonical_resource, resp)); |
7580 | | |
7581 | 1.53k | LOG(INFO) << "Successfully deleted keyspace " << ns->ToString() |
7582 | 1.53k | << " per request from " << RequestorString(rpc); |
7583 | 1.53k | return Status::OK(); |
7584 | 1.53k | } |
7585 | | |
7586 | 0 | void CatalogManager::DeleteYcqlDatabaseAsync(scoped_refptr<NamespaceInfo> database) { |
7587 | 0 | TRACE("Locking keyspace"); |
7588 | 0 | auto l = database->LockForWrite(); |
7589 | | |
7590 | | // Only empty namespace can be deleted. |
7591 | 0 | TRACE("Looking for tables in the keyspace"); |
7592 | 0 | { |
7593 | 0 | SharedLock lock(mutex_); |
7594 | 0 | VLOG_WITH_FUNC(3) << "Acquired the catalog manager lock"; |
7595 | |
|
7596 | 0 | for (const TableInfoMap::value_type& entry : *table_ids_map_) { |
7597 | 0 | auto ltm = entry.second->LockForRead(); |
7598 | |
|
7599 | 0 | if (!ltm->started_deleting() && ltm->namespace_id() == database->id()) { |
7600 | 0 | LOG(WARNING) << "Cannot delete keyspace which has " << ltm->name() |
7601 | 0 | << " with id=" << entry.second->id(); |
7602 | 0 | return; |
7603 | 0 | } |
7604 | 0 | } |
7605 | 0 | } |
7606 | | |
7607 | | // Only empty namespace can be deleted. |
7608 | 0 | TRACE("Looking for types in the keyspace"); |
7609 | 0 | { |
7610 | 0 | SharedLock lock(mutex_); |
7611 | 0 | VLOG_WITH_FUNC(3) << "Acquired the catalog manager lock"; |
7612 | |
|
7613 | 0 | for (const UDTypeInfoMap::value_type& entry : udtype_ids_map_) { |
7614 | 0 | auto ltm = entry.second->LockForRead(); |
7615 | |
|
7616 | 0 | if (ltm->namespace_id() == database->id()) { |
7617 | 0 | LOG(WARNING) << "Cannot delete keyspace which has type: " << ltm->name() |
7618 | 0 | << " with id=" << entry.second->id(); |
7619 | 0 | return; |
7620 | 0 | } |
7621 | 0 | } |
7622 | 0 | } |
7623 | | |
7624 | | // [Delete]. Skip the DELETING->DELETED state, since no tables are present in this namespace. |
7625 | 0 | TRACE("Updating metadata on disk"); |
7626 | | // Update sys-catalog. |
7627 | 0 | Status s = sys_catalog_->Delete(leader_ready_term(), database); |
7628 | 0 | if (!s.ok()) { |
7629 | | // The mutation will be aborted when 'l' exits the scope on early return. |
7630 | 0 | s = s.CloneAndPrepend(Substitute("An error occurred while updating sys-catalog: $0", |
7631 | 0 | s.ToString())); |
7632 | 0 | LOG(WARNING) << s.ToString(); |
7633 | 0 | return; |
7634 | 0 | } |
7635 | | |
7636 | | // Update the in-memory state. |
7637 | 0 | TRACE("Committing in-memory state"); |
7638 | 0 | l.Commit(); |
7639 | | |
7640 | | // Remove the namespace from all CatalogManager mappings. |
7641 | 0 | { |
7642 | 0 | LockGuard lock(mutex_); |
7643 | 0 | namespace_names_mapper_[database->database_type()].erase(database->name()); |
7644 | 0 | if (namespace_ids_map_.erase(database->id()) < 1) { |
7645 | 0 | LOG(WARNING) << Format("Could not remove namespace from maps, id=$1", database->id()); |
7646 | 0 | } |
7647 | 0 | } |
7648 | | |
7649 | | // Delete any permissions granted on this keyspace to any role. See comment in DeleteTable() for |
7650 | | // more details. |
7651 | 0 | string canonical_resource = get_canonical_keyspace(database->name()); |
7652 | 0 | DeleteNamespaceResponsePB resp; |
7653 | 0 | s = permissions_manager_->RemoveAllPermissionsForResource(canonical_resource, &resp); |
7654 | 0 | if (s.ok()) { |
7655 | 0 | LOG(INFO) << "Successfully deleted keyspace " << database->ToString(); |
7656 | 0 | } else { |
7657 | 0 | LOG(WARNING) << "Error deleting keyspace " << database->ToString() << ": " << s; |
7658 | 0 | } |
7659 | 0 | } |
7660 | | |
7661 | | Status CatalogManager::DeleteYsqlDatabase(const DeleteNamespaceRequestPB* req, |
7662 | | DeleteNamespaceResponsePB* resp, |
7663 | 101 | rpc::RpcContext* rpc) { |
7664 | | // Lookup database. |
7665 | 101 | auto database = VERIFY_NAMESPACE_FOUND(FindNamespace(req->namespace_()), resp); |
7666 | | |
7667 | | // Make sure this is a YSQL database. |
7668 | 101 | if (database->database_type() != YQL_DATABASE_PGSQL) { |
7669 | | // A non-YSQL namespace is found, but the rpc requests to drop a YSQL database. |
7670 | 0 | Status s = STATUS(NotFound, "YSQL database not found", database->name()); |
7671 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s); |
7672 | 0 | } |
7673 | | |
7674 | | // Set the Namespace to DELETING. |
7675 | 101 | TRACE("Locking database"); |
7676 | 101 | auto l = database->LockForWrite(); |
7677 | 101 | SysNamespaceEntryPB &metadata = database->mutable_metadata()->mutable_dirty()->pb; |
7678 | 101 | if (metadata.state() == SysNamespaceEntryPB::RUNNING || |
7679 | 101 | metadata.state() == SysNamespaceEntryPB::FAILED9 ) { |
7680 | 101 | metadata.set_state(SysNamespaceEntryPB::DELETING); |
7681 | 101 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), database)); |
7682 | 96 | TRACE("Marked keyspace for deletion in sys-catalog"); |
7683 | | // Commit the namespace in-memory state. |
7684 | 96 | l.Commit(); |
7685 | 96 | } else { |
7686 | 0 | Status s = STATUS_SUBSTITUTE(IllegalState, |
7687 | 0 | "Keyspace ($0) has invalid state ($1), aborting delete", |
7688 | 0 | database->name(), metadata.state()); |
7689 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INTERNAL_ERROR, s); |
7690 | 0 | } |
7691 | | |
7692 | 96 | return background_tasks_thread_pool_->SubmitFunc( |
7693 | 96 | std::bind(&CatalogManager::DeleteYsqlDatabaseAsync, this, database)); |
7694 | 101 | } |
7695 | | |
7696 | 100 | void CatalogManager::DeleteYsqlDatabaseAsync(scoped_refptr<NamespaceInfo> database) { |
7697 | 100 | TEST_PAUSE_IF_FLAG(TEST_hang_on_namespace_transition); |
7698 | | |
7699 | | // Lock database before removing content. |
7700 | 100 | TRACE("Locking database"); |
7701 | 100 | auto l = database->LockForWrite(); |
7702 | 100 | SysNamespaceEntryPB &metadata = database->mutable_metadata()->mutable_dirty()->pb; |
7703 | | |
7704 | | // A DELETED Namespace has finished but was tombstoned to avoid immediately reusing the same ID. |
7705 | | // We consider a restart enough time, so we just need to remove it from the SysCatalog. |
7706 | 100 | if (metadata.state() == SysNamespaceEntryPB::DELETED) { |
7707 | 0 | Status s = sys_catalog_->Delete(leader_ready_term(), database); |
7708 | 0 | WARN_NOT_OK(s, "SysCatalog DeleteItem for Namespace"); |
7709 | 0 | if (!s.ok()) { |
7710 | 0 | return; |
7711 | 0 | } |
7712 | 100 | } else if (metadata.state() == SysNamespaceEntryPB::DELETING) { |
7713 | | // Delete all tables in the database. |
7714 | 99 | TRACE("Delete all tables in YSQL database"); |
7715 | 99 | Status s = DeleteYsqlDBTables(database); |
7716 | 99 | WARN_NOT_OK(s, "DeleteYsqlDBTables failed"); |
7717 | 99 | if (!s.ok()) { |
7718 | | // Move to FAILED so DeleteNamespace can be reissued by the user. |
7719 | 4 | metadata.set_state(SysNamespaceEntryPB::FAILED); |
7720 | 4 | l.Commit(); |
7721 | 4 | return; |
7722 | 4 | } |
7723 | | |
7724 | | // Once all user-facing data has been offlined, move the Namespace to DELETED state. |
7725 | 95 | metadata.set_state(SysNamespaceEntryPB::DELETED); |
7726 | 95 | s = sys_catalog_->Upsert(leader_ready_term(), database); |
7727 | 95 | WARN_NOT_OK(s, "SysCatalog Update for Namespace"); |
7728 | 95 | if (!s.ok()) { |
7729 | | // Move to FAILED so DeleteNamespace can be reissued by the user. |
7730 | 1 | metadata.set_state(SysNamespaceEntryPB::FAILED); |
7731 | 1 | l.Commit(); |
7732 | 1 | return; |
7733 | 1 | } |
7734 | 94 | TRACE("Marked keyspace as deleted in sys-catalog"); |
7735 | 94 | } else { |
7736 | 1 | LOG(WARNING) << "Keyspace (" << database->name() << ") has invalid state (" |
7737 | 1 | << metadata.state() << "), aborting delete"; |
7738 | 1 | return; |
7739 | 1 | } |
7740 | | |
7741 | | // Remove namespace from CatalogManager name mapping. Will remove ID map after all Tables gone. |
7742 | 94 | { |
7743 | 94 | LockGuard lock(mutex_); |
7744 | 94 | if (namespace_names_mapper_[database->database_type()].erase(database->name()) < 1) { |
7745 | 0 | LOG(WARNING) << Format("Could not remove namespace from maps, name=$0, id=$1", |
7746 | 0 | database->name(), database->id()); |
7747 | 0 | } |
7748 | 94 | } |
7749 | | |
7750 | | // Update the in-memory state. |
7751 | 94 | TRACE("Committing in-memory state"); |
7752 | 94 | l.Commit(); |
7753 | | |
7754 | | // DROP completed. Return status. |
7755 | 94 | LOG(INFO) << "Successfully deleted YSQL database " << database->ToString(); |
7756 | 94 | } |
7757 | | |
7758 | | // IMPORTANT: If modifying, consider updating DeleteTable(), the singular deletion API. |
7759 | 99 | Status CatalogManager::DeleteYsqlDBTables(const scoped_refptr<NamespaceInfo>& database) { |
7760 | 99 | TabletInfoPtr sys_tablet_info; |
7761 | 99 | vector<pair<scoped_refptr<TableInfo>, TableInfo::WriteLock>> tables; |
7762 | 99 | std::unordered_set<TableId> sys_table_ids; |
7763 | 99 | { |
7764 | | // Lock the catalog to iterate over table_ids_map_. |
7765 | 99 | SharedLock lock(mutex_); |
7766 | | |
7767 | 99 | sys_tablet_info = tablet_map_->find(kSysCatalogTabletId)->second; |
7768 | | |
7769 | | // Populate tables and sys_table_ids. |
7770 | 63.7k | for (const TableInfoMap::value_type& entry : *table_ids_map_) { |
7771 | 63.7k | scoped_refptr<TableInfo> table = entry.second; |
7772 | 63.7k | if (table->namespace_id() != database->id()) { |
7773 | 55.5k | continue; |
7774 | 55.5k | } |
7775 | 8.13k | auto l = table->LockForWrite(); |
7776 | 8.13k | if (l->started_deleting()) { |
7777 | 49 | continue; |
7778 | 49 | } |
7779 | 8.13k | RSTATUS_DCHECK( |
7780 | 8.08k | !l->pb.is_pg_shared_table(), Corruption, "Shared table found in database"); |
7781 | | |
7782 | 8.08k | if (IsSystemTable(*table)) { |
7783 | 8.00k | sys_table_ids.insert(table->id()); |
7784 | 8.00k | } |
7785 | | |
7786 | | // For regular (indexed) table, insert table info and lock in the front of the list. Else for |
7787 | | // index table, append them to the end. We do so so that we will commit and delete the indexed |
7788 | | // table first before its indexes. |
7789 | 8.08k | if (IsTable(l->pb)) { |
7790 | 4.51k | tables.insert(tables.begin(), {table, std::move(l)}); |
7791 | 4.51k | } else { |
7792 | 3.57k | tables.push_back({table, std::move(l)}); |
7793 | 3.57k | } |
7794 | 8.08k | } |
7795 | 99 | } |
7796 | | // Remove the system tables from RAFT. |
7797 | 99 | TRACE("Sending system table delete RPCs"); |
7798 | 8.00k | for (auto &table_id : sys_table_ids) { |
7799 | 8.00k | RETURN_NOT_OK(sys_catalog_->DeleteYsqlSystemTable(table_id)); |
7800 | 8.00k | } |
7801 | | // Remove the system tables from the system catalog TabletInfo. |
7802 | 99 | RETURN_NOT_OK(RemoveTableIdsFromTabletInfo(sys_tablet_info, sys_table_ids)); |
7803 | | |
7804 | | // Set all table states to DELETING as one batch RPC call. |
7805 | 95 | TRACE("Sending delete table batch RPC to sys catalog"); |
7806 | 95 | vector<TableInfo *> tables_rpc; |
7807 | 95 | tables_rpc.reserve(tables.size()); |
7808 | 8.08k | for (auto &table_and_lock : tables) { |
7809 | 8.08k | tables_rpc.push_back(table_and_lock.first.get()); |
7810 | 8.08k | auto &l = table_and_lock.second; |
7811 | | // Mark the table state as DELETING tablets. |
7812 | 8.08k | l.mutable_data()->set_state(SysTablesEntryPB::DELETING, |
7813 | 8.08k | Substitute("Started deleting at $0", LocalTimeAsString())); |
7814 | 8.08k | } |
7815 | | // Update all the table states in raft in bulk. |
7816 | 95 | Status s = sys_catalog_->Upsert(leader_ready_term(), tables_rpc); |
7817 | 95 | if (!s.ok()) { |
7818 | | // The mutation will be aborted when 'l' exits the scope on early return. |
7819 | 0 | s = s.CloneAndPrepend(Substitute("An error occurred while updating sys tables: $0", |
7820 | 0 | s.ToString())); |
7821 | 0 | LOG(WARNING) << s.ToString(); |
7822 | 0 | return CheckIfNoLongerLeader(s); |
7823 | 0 | } |
7824 | 8.08k | for (auto &table_and_lock : tables)95 { |
7825 | 8.08k | auto &table = table_and_lock.first; |
7826 | 8.08k | auto &l = table_and_lock.second; |
7827 | | // Cancel all table busywork and commit the DELETING change. |
7828 | 8.08k | l.Commit(); |
7829 | 8.08k | table->AbortTasks(); |
7830 | 8.08k | } |
7831 | | |
7832 | | // Batch remove all relevant CDC streams, handle after releasing Table locks. |
7833 | 95 | TRACE("Deleting CDC streams on table"); |
7834 | 95 | vector<TableId> id_list; |
7835 | 95 | id_list.reserve(tables.size()); |
7836 | 8.08k | for (auto &table_and_lock : tables) { |
7837 | 8.08k | id_list.push_back(table_and_lock.first->id()); |
7838 | 8.08k | } |
7839 | 95 | RETURN_NOT_OK(DeleteCDCStreamsForTables(id_list)); |
7840 | | |
7841 | | // Send a DeleteTablet() RPC request to each tablet replica in the table. |
7842 | 8.08k | for (auto &table_and_lock : tables)95 { |
7843 | 8.08k | auto &table = table_and_lock.first; |
7844 | | // TODO(pitr) undelete for YSQL tables |
7845 | 8.08k | RETURN_NOT_OK(DeleteTabletsAndSendRequests(table, {})); |
7846 | 8.08k | } |
7847 | | |
7848 | | // Invoke any background tasks and return (notably, table cleanup). |
7849 | 95 | background_tasks_->Wake(); |
7850 | 95 | return Status::OK(); |
7851 | 95 | } |
7852 | | |
7853 | | // Get the information about an in-progress delete operation. |
7854 | | Status CatalogManager::IsDeleteNamespaceDone(const IsDeleteNamespaceDoneRequestPB* req, |
7855 | 1.74k | IsDeleteNamespaceDoneResponsePB* resp) { |
7856 | 1.74k | auto ns_pb = req->namespace_(); |
7857 | | |
7858 | | // Lookup the namespace and verify it exists. |
7859 | 1.74k | TRACE("Looking up keyspace"); |
7860 | 1.74k | auto ns = FindNamespace(ns_pb); |
7861 | 1.74k | if (!ns.ok()) { |
7862 | | // Namespace no longer exists means success. |
7863 | 1.55k | LOG(INFO) << "Servicing IsDeleteNamespaceDone request for " |
7864 | 1.55k | << ns_pb.DebugString() << ": deleted (not found)"; |
7865 | 1.55k | resp->set_done(true); |
7866 | 1.55k | return Status::OK(); |
7867 | 1.55k | } |
7868 | | |
7869 | 192 | TRACE("Locking keyspace"); |
7870 | 192 | auto l = (**ns).LockForRead(); |
7871 | 192 | auto& metadata = l->pb; |
7872 | | |
7873 | 192 | if (metadata.state() == SysNamespaceEntryPB::DELETED) { |
7874 | 73 | resp->set_done(true); |
7875 | 119 | } else if (metadata.state() == SysNamespaceEntryPB::DELETING) { |
7876 | 114 | resp->set_done(false); |
7877 | 114 | } else { |
7878 | 5 | Status s = STATUS_SUBSTITUTE(IllegalState, |
7879 | 5 | "Servicing IsDeleteNamespaceDone request for $0: NOT deleted (state=$1)", |
7880 | 5 | ns_pb.DebugString(), metadata.state()); |
7881 | 5 | LOG(WARNING) << s.ToString(); |
7882 | | // Done != Successful. We just want to let the user know the delete has finished processing. |
7883 | 5 | resp->set_done(true); |
7884 | 5 | return SetupError(resp->mutable_error(), MasterErrorPB::INTERNAL_ERROR, s); |
7885 | 5 | } |
7886 | 187 | return Status::OK(); |
7887 | 192 | } |
7888 | | |
7889 | | Status CatalogManager::AlterNamespace(const AlterNamespaceRequestPB* req, |
7890 | | AlterNamespaceResponsePB* resp, |
7891 | 7 | rpc::RpcContext* rpc) { |
7892 | 7 | LOG(INFO) << "Servicing AlterNamespace request from " << RequestorString(rpc) |
7893 | 7 | << ": " << req->ShortDebugString(); |
7894 | | |
7895 | 7 | auto database = VERIFY_NAMESPACE_FOUND(FindNamespace(req->namespace_()), resp); |
7896 | | |
7897 | 7 | if (req->namespace_().has_database_type() && |
7898 | 7 | database->database_type() != req->namespace_().database_type()5 ) { |
7899 | 0 | Status s = STATUS(NotFound, "Database not found", database->name()); |
7900 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s); |
7901 | 0 | } |
7902 | | |
7903 | 7 | TRACE("Locking database"); |
7904 | 7 | auto l = database->LockForWrite(); |
7905 | | |
7906 | | // Don't allow an alter if the namespace isn't running. |
7907 | 7 | if (l->pb.state() != SysNamespaceEntryPB::RUNNING) { |
7908 | 1 | Status s = STATUS_SUBSTITUTE(TryAgain, "Namespace not running. State = $0", |
7909 | 1 | SysNamespaceEntryPB::State_Name(l->pb.state())); |
7910 | 1 | return SetupError(resp->mutable_error(), NamespaceMasterError(l->pb.state()), s); |
7911 | 1 | } |
7912 | | |
7913 | 6 | const string old_name = l->pb.name(); |
7914 | | |
7915 | 6 | if (req->has_new_name() && req->new_name() != old_name) { |
7916 | 6 | const string new_name = req->new_name(); |
7917 | | |
7918 | | // Verify that the new name does not exist. |
7919 | 6 | NamespaceIdentifierPB ns_identifier; |
7920 | 6 | ns_identifier.set_name(new_name); |
7921 | 6 | if (req->namespace_().has_database_type()) { |
7922 | 4 | ns_identifier.set_database_type(req->namespace_().database_type()); |
7923 | 4 | } |
7924 | | // TODO: This check will only work for YSQL once we add support for YSQL namespaces in |
7925 | | // namespace_name_map (#1476). |
7926 | 6 | LockGuard lock(mutex_); |
7927 | 6 | TRACE("Acquired catalog manager lock"); |
7928 | 6 | auto ns = FindNamespaceUnlocked(ns_identifier); |
7929 | 6 | if (ns.ok() && req->namespace_().has_database_type()0 && |
7930 | 6 | (**ns).database_type() == req->namespace_().database_type()0 ) { |
7931 | 0 | Status s = STATUS_SUBSTITUTE(AlreadyPresent, "Keyspace '$0' already exists", (**ns).name()); |
7932 | 0 | LOG(WARNING) << "Found keyspace: " << (**ns).id() << ". Failed altering keyspace with error: " |
7933 | 0 | << s << " Request:\n" << req->DebugString(); |
7934 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s); |
7935 | 0 | } |
7936 | | |
7937 | 6 | namespace_names_mapper_[req->namespace_().database_type()][new_name] = database; |
7938 | 6 | namespace_names_mapper_[req->namespace_().database_type()].erase(old_name); |
7939 | | |
7940 | 6 | l.mutable_data()->pb.set_name(new_name); |
7941 | 6 | } |
7942 | | |
7943 | 6 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), database)); |
7944 | | |
7945 | 6 | TRACE("Committing in-memory state"); |
7946 | 6 | l.Commit(); |
7947 | | |
7948 | 6 | LOG(INFO) << "Successfully altered keyspace " << req->namespace_().name() |
7949 | 6 | << " per request from " << RequestorString(rpc); |
7950 | 6 | return Status::OK(); |
7951 | 6 | } |
7952 | | |
7953 | | Status CatalogManager::ListNamespaces(const ListNamespacesRequestPB* req, |
7954 | 5.13k | ListNamespacesResponsePB* resp) { |
7955 | 5.13k | NamespaceInfoMap namespace_ids_copy; |
7956 | 5.13k | { |
7957 | 5.13k | SharedLock lock(mutex_); |
7958 | 5.13k | namespace_ids_copy = namespace_ids_map_; |
7959 | 5.13k | } |
7960 | | |
7961 | 24.3k | for (const auto& entry : namespace_ids_copy) { |
7962 | 24.3k | const auto& namespace_info = *entry.second; |
7963 | | // If the request asks for namespaces for a specific database type, filter by the type. |
7964 | 24.3k | if (req->has_database_type() && namespace_info.database_type() != req->database_type()2.50k ) { |
7965 | 848 | continue; |
7966 | 848 | } |
7967 | | // Only return RUNNING namespaces. |
7968 | 23.4k | if (namespace_info.state() != SysNamespaceEntryPB::RUNNING) { |
7969 | 162 | continue; |
7970 | 162 | } |
7971 | | |
7972 | 23.3k | NamespaceIdentifierPB *ns = resp->add_namespaces(); |
7973 | 23.3k | ns->set_id(namespace_info.id()); |
7974 | 23.3k | ns->set_name(namespace_info.name()); |
7975 | 23.3k | ns->set_database_type(namespace_info.database_type()); |
7976 | 23.3k | } |
7977 | 5.13k | return Status::OK(); |
7978 | 5.13k | } |
7979 | | |
7980 | | Status CatalogManager::GetNamespaceInfo(const GetNamespaceInfoRequestPB* req, |
7981 | | GetNamespaceInfoResponsePB* resp, |
7982 | 6.03k | rpc::RpcContext* rpc) { |
7983 | 6.03k | LOG(INFO) << __func__ << " from " << RequestorString(rpc) << ": " << req->ShortDebugString(); |
7984 | | |
7985 | | // Look up the namespace and verify if it exists. |
7986 | 6.03k | TRACE("Looking up namespace"); |
7987 | 6.03k | auto ns = VERIFY_NAMESPACE_FOUND(FindNamespace(req->namespace_()), resp); |
7988 | | |
7989 | 6.03k | resp->mutable_namespace_()->set_id(ns->id()); |
7990 | 6.03k | resp->mutable_namespace_()->set_name(ns->name()); |
7991 | 6.03k | resp->mutable_namespace_()->set_database_type(ns->database_type()); |
7992 | 6.03k | resp->set_colocated(ns->colocated()); |
7993 | 6.03k | return Status::OK(); |
7994 | 6.03k | } |
7995 | | |
7996 | | Status CatalogManager::RedisConfigSet( |
7997 | 182 | const RedisConfigSetRequestPB* req, RedisConfigSetResponsePB* resp, rpc::RpcContext* rpc) { |
7998 | 182 | DCHECK(req->has_keyword()); |
7999 | 182 | const auto& key = req->keyword(); |
8000 | 182 | SysRedisConfigEntryPB config_entry; |
8001 | 182 | config_entry.set_key(key); |
8002 | 182 | *config_entry.mutable_args() = req->args(); |
8003 | 182 | bool created = false; |
8004 | | |
8005 | 182 | TRACE("Acquired catalog manager lock"); |
8006 | 182 | LockGuard lock(mutex_); |
8007 | 182 | scoped_refptr<RedisConfigInfo> cfg = FindPtrOrNull(redis_config_map_, req->keyword()); |
8008 | 182 | if (cfg == nullptr) { |
8009 | 182 | created = true; |
8010 | 182 | cfg = new RedisConfigInfo(key); |
8011 | 182 | redis_config_map_[key] = cfg; |
8012 | 182 | } |
8013 | | |
8014 | 182 | auto wl = cfg->LockForWrite(); |
8015 | 182 | wl.mutable_data()->pb = std::move(config_entry); |
8016 | 182 | if (created) { |
8017 | 182 | CHECK_OK(sys_catalog_->Upsert(leader_ready_term(), cfg)); |
8018 | 182 | } else { |
8019 | 0 | CHECK_OK(sys_catalog_->Upsert(leader_ready_term(), cfg)); |
8020 | 0 | } |
8021 | 182 | wl.Commit(); |
8022 | 182 | return Status::OK(); |
8023 | 182 | } |
8024 | | |
8025 | | Status CatalogManager::RedisConfigGet( |
8026 | 1.17k | const RedisConfigGetRequestPB* req, RedisConfigGetResponsePB* resp, rpc::RpcContext* rpc) { |
8027 | 1.17k | DCHECK(req->has_keyword()); |
8028 | 1.17k | resp->set_keyword(req->keyword()); |
8029 | 1.17k | TRACE("Acquired catalog manager lock"); |
8030 | 1.17k | SharedLock lock(mutex_); |
8031 | 1.17k | scoped_refptr<RedisConfigInfo> cfg = FindPtrOrNull(redis_config_map_, req->keyword()); |
8032 | 1.17k | if (cfg == nullptr) { |
8033 | 821 | Status s = STATUS_SUBSTITUTE(NotFound, "Redis config for $0 does not exists", req->keyword()); |
8034 | 821 | return SetupError(resp->mutable_error(), MasterErrorPB::REDIS_CONFIG_NOT_FOUND, s); |
8035 | 821 | } |
8036 | 355 | auto rci = cfg->LockForRead(); |
8037 | 355 | resp->mutable_args()->CopyFrom(rci->pb.args()); |
8038 | 355 | return Status::OK(); |
8039 | 1.17k | } |
8040 | | |
8041 | | Status CatalogManager::CreateUDType(const CreateUDTypeRequestPB* req, |
8042 | | CreateUDTypeResponsePB* resp, |
8043 | 47 | rpc::RpcContext* rpc) { |
8044 | 47 | LOG(INFO) << "CreateUDType from " << RequestorString(rpc) |
8045 | 47 | << ": " << req->DebugString(); |
8046 | | |
8047 | 47 | Status s; |
8048 | 47 | scoped_refptr<UDTypeInfo> tp; |
8049 | 47 | scoped_refptr<NamespaceInfo> ns; |
8050 | | |
8051 | | // Lookup the namespace and verify if it exists. |
8052 | 47 | if (req->has_namespace_()) { |
8053 | 47 | TRACE("Looking up namespace"); |
8054 | 47 | ns = VERIFY_NAMESPACE_FOUND(FindNamespace(req->namespace_()), resp); |
8055 | 47 | if (ns->database_type() != YQLDatabase::YQL_DATABASE_CQL) { |
8056 | 0 | Status s = STATUS(NotFound, "Namespace not found"); |
8057 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s); |
8058 | 0 | } |
8059 | 47 | } |
8060 | | |
8061 | | // Get all the referenced types (if any). |
8062 | 47 | std::vector<std::string> referenced_udts; |
8063 | 89 | for (const QLTypePB& field_type : req->field_types()) { |
8064 | 89 | QLType::GetUserDefinedTypeIds(field_type, /* transitive = */ true, &referenced_udts); |
8065 | 89 | } |
8066 | | |
8067 | 47 | { |
8068 | 47 | TRACE("Acquired catalog manager lock"); |
8069 | 47 | LockGuard lock(mutex_); |
8070 | | |
8071 | | // Verify that the type does not exist. |
8072 | 47 | tp = FindPtrOrNull(udtype_names_map_, std::make_pair(ns->id(), req->name())); |
8073 | | |
8074 | 47 | if (tp != nullptr) { |
8075 | 1 | s = STATUS_SUBSTITUTE(AlreadyPresent, |
8076 | 1 | "Type '$0.$1' already exists", ns->name(), req->name()); |
8077 | 1 | LOG(WARNING) << "Found type: " << tp->id() << ". Failed creating type with error: " |
8078 | 1 | << s.ToString() << " Request:\n" << req->DebugString(); |
8079 | 1 | return SetupError(resp->mutable_error(), MasterErrorPB::TYPE_ALREADY_PRESENT, s); |
8080 | 1 | } |
8081 | | |
8082 | | // Verify that all referenced types actually exist. |
8083 | 46 | for (const auto& udt_id : referenced_udts) { |
8084 | 11 | if (FindPtrOrNull(udtype_ids_map_, udt_id) == nullptr) { |
8085 | | // This may be caused by a stale cache (e.g. referenced type name resolves to an old, |
8086 | | // deleted type). Return InvalidArgument so query layer will clear cache and retry. |
8087 | 0 | s = STATUS_SUBSTITUTE(InvalidArgument, |
8088 | 0 | "Type id '$0' referenced by type '$1' does not exist", udt_id, req->name()); |
8089 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s); |
8090 | 0 | } |
8091 | 11 | } |
8092 | | |
8093 | | // Construct the new type (generate fresh name and set fields). |
8094 | 46 | UDTypeId new_id = GenerateIdUnlocked(SysRowEntryType::UDTYPE); |
8095 | 46 | tp = new UDTypeInfo(new_id); |
8096 | 46 | tp->mutable_metadata()->StartMutation(); |
8097 | 46 | SysUDTypeEntryPB *metadata = &tp->mutable_metadata()->mutable_dirty()->pb; |
8098 | 46 | metadata->set_name(req->name()); |
8099 | 46 | metadata->set_namespace_id(ns->id()); |
8100 | 88 | for (const string& field_name : req->field_names()) { |
8101 | 88 | metadata->add_field_names(field_name); |
8102 | 88 | } |
8103 | | |
8104 | 88 | for (const QLTypePB& field_type : req->field_types()) { |
8105 | 88 | metadata->add_field_types()->CopyFrom(field_type); |
8106 | 88 | } |
8107 | | |
8108 | | // Add the type to the in-memory maps. |
8109 | 46 | udtype_ids_map_[tp->id()] = tp; |
8110 | 46 | udtype_names_map_[std::make_pair(ns->id(), req->name())] = tp; |
8111 | 46 | resp->set_id(tp->id()); |
8112 | 46 | } |
8113 | 46 | TRACE("Inserted new user-defined type info into CatalogManager maps"); |
8114 | | |
8115 | | // Update the on-disk system catalog. |
8116 | 46 | s = sys_catalog_->Upsert(leader_ready_term(), tp); |
8117 | 46 | if (!s.ok()) { |
8118 | 0 | s = s.CloneAndPrepend(Substitute( |
8119 | 0 | "An error occurred while inserting user-defined type to sys-catalog: $0", s.ToString())); |
8120 | 0 | LOG(WARNING) << s.ToString(); |
8121 | 0 | return CheckIfNoLongerLeaderAndSetupError(s, resp); |
8122 | 0 | } |
8123 | 46 | TRACE("Wrote user-defined type to sys-catalog"); |
8124 | | |
8125 | | // Commit the in-memory state. |
8126 | 46 | tp->mutable_metadata()->CommitMutation(); |
8127 | 46 | LOG(INFO) << "Created user-defined type " << tp->ToString(); |
8128 | 46 | return Status::OK(); |
8129 | 46 | } |
8130 | | |
8131 | | Status CatalogManager::DeleteUDType(const DeleteUDTypeRequestPB* req, |
8132 | | DeleteUDTypeResponsePB* resp, |
8133 | 54 | rpc::RpcContext* rpc) { |
8134 | 54 | LOG(INFO) << "Servicing DeleteUDType request from " << RequestorString(rpc) |
8135 | 54 | << ": " << req->ShortDebugString(); |
8136 | | |
8137 | 54 | scoped_refptr<UDTypeInfo> tp; |
8138 | 54 | scoped_refptr<NamespaceInfo> ns; |
8139 | | |
8140 | 54 | if (!req->has_type()) { |
8141 | 0 | Status s = STATUS(InvalidArgument, "No type given", req->DebugString()); |
8142 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s); |
8143 | 0 | } |
8144 | | |
8145 | | // Validate namespace. |
8146 | 54 | if (req->type().has_namespace_()) { |
8147 | | // Lookup the namespace and verify if it exists. |
8148 | 54 | TRACE("Looking up namespace"); |
8149 | 54 | ns = VERIFY_NAMESPACE_FOUND(FindNamespace(req->type().namespace_()), resp); |
8150 | 54 | } |
8151 | | |
8152 | 54 | { |
8153 | 54 | LockGuard lock(mutex_); |
8154 | 54 | TRACE("Acquired catalog manager lock"); |
8155 | | |
8156 | 54 | if (req->type().has_type_id()) { |
8157 | 0 | tp = FindPtrOrNull(udtype_ids_map_, req->type().type_id()); |
8158 | 54 | } else if (req->type().has_type_name()) { |
8159 | 54 | tp = FindPtrOrNull(udtype_names_map_, {ns->id(), req->type().type_name()}); |
8160 | 54 | } |
8161 | | |
8162 | 54 | if (tp == nullptr) { |
8163 | 2 | Status s = STATUS(NotFound, "The type does not exist", req->DebugString()); |
8164 | 2 | return SetupError(resp->mutable_error(), MasterErrorPB::TYPE_NOT_FOUND, s); |
8165 | 2 | } |
8166 | | |
8167 | | // Checking if any table uses this type. |
8168 | | // TODO: this could be more efficient. |
8169 | 982 | for (const TableInfoMap::value_type& entry : *table_ids_map_)52 { |
8170 | 982 | auto ltm = entry.second->LockForRead(); |
8171 | 982 | if (!ltm->started_deleting()) { |
8172 | 7.08k | for (const auto &col : ltm->schema().columns()) { |
8173 | 7.08k | if (col.type().main() == DataType::USER_DEFINED_TYPE && |
8174 | 7.08k | col.type().udtype_info().id() == tp->id()8 ) { |
8175 | 2 | Status s = STATUS(QLError, |
8176 | 2 | Substitute("Cannot delete type '$0.$1'. It is used in column $2 of table $3", |
8177 | 2 | ns->name(), tp->name(), col.name(), ltm->name())); |
8178 | 2 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s); |
8179 | 2 | } |
8180 | 7.08k | } |
8181 | 869 | } |
8182 | 982 | } |
8183 | | |
8184 | | // Checking if any other type uses this type (i.e. in the case of nested types). |
8185 | | // TODO: this could be more efficient. |
8186 | 73 | for (const UDTypeInfoMap::value_type& entry : udtype_ids_map_)50 { |
8187 | 73 | auto ltm = entry.second->LockForRead(); |
8188 | | |
8189 | 203 | for (int i = 0; i < ltm->field_types_size(); i++130 ) { |
8190 | | // Only need to check direct (non-transitive) type dependencies here. |
8191 | | // This also means we report more precise errors for in-use types. |
8192 | 134 | if (QLType::DoesUserDefinedTypeIdExist(ltm->field_types(i), |
8193 | 134 | false /* transitive */, |
8194 | 134 | tp->id())) { |
8195 | 4 | Status s = STATUS(QLError, |
8196 | 4 | Substitute("Cannot delete type '$0.$1'. It is used in field $2 of type '$3'", |
8197 | 4 | ns->name(), tp->name(), ltm->field_names(i), ltm->name())); |
8198 | 4 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s); |
8199 | 4 | } |
8200 | 134 | } |
8201 | 73 | } |
8202 | 50 | } |
8203 | | |
8204 | 46 | auto l = tp->LockForWrite(); |
8205 | | |
8206 | 46 | Status s = sys_catalog_->Delete(leader_ready_term(), tp); |
8207 | 46 | if (!s.ok()) { |
8208 | | // The mutation will be aborted when 'l' exits the scope on early return. |
8209 | 0 | s = s.CloneAndPrepend(Substitute("An error occurred while updating sys-catalog: $0", |
8210 | 0 | s.ToString())); |
8211 | 0 | LOG(WARNING) << s.ToString(); |
8212 | 0 | return CheckIfNoLongerLeaderAndSetupError(s, resp); |
8213 | 0 | } |
8214 | | |
8215 | | // Remove it from the maps. |
8216 | 46 | { |
8217 | 46 | TRACE("Removing from maps"); |
8218 | 46 | LockGuard lock(mutex_); |
8219 | 46 | if (udtype_ids_map_.erase(tp->id()) < 1) { |
8220 | 0 | PANIC_RPC(rpc, "Could not remove user defined type from map, name=" + l->name()); |
8221 | 0 | } |
8222 | 46 | if (udtype_names_map_.erase({ns->id(), tp->name()}) < 1) { |
8223 | 0 | PANIC_RPC(rpc, "Could not remove user defined type from map, name=" + l->name()); |
8224 | 0 | } |
8225 | 46 | } |
8226 | | |
8227 | | // Update the in-memory state. |
8228 | 46 | TRACE("Committing in-memory state"); |
8229 | 46 | l.Commit(); |
8230 | | |
8231 | 46 | LOG(INFO) << "Successfully deleted user-defined type " << tp->ToString() |
8232 | 46 | << " per request from " << RequestorString(rpc); |
8233 | | |
8234 | 46 | return Status::OK(); |
8235 | 46 | } |
8236 | | |
8237 | | Status CatalogManager::GetUDTypeInfo(const GetUDTypeInfoRequestPB* req, |
8238 | | GetUDTypeInfoResponsePB* resp, |
8239 | 56 | rpc::RpcContext* rpc) { |
8240 | 56 | LOG(INFO) << "GetUDTypeInfo from " << RequestorString(rpc) |
8241 | 56 | << ": " << req->DebugString(); |
8242 | 56 | Status s; |
8243 | 56 | scoped_refptr<UDTypeInfo> tp; |
8244 | 56 | scoped_refptr<NamespaceInfo> ns; |
8245 | | |
8246 | 56 | if (!req->has_type()) { |
8247 | 0 | s = STATUS(InvalidArgument, "Cannot get type, no type identifier given", req->DebugString()); |
8248 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::TYPE_NOT_FOUND, s); |
8249 | 0 | } |
8250 | | |
8251 | 56 | if (req->type().has_type_id()) { |
8252 | 0 | tp = FindPtrOrNull(udtype_ids_map_, req->type().type_id()); |
8253 | 56 | } else if (req->type().has_type_name() && req->type().has_namespace_()) { |
8254 | | // Lookup the type and verify if it exists. |
8255 | 56 | TRACE("Looking up namespace"); |
8256 | 56 | ns = VERIFY_NAMESPACE_FOUND(FindNamespace(req->type().namespace_()), resp); |
8257 | | |
8258 | 56 | tp = FindPtrOrNull(udtype_names_map_, std::make_pair(ns->id(), req->type().type_name())); |
8259 | 56 | } |
8260 | | |
8261 | 56 | if (tp == nullptr) { |
8262 | 7 | s = STATUS(InvalidArgument, "Couldn't find type", req->DebugString()); |
8263 | 7 | return SetupError(resp->mutable_error(), MasterErrorPB::TYPE_NOT_FOUND, s); |
8264 | 7 | } |
8265 | | |
8266 | 49 | { |
8267 | 49 | auto type_lock = tp->LockForRead(); |
8268 | | |
8269 | 49 | UDTypeInfoPB* type_info = resp->mutable_udtype(); |
8270 | | |
8271 | 49 | type_info->set_name(tp->name()); |
8272 | 49 | type_info->set_id(tp->id()); |
8273 | 49 | type_info->mutable_namespace_()->set_id(type_lock->namespace_id()); |
8274 | | |
8275 | 144 | for (int i = 0; i < type_lock->field_names_size(); i++95 ) { |
8276 | 95 | type_info->add_field_names(type_lock->field_names(i)); |
8277 | 95 | } |
8278 | 144 | for (int i = 0; i < type_lock->field_types_size(); i++95 ) { |
8279 | 95 | type_info->add_field_types()->CopyFrom(type_lock->field_types(i)); |
8280 | 95 | } |
8281 | | |
8282 | 49 | LOG(INFO) << "Retrieved user-defined type " << tp->ToString(); |
8283 | 49 | } |
8284 | 49 | return Status::OK(); |
8285 | 56 | } |
8286 | | |
8287 | | Status CatalogManager::ListUDTypes(const ListUDTypesRequestPB* req, |
8288 | 0 | ListUDTypesResponsePB* resp) { |
8289 | 0 | SharedLock lock(mutex_); |
8290 | | |
8291 | | // Lookup the namespace and verify that it exists. |
8292 | 0 | auto ns = VERIFY_NAMESPACE_FOUND(FindNamespaceUnlocked(req->namespace_()), resp); |
8293 | |
|
8294 | 0 | for (const UDTypeInfoByNameMap::value_type& entry : udtype_names_map_) { |
8295 | 0 | auto ltm = entry.second->LockForRead(); |
8296 | | |
8297 | | // key is a pair <namespace_id, type_name>. |
8298 | 0 | if (!ns->id().empty() && ns->id() != entry.first.first) { |
8299 | 0 | continue; // Skip types from other namespaces. |
8300 | 0 | } |
8301 | | |
8302 | 0 | UDTypeInfoPB* udtype = resp->add_udtypes(); |
8303 | 0 | udtype->set_id(entry.second->id()); |
8304 | 0 | udtype->set_name(ltm->name()); |
8305 | 0 | for (int i = 0; i <= ltm->field_names_size(); i++) { |
8306 | 0 | udtype->add_field_names(ltm->field_names(i)); |
8307 | 0 | } |
8308 | 0 | for (int i = 0; i <= ltm->field_types_size(); i++) { |
8309 | 0 | udtype->add_field_types()->CopyFrom(ltm->field_types(i)); |
8310 | 0 | } |
8311 | |
|
8312 | 0 | if (CHECK_NOTNULL(ns.get())) { |
8313 | 0 | auto l = ns->LockForRead(); |
8314 | 0 | udtype->mutable_namespace_()->set_id(ns->id()); |
8315 | 0 | udtype->mutable_namespace_()->set_name(ns->name()); |
8316 | 0 | } |
8317 | 0 | } |
8318 | 0 | return Status::OK(); |
8319 | 0 | } |
8320 | | |
8321 | | Status CatalogManager::DisableTabletSplitting( |
8322 | | const DisableTabletSplittingRequestPB* req, DisableTabletSplittingResponsePB* resp, |
8323 | 0 | rpc::RpcContext* rpc) { |
8324 | 0 | const MonoDelta disable_duration = MonoDelta::FromMilliseconds(req->disable_duration_ms()); |
8325 | 0 | tablet_split_manager_.DisableSplittingFor(disable_duration); |
8326 | 0 | return Status::OK(); |
8327 | 0 | } |
8328 | | |
8329 | | Status CatalogManager::IsTabletSplittingComplete( |
8330 | | const IsTabletSplittingCompleteRequestPB* req, IsTabletSplittingCompleteResponsePB* resp, |
8331 | 0 | rpc::RpcContext* rpc) { |
8332 | 0 | TableInfoMap table_info_map; |
8333 | 0 | { |
8334 | 0 | SharedLock lock(mutex_); |
8335 | 0 | table_info_map = *table_ids_map_; |
8336 | 0 | } |
8337 | 0 | resp->set_is_tablet_splitting_complete( |
8338 | 0 | tablet_split_manager_.IsTabletSplittingComplete(table_info_map)); |
8339 | 0 | return Status::OK(); |
8340 | 0 | } |
8341 | | |
8342 | | // For non-enterprise builds, this is a no-op. |
8343 | 0 | Status CatalogManager::DeleteCDCStreamsForTable(const TableId& table) { |
8344 | 0 | return Status::OK(); |
8345 | 0 | } |
8346 | | |
8347 | 0 | Status CatalogManager::DeleteCDCStreamsForTables(const vector<TableId>& table_ids) { |
8348 | 0 | return Status::OK(); |
8349 | 0 | } |
8350 | | |
8351 | | |
8352 | 0 | bool CatalogManager::CDCStreamExistsUnlocked(const CDCStreamId& stream_id) { |
8353 | 0 | return false; |
8354 | 0 | } |
8355 | | |
8356 | 14 | Result<uint64_t> CatalogManager::IncrementYsqlCatalogVersion() { |
8357 | | |
8358 | 14 | auto l = CHECK_NOTNULL(ysql_catalog_config_.get())->LockForWrite(); |
8359 | 14 | uint64_t new_version = l->pb.ysql_catalog_config().version() + 1; |
8360 | 14 | l.mutable_data()->pb.mutable_ysql_catalog_config()->set_version(new_version); |
8361 | | |
8362 | | // Write to sys_catalog and in memory. |
8363 | 14 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), ysql_catalog_config_)); |
8364 | 14 | l.Commit(); |
8365 | | |
8366 | 14 | if (FLAGS_log_ysql_catalog_versions) { |
8367 | 0 | LOG_WITH_FUNC(WARNING) << "set catalog version: " << new_version |
8368 | 0 | << " (using old protobuf method)"; |
8369 | 0 | } |
8370 | | |
8371 | 14 | return new_version; |
8372 | 14 | } |
8373 | | |
8374 | 747 | Status CatalogManager::InitDbFinished(Status initdb_status, int64_t term) { |
8375 | 747 | if (initdb_status.ok()) { |
8376 | 747 | LOG(INFO) << "initdb completed successfully"; |
8377 | 747 | } else { |
8378 | 0 | LOG(ERROR) << "initdb failed: " << initdb_status; |
8379 | 0 | } |
8380 | | |
8381 | 747 | auto l = CHECK_NOTNULL(ysql_catalog_config_.get())->LockForWrite(); |
8382 | 747 | auto* mutable_ysql_catalog_config = l.mutable_data()->pb.mutable_ysql_catalog_config(); |
8383 | 747 | mutable_ysql_catalog_config->set_initdb_done(true); |
8384 | 747 | if (!initdb_status.ok()) { |
8385 | 0 | mutable_ysql_catalog_config->set_initdb_error(initdb_status.ToString()); |
8386 | 747 | } else { |
8387 | 747 | mutable_ysql_catalog_config->clear_initdb_error(); |
8388 | 747 | } |
8389 | | |
8390 | 747 | RETURN_NOT_OK(sys_catalog_->Upsert(term, ysql_catalog_config_)); |
8391 | 747 | l.Commit(); |
8392 | 747 | return Status::OK(); |
8393 | 747 | } |
8394 | | |
8395 | | CHECKED_STATUS CatalogManager::IsInitDbDone( |
8396 | | const IsInitDbDoneRequestPB* req, |
8397 | 2.21k | IsInitDbDoneResponsePB* resp) { |
8398 | 2.21k | auto l = CHECK_NOTNULL(ysql_catalog_config_.get())->LockForRead(); |
8399 | 2.21k | const auto& ysql_catalog_config = l->pb.ysql_catalog_config(); |
8400 | 2.21k | resp->set_pg_proc_exists(pg_proc_exists_.load(std::memory_order_acquire)); |
8401 | 2.21k | resp->set_done(ysql_catalog_config.initdb_done()); |
8402 | 2.21k | if (ysql_catalog_config.has_initdb_error() && |
8403 | 2.21k | !ysql_catalog_config.initdb_error().empty()0 ) { |
8404 | 0 | resp->set_initdb_error(ysql_catalog_config.initdb_error()); |
8405 | 0 | } |
8406 | 2.21k | return Status::OK(); |
8407 | 2.21k | } |
8408 | | |
8409 | | Status CatalogManager::GetYsqlCatalogVersion(uint64_t* catalog_version, |
8410 | 4.81M | uint64_t* last_breaking_version) { |
8411 | 4.81M | auto table_info = GetTableInfo(kPgYbCatalogVersionTableId); |
8412 | 4.81M | if (table_info != nullptr) { |
8413 | 349k | RETURN_NOT_OK(sys_catalog_->ReadYsqlCatalogVersion(kPgYbCatalogVersionTableId, |
8414 | 349k | catalog_version, |
8415 | 349k | last_breaking_version)); |
8416 | | // If the version is properly initialized, we're done. |
8417 | 349k | if ((!catalog_version || *catalog_version > 0341k ) && |
8418 | 349k | (346k !last_breaking_version346k || *last_breaking_version > 0346k )) { |
8419 | 346k | return Status::OK(); |
8420 | 346k | } |
8421 | | // However, it's possible for a table to have no entries mid-migration or if migration fails. |
8422 | | // In this case we'd like to fall back to the legacy approach. |
8423 | 349k | } |
8424 | | |
8425 | 4.46M | auto l = ysql_catalog_config_->LockForRead(); |
8426 | | // last_breaking_version is the last version (change) that invalidated ongoing transactions. |
8427 | | // If using the old (protobuf-based) version method, we do not have any information about |
8428 | | // breaking changes so assuming every change is a breaking change. |
8429 | 4.46M | if (catalog_version) { |
8430 | 4.46M | *catalog_version = l->pb.ysql_catalog_config().version(); |
8431 | 4.46M | } |
8432 | 4.46M | if (last_breaking_version) { |
8433 | 4.46M | *last_breaking_version = l->pb.ysql_catalog_config().version(); |
8434 | 4.46M | } |
8435 | 4.46M | return Status::OK(); |
8436 | 4.81M | } |
8437 | | |
8438 | 2.91k | Status CatalogManager::InitializeTransactionTablesConfig(int64_t term) { |
8439 | 2.91k | SysTransactionTablesConfigEntryPB transaction_tables_config; |
8440 | 2.91k | transaction_tables_config.set_version(0); |
8441 | | |
8442 | | // Create in memory objects. |
8443 | 2.91k | transaction_tables_config_ = new SysConfigInfo(kTransactionTablesConfigType); |
8444 | | |
8445 | | // Prepare write. |
8446 | 2.91k | auto l = transaction_tables_config_->LockForWrite(); |
8447 | 2.91k | *l.mutable_data()->pb.mutable_transaction_tables_config() = std::move(transaction_tables_config); |
8448 | | |
8449 | | // Write to sys_catalog and in memory. |
8450 | 2.91k | RETURN_NOT_OK(sys_catalog_->Upsert(term, transaction_tables_config_)); |
8451 | 2.91k | l.Commit(); |
8452 | | |
8453 | 2.91k | return Status::OK(); |
8454 | 2.91k | } |
8455 | | |
8456 | 1.09k | Status CatalogManager::IncrementTransactionTablesVersion() { |
8457 | 1.09k | auto l = CHECK_NOTNULL(transaction_tables_config_.get())->LockForWrite(); |
8458 | 1.09k | uint64_t new_version = l->pb.transaction_tables_config().version() + 1; |
8459 | 1.09k | l.mutable_data()->pb.mutable_transaction_tables_config()->set_version(new_version); |
8460 | | |
8461 | | // Write to sys_catalog and in memory. |
8462 | 1.09k | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), transaction_tables_config_)); |
8463 | 1.09k | l.Commit(); |
8464 | | |
8465 | 1.09k | LOG(INFO) << "Set transaction tables version: " << new_version; |
8466 | | |
8467 | 1.09k | return Status::OK(); |
8468 | 1.09k | } |
8469 | | |
8470 | 4.81M | uint64_t CatalogManager::GetTransactionTablesVersion() { |
8471 | 4.81M | auto l = CHECK_NOTNULL(transaction_tables_config_.get())->LockForRead(); |
8472 | 4.81M | return l->pb.transaction_tables_config().version(); |
8473 | 4.81M | } |
8474 | | |
8475 | 93 | Status CatalogManager::RegisterTsFromRaftConfig(const consensus::RaftPeerPB& peer) { |
8476 | 93 | NodeInstancePB instance_pb; |
8477 | 93 | instance_pb.set_permanent_uuid(peer.permanent_uuid()); |
8478 | 93 | instance_pb.set_instance_seqno(0); |
8479 | | |
8480 | 93 | TSRegistrationPB registration_pb; |
8481 | 93 | auto* common = registration_pb.mutable_common(); |
8482 | 93 | *common->mutable_private_rpc_addresses() = peer.last_known_private_addr(); |
8483 | 93 | *common->mutable_broadcast_addresses() = peer.last_known_broadcast_addr(); |
8484 | 93 | *common->mutable_cloud_info() = peer.cloud_info(); |
8485 | | |
8486 | | // Todo(Rahul) : May need to be changed when we implement table level overrides. |
8487 | 93 | { |
8488 | 93 | auto l = ClusterConfig()->LockForRead(); |
8489 | | // If the config has no replication info, use empty string for the placement uuid, otherwise |
8490 | | // calculate it from the reported peer. |
8491 | 93 | auto placement_uuid = l->pb.has_replication_info() |
8492 | 93 | ? VERIFY_RESULT(CatalogManagerUtil::GetPlacementUuidFromRaftPeer( |
8493 | 93 | l->pb.replication_info(), peer)) |
8494 | 93 | : ""62 ; |
8495 | 0 | common->set_placement_uuid(placement_uuid); |
8496 | 93 | } |
8497 | 0 | return master_->ts_manager()->RegisterTS(instance_pb, registration_pb, master_->MakeCloudInfoPB(), |
8498 | 93 | &master_->proxy_cache(), |
8499 | 93 | RegisteredThroughHeartbeat::kFalse); |
8500 | 93 | } |
8501 | | |
// Rebuilds the in-memory replica location map of `tablet` from the peers listed in the Raft
// config of `consensus_state`, installs the new map on the tablet and bumps
// tablet_locations_version_.
//
// Per peer:
//   - peers without a permanent uuid are skipped;
//   - peers unknown to the TS manager are registered from the Raft config when
//     FLAGS_enable_register_ts_from_raft is set, and skipped otherwise (or when the
//     registration fails);
//   - an existing replica entry is reused as-is when the peer is not the sender of the report
//     and the existing replica is starting and not stale;
//   - otherwise a new replica entry is created from the report, inheriting the drive info of
//     the existing entry when there is one.
//
// Parameters:
//   tablet          - tablet whose replica locations are replaced.
//   sender_uuid     - uuid compared against each peer to decide whether an existing entry may
//                     be reused.
//   consensus_state - consensus state whose config supplies the peer list.
//   report          - reported tablet state used to build new replica entries.
8502 | | void CatalogManager::ReconcileTabletReplicasInLocalMemoryWithReport( |
8503 | | const scoped_refptr<TabletInfo>& tablet, |
8504 | | const std::string& sender_uuid, |
8505 | | const ConsensusStatePB& consensus_state, |
8506 | 113k | const ReportedTabletPB& report) { |
8507 | 113k | auto replica_locations = std::make_shared<TabletReplicaMap>(); |
8508 | 113k | auto prev_rl = tablet->GetReplicaLocations(); |
8509 | | |
8510 | 336k | for (const consensus::RaftPeerPB& peer : consensus_state.config().peers()) { |
8511 | 336k | shared_ptr<TSDescriptor> ts_desc; |
8512 | 336k | if (!peer.has_permanent_uuid()) { |
8513 | 0 | LOG_WITH_PREFIX(WARNING) << "Missing UUID for peer" << peer.ShortDebugString(); |
8514 | 0 | continue; |
8515 | 0 | } |
8516 | 336k | if (!master_->ts_manager()->LookupTSByUUID(peer.permanent_uuid(), &ts_desc)) { |
8517 | 93 | if (!GetAtomicFlag(&FLAGS_enable_register_ts_from_raft)) { |
8518 | 0 | LOG_WITH_PREFIX(WARNING) << "Tablet server has never reported in. " |
8519 | 0 | << "Not including in replica locations map yet. Peer: " << peer.ShortDebugString() |
8520 | 0 | << "; Tablet: " << tablet->ToString(); |
8521 | 0 | continue; |
8522 | 0 | } |
8523 | | |
8524 | 93 | LOG_WITH_PREFIX(INFO) << "Tablet server has never reported in. Registering the ts using " |
8525 | 93 | << "the raft config. Peer: " << peer.ShortDebugString() |
8526 | 93 | << "; Tablet: " << tablet->ToString(); |
8527 | 93 | Status s = RegisterTsFromRaftConfig(peer); |
8528 | 93 | if (!s.ok()) { |
8529 | 9 | LOG_WITH_PREFIX(WARNING) << "Could not register ts from raft config: " << s |
8530 | 9 | << " Skip updating the replica map."; |
8531 | 9 | continue; |
8532 | 9 | } |
8533 | | |
8534 | | // Guaranteed to find the ts since we just registered. |
// The lookup result is not checked; the null test on ts_desc below covers a miss.
8535 | 84 | master_->ts_manager()->LookupTSByUUID(peer.permanent_uuid(), &ts_desc); |
8536 | 84 | if (!ts_desc.get()) { |
8537 | 0 | LOG_WITH_PREFIX(WARNING) << "Could not find ts with uuid " << peer.permanent_uuid() |
8538 | 0 | << " after registering from raft config. Skip updating the replica" |
8539 | 0 | << " map."; |
8540 | 0 | continue; |
8541 | 0 | } |
8542 | 84 | } |
8543 | | |
8544 | | // Do not update replicas in the NOT_STARTED or BOOTSTRAPPING state (unless they are stale). |
8545 | 336k | bool use_existing = false; |
8546 | 336k | const TabletReplica* existing_replica = nullptr; |
8547 | 336k | auto it = prev_rl->find(ts_desc->permanent_uuid()); |
8548 | 336k | if (it != prev_rl->end()) { |
8549 | 192k | existing_replica = &it->second; |
8550 | 192k | } |
// The entry of the report's sender is always rebuilt from the report, never reused.
8551 | 336k | if (existing_replica && peer.permanent_uuid() != sender_uuid192k ) { |
8552 | | // IsStarting returns true if state == NOT_STARTED or state == BOOTSTRAPPING. |
8553 | 127k | use_existing = existing_replica->IsStarting() && !existing_replica->IsStale()713 ; |
8554 | 127k | } |
8555 | 336k | if (use_existing) { |
8556 | 713 | InsertOrDie(replica_locations.get(), existing_replica->ts_desc->permanent_uuid(), |
8557 | 713 | *existing_replica); |
8558 | 336k | } else { |
8559 | 336k | TabletReplica replica; |
8560 | 336k | CreateNewReplicaForLocalMemory(ts_desc.get(), &consensus_state, report, &replica); |
8561 | 336k | auto result = replica_locations.get()->insert({replica.ts_desc->permanent_uuid(), replica}); |
8562 | 336k | LOG_IF(FATAL, !result.second) << "duplicate uuid: " << replica.ts_desc->permanent_uuid()6 ; |
// Carry the previously known drive info over to the freshly created entry.
8563 | 336k | if (existing_replica) { |
8564 | 191k | result.first->second.UpdateDriveInfo(existing_replica->drive_info); |
8565 | 191k | } |
8566 | 336k | } |
8567 | 336k | } |
8568 | | |
8569 | | // Update the local tablet replica set. This deviates from persistent state during bootstrapping. |
8570 | 113k | tablet->SetReplicaLocations(replica_locations); |
8571 | 113k | tablet_locations_version_.fetch_add(1, std::memory_order_acq_rel); |
8572 | 113k | } |
8573 | | |
8574 | | void CatalogManager::UpdateTabletReplicaInLocalMemory(TSDescriptor* ts_desc, |
8575 | | const ConsensusStatePB* consensus_state, |
8576 | | const ReportedTabletPB& report, |
8577 | 327k | const scoped_refptr<TabletInfo>& tablet) { |
8578 | 327k | TabletReplica replica; |
8579 | 327k | CreateNewReplicaForLocalMemory(ts_desc, consensus_state, report, &replica); |
8580 | 327k | tablet->UpdateReplicaLocations(replica); |
8581 | 327k | tablet_locations_version_.fetch_add(1, std::memory_order_acq_rel); |
8582 | 327k | } |
8583 | | |
8584 | | void CatalogManager::CreateNewReplicaForLocalMemory(TSDescriptor* ts_desc, |
8585 | | const ConsensusStatePB* consensus_state, |
8586 | | const ReportedTabletPB& report, |
8587 | 664k | TabletReplica* new_replica) { |
8588 | | // Tablets in state NOT_STARTED or BOOTSTRAPPING don't have a consensus. |
8589 | 664k | if (consensus_state == nullptr) { |
8590 | 3.41k | new_replica->role = PeerRole::NON_PARTICIPANT; |
8591 | 3.41k | new_replica->member_type = PeerMemberType::UNKNOWN_MEMBER_TYPE; |
8592 | 660k | } else { |
8593 | 660k | CHECK(consensus_state != nullptr) << "No cstate: " << ts_desc->permanent_uuid() |
8594 | 2 | << " - " << report.state(); |
8595 | 660k | new_replica->role = GetConsensusRole(ts_desc->permanent_uuid(), *consensus_state); |
8596 | 660k | new_replica->member_type = GetConsensusMemberType(ts_desc->permanent_uuid(), *consensus_state); |
8597 | 660k | } |
8598 | 664k | if (report.has_should_disable_lb_move()) { |
8599 | 660k | new_replica->should_disable_lb_move = report.should_disable_lb_move(); |
8600 | 660k | } |
8601 | 664k | if (report.has_fs_data_dir()664k ) { |
8602 | 664k | new_replica->fs_data_dir = report.fs_data_dir(); |
8603 | 664k | } |
8604 | 664k | new_replica->state = report.state(); |
8605 | 664k | new_replica->ts_desc = ts_desc; |
8606 | 664k | if (!ts_desc->registered_through_heartbeat()) { |
8607 | 5.05k | new_replica->time_updated = MonoTime::Now() - ts_desc->TimeSinceHeartbeat(); |
8608 | 5.05k | } |
8609 | 664k | } |
8610 | | |
8611 | | Status CatalogManager::GetTabletPeer(const TabletId& tablet_id, |
8612 | 2.79M | std::shared_ptr<TabletPeer>* ret_tablet_peer) const { |
8613 | | // Note: CatalogManager has only one table, 'sys_catalog', with only |
8614 | | // one tablet. |
8615 | | |
8616 | 2.79M | if (PREDICT_FALSE(!IsInitialized())) { |
8617 | | // Master puts up the consensus service first and then initiates catalog manager's creation |
8618 | | // asynchronously. So this case is possible, but harmless. The RPC will simply be retried. |
8619 | | // Previously, because we weren't checking for this condition, we would fatal down stream. |
8620 | 112 | const string& reason = "CatalogManager is not yet initialized"; |
8621 | 112 | YB_LOG_EVERY_N(WARNING, 1000) << reason13 ; |
8622 | 112 | return STATUS(ServiceUnavailable, reason); |
8623 | 112 | } |
8624 | | |
8625 | 18.4E | CHECK(sys_catalog_) << "sys_catalog_ must be initialized!"; |
8626 | | |
8627 | 2.79M | if (master_->opts().IsShellMode()) { |
8628 | 181 | return STATUS_SUBSTITUTE(NotFound, |
8629 | 181 | "In shell mode: no tablet_id $0 exists in CatalogManager.", tablet_id); |
8630 | 181 | } |
8631 | | |
8632 | 2.79M | if (2.79M sys_catalog_->tablet_id() == tablet_id2.79M && sys_catalog_->tablet_peer().get() != nullptr && |
8633 | 2.79M | sys_catalog_->tablet_peer()->CheckRunning().ok()) { |
8634 | 2.79M | *ret_tablet_peer = tablet_peer(); |
8635 | 18.4E | } else { |
8636 | 18.4E | return STATUS_SUBSTITUTE(NotFound, |
8637 | 18.4E | "no SysTable in the RUNNING state exists with tablet_id $0 in CatalogManager", tablet_id); |
8638 | 18.4E | } |
8639 | 2.79M | return Status::OK(); |
8640 | 2.79M | } |
8641 | | |
8642 | 2.84M | const NodeInstancePB& CatalogManager::NodeInstance() const { |
8643 | 2.84M | return master_->instance_pb(); |
8644 | 2.84M | } |
8645 | | |
8646 | 28.9k | Status CatalogManager::GetRegistration(ServerRegistrationPB* reg) const { |
8647 | 28.9k | return master_->GetRegistration(reg, server::RpcOnly::kTrue); |
8648 | 28.9k | } |
8649 | | |
8650 | 57 | Status CatalogManager::UpdateMastersListInMemoryAndDisk() { |
8651 | 57 | DCHECK(master_->opts().IsShellMode()); |
8652 | | |
8653 | 57 | if (!master_->opts().IsShellMode()) { |
8654 | 0 | return STATUS(IllegalState, "Cannot update master's info when process is not in shell mode."); |
8655 | 0 | } |
8656 | | |
8657 | 57 | consensus::ConsensusStatePB consensus_state; |
8658 | 57 | RETURN_NOT_OK(GetCurrentConfig(&consensus_state)); |
8659 | | |
8660 | 57 | if (!consensus_state.has_config()) { |
8661 | 0 | return STATUS(NotFound, "No Raft config found."); |
8662 | 0 | } |
8663 | | |
8664 | 57 | RETURN_NOT_OK(sys_catalog_->ConvertConfigToMasterAddresses(consensus_state.config())); |
8665 | 57 | RETURN_NOT_OK(sys_catalog_->CreateAndFlushConsensusMeta(master_->fs_manager(), |
8666 | 57 | consensus_state.config(), |
8667 | 57 | consensus_state.current_term())); |
8668 | | |
8669 | 57 | return Status::OK(); |
8670 | 57 | } |
8671 | | |
8672 | 7.94k | Status CatalogManager::EnableBgTasks() { |
8673 | 7.94k | LockGuard lock(mutex_); |
8674 | | // Initialize refresh_ysql_tablespace_info_task_. This will be used to |
8675 | | // manage the background task that refreshes tablespace info. This task |
8676 | | // will be started by the CatalogManagerBgTasks below. |
8677 | 7.94k | refresh_ysql_tablespace_info_task_.Bind(&master_->messenger()->scheduler()); |
8678 | | |
8679 | 7.94k | background_tasks_.reset(new CatalogManagerBgTasks(this)); |
8680 | 7.94k | RETURN_NOT_OK_PREPEND(background_tasks_->Init(), |
8681 | 7.94k | "Failed to initialize catalog manager background tasks"); |
8682 | | |
8683 | | // Add bg thread to rebuild yql system partitions. |
8684 | 7.94k | refresh_yql_partitions_task_.Bind(&master_->messenger()->scheduler()); |
8685 | | |
8686 | 7.94k | RETURN_NOT_OK(background_tasks_thread_pool_->SubmitFunc( |
8687 | 7.94k | [this]() { RebuildYQLSystemPartitions(); })); |
8688 | | |
8689 | 7.94k | return Status::OK(); |
8690 | 7.94k | } |
8691 | | |
// Remote-bootstraps this master's sys-catalog tablet from the peer described in `req`, then
// takes the master out of shell mode and enables background tasks.
//
// Steps, in order:
//   1. Try to take remote_bootstrap_mtx_ without blocking.
//   2. Require shell mode.
//   3. If a tablet already exists with a peer, shut it down / tombstone it and mark it for
//      replacement.
//   4. Start the remote bootstrap client and set up the tablet peer.
//   5. Fetch all remote files and call Finish(); a failure in either tombstones the tablet.
//   6. Open the tablet locally, update the master list in memory and on disk, leave shell
//      mode.
//   7. Verify the role change (a failure is only logged) and enable background tasks.
//
// Returns:
//   AlreadyPresent - another remote bootstrap holds remote_bootstrap_mtx_.
//   IllegalState   - the master is not in shell mode.
//   otherwise the first error from the steps above, or Status::OK().
8692 | 163 | Status CatalogManager::StartRemoteBootstrap(const StartRemoteBootstrapRequestPB& req) { |
8693 | 163 | const TabletId& tablet_id = req.tablet_id(); |
// try_to_lock: do not wait behind a bootstrap that is already running.
8694 | 163 | std::unique_lock<std::mutex> l(remote_bootstrap_mtx_, std::try_to_lock); |
8695 | 163 | if (!l.owns_lock()) { |
8696 | 105 | return STATUS_SUBSTITUTE(AlreadyPresent, |
8697 | 105 | "Remote bootstrap of tablet $0 already in progress", tablet_id); |
8698 | 105 | } |
8699 | | |
8700 | 58 | if (!master_->opts().IsShellMode()) { |
8701 | 0 | return STATUS(IllegalState, "Cannot bootstrap a master which is not in shell mode."); |
8702 | 0 | } |
8703 | | |
8704 | 58 | LOG(INFO) << "Starting remote bootstrap: " << req.ShortDebugString(); |
8705 | | |
8706 | 58 | HostPort bootstrap_peer_addr = HostPortFromPB(DesiredHostPort( |
8707 | 58 | req.source_broadcast_addr(), req.source_private_addr(), req.source_cloud_info(), |
8708 | 58 | master_->MakeCloudInfoPB())); |
8709 | | |
8710 | 58 | const string& bootstrap_peer_uuid = req.bootstrap_peer_uuid(); |
8711 | 58 | int64_t leader_term = req.caller_term(); |
8712 | | |
8713 | 58 | std::shared_ptr<TabletPeer> old_tablet_peer; |
8714 | 58 | RaftGroupMetadataPtr meta; |
8715 | 58 | bool replacing_tablet = false; |
8716 | | |
8717 | 58 | if (tablet_exists_) { |
8718 | 0 | old_tablet_peer = tablet_peer(); |
8719 | | // Nothing to recover if the remote bootstrap client start failed the last time. |
8720 | 0 | if (old_tablet_peer) { |
8721 | 0 | meta = old_tablet_peer->tablet_metadata(); |
8722 | 0 | replacing_tablet = true; |
8723 | 0 | } |
8724 | 0 | } |
8725 | | |
8726 | 58 | if (replacing_tablet) { |
8727 | | // Make sure the existing tablet peer is shut down and tombstoned. |
8728 | 0 | RETURN_NOT_OK(tserver::HandleReplacingStaleTablet(meta, |
8729 | 0 | old_tablet_peer, |
8730 | 0 | tablet_id, |
8731 | 0 | master_->fs_manager()->uuid(), |
8732 | 0 | leader_term)); |
8733 | 0 | } |
8734 | | |
8735 | 58 | LOG_WITH_PREFIX(INFO) << " Initiating remote bootstrap from peer " << bootstrap_peer_uuid |
8736 | 58 | << " (" << bootstrap_peer_addr.ToString() << ")."; |
8737 | | |
8738 | 58 | auto rb_client = std::make_unique<tserver::RemoteBootstrapClient>( |
8739 | 58 | tablet_id, master_->fs_manager()); |
8740 | | |
8741 | | // Download and persist the remote superblock in TABLET_DATA_COPYING state. |
8742 | 58 | if (replacing_tablet) { |
8743 | 0 | RETURN_NOT_OK(rb_client->SetTabletToReplace(meta, leader_term)); |
8744 | 0 | } |
8745 | 58 | RETURN_NOT_OK(rb_client->Start( |
8746 | 58 | bootstrap_peer_uuid, &master_->proxy_cache(), bootstrap_peer_addr, &meta)); |
8747 | | // This SetupTabletPeer is needed by rb_client to perform the remote bootstrap/fetch. |
8748 | | // And the SetupTablet below to perform "local bootstrap" cannot be done until the remote fetch |
8749 | | // has succeeded. So keeping them separate for now. |
8750 | 58 | sys_catalog_->SetupTabletPeer(meta); |
// Test-only hook: sleeps for the configured number of seconds when the flag is non-zero.
8751 | 58 | if (PREDICT_FALSE(FLAGS_TEST_inject_latency_during_remote_bootstrap_secs)) { |
8752 | 1 | LOG(INFO) << "Injecting " << FLAGS_TEST_inject_latency_during_remote_bootstrap_secs |
8753 | 1 | << " seconds of latency for test"; |
8754 | 1 | SleepFor(MonoDelta::FromSeconds(FLAGS_TEST_inject_latency_during_remote_bootstrap_secs)); |
8755 | 1 | } |
8756 | | |
8757 | | // From this point onward, the superblock is persisted in TABLET_DATA_COPYING |
8758 | | // state, and we need to tombstone the tablet if additional steps prior to |
8759 | | // getting to a TABLET_DATA_READY state fail. |
8760 | 58 | tablet_exists_ = true; |
8761 | | |
8762 | | // Download all of the remote files. |
8763 | 58 | TOMBSTONE_NOT_OK(rb_client->FetchAll(tablet_peer()->status_listener()), |
8764 | 58 | meta, |
8765 | 58 | master_->fs_manager()->uuid(), |
8766 | 58 | Substitute("Remote bootstrap: Unable to fetch data from remote peer $0 ($1)", |
8767 | 58 | bootstrap_peer_uuid, bootstrap_peer_addr.ToString()), |
8768 | 58 | nullptr); |
8769 | | |
8770 | | // Write out the last files to make the new replica visible and update the |
8771 | | // TabletDataState in the superblock to TABLET_DATA_READY. |
8772 | | // Finish() will call EndRemoteSession() and wait for the leader to successfully submit a |
8773 | | // ChangeConfig request (to change this master's role from PRE_VOTER or PRE_OBSERVER to VOTER or |
8774 | | // OBSERVER respectively). If the RPC times out, we will ignore the error (since the leader could |
8775 | | // have successfully submitted the ChangeConfig request and failed to respond before in time) |
8776 | | // and check the committed config until we find that this master's role has changed, or until we |
8777 | | // time out which will cause us to tombstone the tablet. |
8778 | 58 | TOMBSTONE_NOT_OK(rb_client->Finish(), |
8779 | 58 | meta, |
8780 | 58 | master_->fs_manager()->uuid(), |
8781 | 58 | "Remote bootstrap: Failed calling Finish()", |
8782 | 58 | nullptr); |
8783 | | |
8784 | | // Synchronous tablet open for "local bootstrap". |
8785 | 58 | RETURN_NOT_OK(tserver::ShutdownAndTombstoneTabletPeerNotOk( |
8786 | 58 | sys_catalog_->OpenTablet(meta), sys_catalog_->tablet_peer(), meta, |
8787 | 58 | master_->fs_manager()->uuid(), "Remote bootstrap: Failed opening sys catalog")); |
8788 | | |
8789 | | // Set up the in-memory master list and also flush the cmeta. |
8790 | 58 | RETURN_NOT_OK(UpdateMastersListInMemoryAndDisk()); |
8791 | | |
8792 | 58 | master_->SetShellMode(false); |
8793 | | |
8794 | | // Call VerifyChangeRoleSucceeded only after we have set shell mode to false. Otherwise, |
8795 | | // CatalogManager::GetTabletPeer will always return an error, and the consensus will never get |
8796 | | // updated. |
8797 | 58 | auto status = rb_client->VerifyChangeRoleSucceeded( |
8798 | 58 | sys_catalog_->tablet_peer()->shared_consensus()); |
8799 | | |
// A failed verification is logged but does not fail the bootstrap.
8800 | 58 | if (!status.ok()) { |
8801 | 0 | LOG_WITH_PREFIX(WARNING) << "Remote bootstrap finished. " |
8802 | 0 | << "Failed calling VerifyChangeRoleSucceeded: " |
8803 | 0 | << status.ToString(); |
8804 | 58 | } else { |
8805 | 58 | LOG_WITH_PREFIX(INFO) << "Remote bootstrap finished successfully"; |
8806 | 58 | } |
8807 | | |
8808 | 58 | LOG(INFO) << "Master completed remote bootstrap and is out of shell mode."; |
8809 | | |
8810 | 58 | RETURN_NOT_OK(EnableBgTasks()); |
8811 | | |
8812 | 58 | return Status::OK(); |
8813 | 58 | } |
8814 | | |
8815 | | CHECKED_STATUS CatalogManager::SendAlterTableRequest(const scoped_refptr<TableInfo>& table, |
8816 | 9.83k | const AlterTableRequestPB* req) { |
8817 | 9.83k | auto tablets = table->GetTablets(); |
8818 | | |
8819 | 9.83k | bool is_ysql_table_with_transaction_metadata = |
8820 | 9.83k | table->GetTableType() == TableType::PGSQL_TABLE_TYPE && |
8821 | 9.83k | req != nullptr7.67k && |
8822 | 9.83k | req->has_transaction()5.70k && |
8823 | 9.83k | req->transaction().has_transaction_id()520 ; |
8824 | | |
8825 | 9.83k | bool alter_table_has_add_or_drop_column_step = false; |
8826 | 9.83k | if (req && (5.89k req->alter_schema_steps_size()5.89k || req->has_alter_properties()5.32k )) { |
8827 | 578 | for (const AlterTableRequestPB::Step& step : req->alter_schema_steps()) { |
8828 | 571 | if (step.type() == AlterTableRequestPB::ADD_COLUMN || |
8829 | 571 | step.type() == AlterTableRequestPB::DROP_COLUMN245 ) { |
8830 | 542 | alter_table_has_add_or_drop_column_step = true; |
8831 | 542 | break; |
8832 | 542 | } |
8833 | 571 | } |
8834 | 578 | } |
8835 | | |
8836 | 9.83k | TransactionId txn_id = TransactionId::Nil(); |
8837 | 9.83k | if (is_ysql_table_with_transaction_metadata && alter_table_has_add_or_drop_column_step520 ) { |
8838 | 390 | { |
8839 | 390 | LOG(INFO) << "Persist transaction metadata into SysTableEntryPB for table ID " << table->id(); |
8840 | 390 | TRACE("Locking table"); |
8841 | 390 | auto l = table->LockForWrite(); |
8842 | 390 | auto& tablet_data = *l.mutable_data(); |
8843 | 390 | auto& table_pb = tablet_data.pb; |
8844 | 390 | table_pb.mutable_transaction()->CopyFrom(req->transaction()); |
8845 | | |
8846 | | // Update sys-catalog with the transaction ID. |
8847 | 390 | TRACE("Updating table metadata on disk"); |
8848 | 390 | RETURN_NOT_OK(master_->catalog_manager_impl()->sys_catalog_->Upsert( |
8849 | 390 | master_->catalog_manager()->leader_ready_term(), table.get())); |
8850 | | |
8851 | | // Update the in-memory state. |
8852 | 390 | TRACE("Committing in-memory state"); |
8853 | 390 | l.Commit(); |
8854 | 390 | } |
8855 | 390 | txn_id = VERIFY_RESULT(FullyDecodeTransactionId(req->transaction().transaction_id())); |
8856 | 390 | } |
8857 | | |
8858 | 27.7k | for (const scoped_refptr<TabletInfo>& tablet : tablets)9.83k { |
8859 | 27.7k | auto call = std::make_shared<AsyncAlterTable>(master_, AsyncTaskPool(), tablet, table, txn_id); |
8860 | 27.7k | tablet->table()->AddTask(call); |
8861 | 27.7k | if (PREDICT_FALSE(FLAGS_TEST_slowdown_alter_table_rpcs_ms > 0)) { |
8862 | 0 | LOG(INFO) << "Sleeping for " << tablet->id() << " " |
8863 | 0 | << FLAGS_TEST_slowdown_alter_table_rpcs_ms |
8864 | 0 | << "ms before sending async alter table request"; |
8865 | 0 | SleepFor(MonoDelta::FromMilliseconds(FLAGS_TEST_slowdown_alter_table_rpcs_ms)); |
8866 | 0 | } |
8867 | 27.7k | RETURN_NOT_OK(ScheduleTask(call)); |
8868 | 27.7k | } |
8869 | 9.83k | return Status::OK(); |
8870 | 9.83k | } |
8871 | | |
8872 | | void CatalogManager::SendCopartitionTabletRequest(const scoped_refptr<TabletInfo>& tablet, |
8873 | 0 | const scoped_refptr<TableInfo>& table) { |
8874 | 0 | auto call = std::make_shared<AsyncCopartitionTable>(master_, AsyncTaskPool(), tablet, table); |
8875 | 0 | table->AddTask(call); |
8876 | 0 | WARN_NOT_OK(ScheduleTask(call), "Failed to send copartition table request"); |
8877 | 0 | } |
8878 | | |
8879 | | Status CatalogManager::SendSplitTabletRequest( |
8880 | | const scoped_refptr<TabletInfo>& tablet, std::array<TabletId, kNumSplitParts> new_tablet_ids, |
8881 | 140 | const std::string& split_encoded_key, const std::string& split_partition_key) { |
8882 | 140 | VLOG(2) << "Scheduling SplitTablet request to leader tserver for source tablet ID: " |
8883 | 0 | << tablet->tablet_id() << ", after-split tablet IDs: " << AsString(new_tablet_ids); |
8884 | 140 | auto call = std::make_shared<AsyncSplitTablet>( |
8885 | 140 | master_, AsyncTaskPool(), tablet, new_tablet_ids, split_encoded_key, split_partition_key, |
8886 | 140 | &tablet_split_manager_); |
8887 | 140 | tablet->table()->AddTask(call); |
8888 | 140 | return ScheduleTask(call); |
8889 | 140 | } |
8890 | | |
8891 | | void CatalogManager::DeleteTabletReplicas( |
8892 | 24.9k | TabletInfo* tablet, const std::string& msg, HideOnly hide_only) { |
8893 | 24.9k | auto locations = tablet->GetReplicaLocations(); |
8894 | 24.9k | LOG(INFO) << "Sending DeleteTablet for " << locations->size() |
8895 | 24.9k | << " replicas of tablet " << tablet->tablet_id(); |
8896 | 73.9k | for (const auto& r : *locations) { |
8897 | 73.9k | SendDeleteTabletRequest(tablet->tablet_id(), TABLET_DATA_DELETED, boost::none, tablet->table(), |
8898 | 73.9k | r.second.ts_desc, msg, hide_only); |
8899 | 73.9k | } |
8900 | 24.9k | } |
8901 | | |
8902 | 13.9k | Status CatalogManager::CheckIfForbiddenToDeleteTabletOf(const scoped_refptr<TableInfo>& table) { |
8903 | | // Do not delete the system catalog tablet. |
8904 | 13.9k | if (IsSystemTable(*table)) { |
8905 | 8.00k | return STATUS(InvalidArgument, "It is not allowed to delete system tables"); |
8906 | 8.00k | } |
8907 | | // Do not delete the tablet of a colocated table. |
8908 | 5.97k | if (table->IsColocatedUserTable()) { |
8909 | 117 | return STATUS(InvalidArgument, "It is not allowed to delete tablets of the colocated tables."); |
8910 | 117 | } |
8911 | 5.85k | return Status::OK(); |
8912 | 5.97k | } |
8913 | | |
8914 | | Status CatalogManager::DeleteTabletsAndSendRequests( |
8915 | 13.9k | const TableInfoPtr& table, const RepeatedBytes& retained_by_snapshot_schedules) { |
8916 | | // Silently fail if tablet deletion is forbidden so table deletion can continue executing. |
8917 | 13.9k | if (!CheckIfForbiddenToDeleteTabletOf(table).ok()) { |
8918 | 8.11k | return Status::OK(); |
8919 | 8.11k | } |
8920 | | |
8921 | 5.85k | auto tablets = table->GetTablets(IncludeInactive::kTrue); |
8922 | | |
8923 | 46.7k | std::sort(tablets.begin(), tablets.end(), [](const auto& lhs, const auto& rhs) { |
8924 | 46.7k | return lhs->tablet_id() < rhs->tablet_id(); |
8925 | 46.7k | }); |
8926 | | |
8927 | 5.85k | string deletion_msg = "Table deleted at " + LocalTimeAsString(); |
8928 | 5.85k | RETURN_NOT_OK(DeleteTabletListAndSendRequests( |
8929 | 5.85k | tablets, deletion_msg, retained_by_snapshot_schedules)); |
8930 | | |
8931 | 5.85k | if (table->IsColocatedParentTable()) { |
8932 | 8 | LockGuard lock(mutex_); |
8933 | 8 | colocated_tablet_ids_map_.erase(table->namespace_id()); |
8934 | 5.84k | } else if (table->IsTablegroupParentTable()) { |
8935 | | // In the case of dropped tablegroup parent table, need to delete tablegroup info. |
8936 | 52 | LockGuard lock(mutex_); |
8937 | 52 | const auto& tablegroup_id = table_tablegroup_ids_map_[table->id()]; |
8938 | 52 | tablegroup_ids_map_.erase(tablegroup_id); |
8939 | 52 | tablegroup_tablet_ids_map_[table->namespace_id()].erase(tablegroup_id); |
8940 | 52 | table_tablegroup_ids_map_.erase(table->id()); |
8941 | 52 | } |
8942 | 5.85k | return Status::OK(); |
8943 | 5.85k | } |
8944 | | |
8945 | | Status CatalogManager::DeleteTabletListAndSendRequests( |
8946 | | const std::vector<scoped_refptr<TabletInfo>>& tablets, const std::string& deletion_msg, |
8947 | 5.85k | const google::protobuf::RepeatedPtrField<std::string>& retained_by_snapshot_schedules) { |
8948 | 5.85k | struct TabletData { |
8949 | 5.85k | TabletInfoPtr tablet; |
8950 | 5.85k | TabletInfo::WriteLock lock; |
8951 | 5.85k | HideOnly hide_only; |
8952 | 5.85k | }; |
8953 | 5.85k | std::vector<TabletData> tablets_data; |
8954 | 5.85k | tablets_data.reserve(tablets.size()); |
8955 | 5.85k | std::vector<TabletInfo*> tablet_infos; |
8956 | 5.85k | tablet_infos.reserve(tablets_data.size()); |
8957 | 5.85k | std::vector<TabletInfoPtr> marked_as_hidden; |
8958 | | |
8959 | | // Grab tablets and tablet write locks. The list should already be in tablet_id sorted order. |
8960 | 5.85k | { |
8961 | 5.85k | SharedLock read_lock(mutex_); |
8962 | 24.9k | for (const auto& tablet : tablets) { |
8963 | 24.9k | tablets_data.push_back(TabletData { |
8964 | 24.9k | .tablet = tablet, |
8965 | 24.9k | .lock = tablet->LockForWrite(), |
8966 | | // Hide tablet if it is retained by snapshot schedule, or is part of a cdc stream. |
8967 | 24.9k | .hide_only = HideOnly(!retained_by_snapshot_schedules.empty()), |
8968 | 24.9k | }); |
8969 | 24.9k | if (!tablets_data.back().hide_only) { |
8970 | | // Also check if this tablet is part of a cdc stream and is not already hidden. If this is |
8971 | | // a cdc stream producer and is already hidden, then we should delete this tablet. |
8972 | 24.9k | tablets_data.back().hide_only = HideOnly( |
8973 | 24.9k | IsTableCdcProducer(*tablet->table()) && !tablets_data.back().lock->ListedAsHidden()0 ); |
8974 | 24.9k | } |
8975 | | |
8976 | 24.9k | tablet_infos.emplace_back(tablet.get()); |
8977 | 24.9k | } |
8978 | 5.85k | } |
8979 | | |
8980 | | // Use the same hybrid time for all hidden tablets. |
8981 | 5.85k | HybridTime hide_hybrid_time = master_->clock()->Now(); |
8982 | | |
8983 | | // Mark the tablets as deleted. |
8984 | 24.9k | for (auto& tablet_data : tablets_data) { |
8985 | 24.9k | auto& tablet = tablet_data.tablet; |
8986 | 24.9k | auto& tablet_lock = tablet_data.lock; |
8987 | | |
8988 | 24.9k | bool was_hidden = tablet_lock->ListedAsHidden(); |
8989 | | // Inactive tablet now, so remove it from partitions_. |
8990 | | // After all the tablets have been deleted from the tservers, we remove it from tablets_. |
8991 | 24.9k | tablet->table()->RemoveTablet(tablet->id(), DeactivateOnly::kTrue); |
8992 | | |
8993 | 24.9k | if (tablet_data.hide_only) { |
8994 | 12 | LOG(INFO) << "Hiding tablet " << tablet->tablet_id(); |
8995 | 12 | tablet_lock.mutable_data()->pb.set_hide_hybrid_time(hide_hybrid_time.ToUint64()); |
8996 | 12 | *tablet_lock.mutable_data()->pb.mutable_retained_by_snapshot_schedules() = |
8997 | 12 | retained_by_snapshot_schedules; |
8998 | 24.9k | } else { |
8999 | 24.9k | LOG(INFO) << "Deleting tablet " << tablet->tablet_id(); |
9000 | 24.9k | tablet_lock.mutable_data()->set_state(SysTabletsEntryPB::DELETED, deletion_msg); |
9001 | 24.9k | } |
9002 | 24.9k | if (tablet_lock->ListedAsHidden() && !was_hidden12 ) { |
9003 | 12 | marked_as_hidden.push_back(tablet); |
9004 | 12 | } |
9005 | 24.9k | } |
9006 | | |
9007 | | // Update all the tablet states in raft in bulk. |
9008 | 5.85k | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), tablet_infos)); |
9009 | | |
9010 | | // Commit the change. |
9011 | 24.9k | for (auto& tablet_data : tablets_data)5.85k { |
9012 | 24.9k | auto& tablet = tablet_data.tablet; |
9013 | 24.9k | auto& tablet_lock = tablet_data.lock; |
9014 | | |
9015 | 24.9k | tablet_lock.Commit(); |
9016 | 24.9k | LOG(INFO) << (tablet_data.hide_only ? "Hid tablet "12 : "Deleted tablet "24.9k ) << tablet->tablet_id(); |
9017 | | |
9018 | 24.9k | DeleteTabletReplicas(tablet.get(), deletion_msg, tablet_data.hide_only); |
9019 | 24.9k | } |
9020 | | |
9021 | 5.85k | if (!marked_as_hidden.empty()) { |
9022 | 4 | LockGuard lock(mutex_); |
9023 | 4 | hidden_tablets_.insert(hidden_tablets_.end(), marked_as_hidden.begin(), marked_as_hidden.end()); |
9024 | 4 | } |
9025 | | |
9026 | 5.85k | return Status::OK(); |
9027 | 5.85k | } |
9028 | | |
9029 | | void CatalogManager::SendDeleteTabletRequest( |
9030 | | const TabletId& tablet_id, |
9031 | | TabletDataState delete_type, |
9032 | | const boost::optional<int64_t>& cas_config_opid_index_less_or_equal, |
9033 | | const scoped_refptr<TableInfo>& table, |
9034 | | TSDescriptor* ts_desc, |
9035 | | const string& reason, |
9036 | 73.9k | bool hide_only) { |
9037 | 73.9k | if (PREDICT_FALSE(GetAtomicFlag(&FLAGS_TEST_disable_tablet_deletion))) { |
9038 | 3 | return; |
9039 | 3 | } |
9040 | 73.9k | LOG_WITH_PREFIX(INFO) |
9041 | 73.9k | << (hide_only ? "Hiding"36 : "Deleting"73.8k ) << " tablet " << tablet_id << " on peer " |
9042 | 73.9k | << ts_desc->permanent_uuid() << " with delete type " |
9043 | 73.9k | << TabletDataState_Name(delete_type) << " (" << reason << ")"; |
9044 | 73.9k | auto call = std::make_shared<AsyncDeleteReplica>(master_, AsyncTaskPool(), |
9045 | 73.9k | ts_desc->permanent_uuid(), table, tablet_id, delete_type, |
9046 | 73.9k | cas_config_opid_index_less_or_equal, reason); |
9047 | 73.9k | if (hide_only) { |
9048 | 36 | call->set_hide_only(hide_only); |
9049 | 36 | } |
9050 | 73.9k | if (table != nullptr) { |
9051 | 73.9k | table->AddTask(call); |
9052 | 73.9k | } |
9053 | | |
9054 | 73.9k | auto status = ScheduleTask(call); |
9055 | 73.9k | WARN_NOT_OK(status, Substitute("Failed to send delete request for tablet $0", tablet_id)); |
9056 | | // TODO(bogdan): does the pending delete semantics need to change? |
9057 | 73.9k | if (status.ok()) { |
9058 | 73.9k | ts_desc->AddPendingTabletDelete(tablet_id); |
9059 | 73.9k | } |
9060 | 73.9k | } |
9061 | | |
9062 | | void CatalogManager::SendLeaderStepDownRequest( |
9063 | | const scoped_refptr<TabletInfo>& tablet, const ConsensusStatePB& cstate, |
9064 | | const string& change_config_ts_uuid, bool should_remove, |
9065 | 54.0k | const string& new_leader_uuid) { |
9066 | 54.0k | auto task = std::make_shared<AsyncTryStepDown>( |
9067 | 54.0k | master_, AsyncTaskPool(), tablet, cstate, change_config_ts_uuid, should_remove, |
9068 | 54.0k | new_leader_uuid); |
9069 | 54.0k | tablet->table()->AddTask(task); |
9070 | 54.0k | Status status = ScheduleTask(task); |
9071 | 54.0k | WARN_NOT_OK(status, Substitute("Failed to send new $0 request", task->type_name())); |
9072 | 54.0k | } |
9073 | | |
9074 | | // TODO: refactor this into a joint method with the add one. |
9075 | | void CatalogManager::SendRemoveServerRequest( |
9076 | | const scoped_refptr<TabletInfo>& tablet, const ConsensusStatePB& cstate, |
9077 | 1.71k | const string& change_config_ts_uuid) { |
9078 | | // Check if the user wants the leader to be stepped down. |
9079 | 1.71k | auto task = std::make_shared<AsyncRemoveServerTask>( |
9080 | 1.71k | master_, AsyncTaskPool(), tablet, cstate, change_config_ts_uuid); |
9081 | 1.71k | tablet->table()->AddTask(task); |
9082 | 1.71k | WARN_NOT_OK(ScheduleTask(task), Substitute("Failed to send new $0 request", task->type_name())); |
9083 | 1.71k | } |
9084 | | |
9085 | | void CatalogManager::SendAddServerRequest( |
9086 | | const scoped_refptr<TabletInfo>& tablet, PeerMemberType member_type, |
9087 | 2.17k | const ConsensusStatePB& cstate, const string& change_config_ts_uuid) { |
9088 | 2.17k | auto task = std::make_shared<AsyncAddServerTask>(master_, AsyncTaskPool(), tablet, member_type, |
9089 | 2.17k | cstate, change_config_ts_uuid); |
9090 | 2.17k | tablet->table()->AddTask(task); |
9091 | 2.17k | WARN_NOT_OK( |
9092 | 2.17k | ScheduleTask(task), |
9093 | 2.17k | Substitute("Failed to send AddServer of tserver $0 to tablet $1", |
9094 | 2.17k | change_config_ts_uuid, tablet.get()->ToString())); |
9095 | 2.17k | } |
9096 | | |
9097 | | void CatalogManager::GetPendingServerTasksUnlocked( |
9098 | | const TableId &table_uuid, |
9099 | | TabletToTabletServerMap *add_replica_tasks_map, |
9100 | | TabletToTabletServerMap *remove_replica_tasks_map, |
9101 | 891k | TabletToTabletServerMap *stepdown_leader_tasks_map) { |
9102 | | |
9103 | 891k | auto table = GetTableInfoUnlocked(table_uuid); |
9104 | 891k | for (const auto& task : table->GetTasks()) { |
9105 | 128k | TabletToTabletServerMap* outputMap = nullptr; |
9106 | 128k | if (task->type() == MonitoredTask::ASYNC_ADD_SERVER) { |
9107 | 415 | outputMap = add_replica_tasks_map; |
9108 | 127k | } else if (task->type() == MonitoredTask::ASYNC_REMOVE_SERVER) { |
9109 | 662 | outputMap = remove_replica_tasks_map; |
9110 | 127k | } else if (task->type() == MonitoredTask::ASYNC_TRY_STEP_DOWN) { |
9111 | | // Store new_leader_uuid instead of change_config_ts_uuid. |
9112 | 569 | auto raft_task = static_cast<AsyncTryStepDown*>(task.get()); |
9113 | 569 | (*stepdown_leader_tasks_map)[raft_task->tablet_id()] = raft_task->new_leader_uuid(); |
9114 | 569 | continue; |
9115 | 569 | } |
9116 | 127k | if (outputMap) { |
9117 | 1.07k | auto raft_task = static_cast<CommonInfoForRaftTask*>(task.get()); |
9118 | 1.07k | (*outputMap)[raft_task->tablet_id()] = raft_task->change_config_ts_uuid(); |
9119 | 1.07k | } |
9120 | 127k | } |
9121 | 891k | } |
9122 | | |
9123 | | void CatalogManager::ExtractTabletsToProcess( |
9124 | | TabletInfos *tablets_to_delete, |
9125 | 1.56M | TableToTabletInfos *tablets_to_process) { |
9126 | 1.56M | SharedLock lock(mutex_); |
9127 | | |
9128 | | // TODO: At the moment we loop through all the tablets |
9129 | | // we can keep a set of tablets waiting for "assignment" |
9130 | | // or just a counter to avoid to take the lock and loop through the tablets |
9131 | | // if everything is "stable". |
9132 | | |
9133 | 34.3M | for (const TabletInfoMap::value_type& entry : *tablet_map_) { |
9134 | 34.3M | scoped_refptr<TabletInfo> tablet = entry.second; |
9135 | 34.3M | auto table = tablet->table(); |
9136 | 34.3M | if (!table) { |
9137 | | // Tablet is orphaned or in preparing state, continue. |
9138 | 0 | continue; |
9139 | 0 | } |
9140 | | |
9141 | | // acquire table lock before tablets. |
9142 | 34.3M | auto table_lock = table->LockForRead(); |
9143 | 34.3M | auto tablet_lock = tablet->LockForRead(); |
9144 | | |
9145 | | // If the table is deleted or the tablet was replaced at table creation time. |
9146 | 34.3M | if (tablet_lock->is_deleted() || table_lock->started_deleting()32.2M ) { |
9147 | | // Process this table deletion only once (tombstones for table may remain longer). |
9148 | 2.04M | if (table_ids_map_->find(tablet->table()->id()) != table_ids_map_->end()) { |
9149 | 2.04M | tablets_to_delete->push_back(tablet); |
9150 | 2.04M | } |
9151 | | // Don't process deleted tables regardless. |
9152 | 2.04M | continue; |
9153 | 2.04M | } |
9154 | | |
9155 | | // Running tablets. |
9156 | 32.2M | if (tablet_lock->is_running()) { |
9157 | | // TODO: handle last update > not responding timeout? |
9158 | 32.1M | continue; |
9159 | 32.1M | } |
9160 | | |
9161 | | // Tablets not yet assigned or with a report just received. |
9162 | 86.9k | (*tablets_to_process)[tablet->table()->id()].push_back(tablet); |
9163 | 86.9k | } |
9164 | 1.56M | } |
9165 | | |
9166 | 1.51M | bool CatalogManager::AreTablesDeleting() { |
9167 | 1.51M | SharedLock lock(mutex_); |
9168 | | |
9169 | 79.7M | for (const TableInfoMap::value_type& entry : *table_ids_map_) { |
9170 | 79.7M | scoped_refptr<TableInfo> table(entry.second); |
9171 | 79.7M | auto table_lock = table->LockForRead(); |
9172 | | // TODO(jason): possibly change this to started_deleting when we begin removing DELETED tables |
9173 | | // from table_ids_map_ (see CleanUpDeletedTables). |
9174 | 79.7M | if (table_lock->is_deleting()) { |
9175 | 48 | return true; |
9176 | 48 | } |
9177 | 79.7M | } |
9178 | 1.51M | return false; |
9179 | 1.51M | } |
9180 | | |
9181 | | struct DeferredAssignmentActions { |
9182 | | std::vector<TabletInfo*> modified_tablets; |
9183 | | std::vector<TabletInfo*> needs_create_rpc; |
9184 | | }; |
9185 | | |
9186 | | void CatalogManager::HandleAssignPreparingTablet(TabletInfo* tablet, |
9187 | 48.5k | DeferredAssignmentActions* deferred) { |
9188 | | // The tablet was just created (probably by a CreateTable RPC). |
9189 | | // Update the state to "creating" to be ready for the creation request. |
9190 | 48.5k | tablet->mutable_metadata()->mutable_dirty()->set_state( |
9191 | 48.5k | SysTabletsEntryPB::CREATING, "Sending initial creation of tablet"); |
9192 | 48.5k | deferred->modified_tablets.push_back(tablet); |
9193 | 48.5k | deferred->needs_create_rpc.push_back(tablet); |
9194 | 48.5k | VLOG(1) << "Assign new tablet " << tablet->ToString()0 ; |
9195 | 48.5k | } |
9196 | | |
9197 | | void CatalogManager::HandleAssignCreatingTablet(TabletInfo* tablet, |
9198 | | DeferredAssignmentActions* deferred, |
9199 | 38.3k | vector<scoped_refptr<TabletInfo>>* new_tablets) { |
9200 | 38.3k | MonoDelta time_since_updated = |
9201 | 38.3k | MonoTime::Now().GetDeltaSince(tablet->last_update_time()); |
9202 | 38.3k | int64_t remaining_timeout_ms = |
9203 | 38.3k | FLAGS_tablet_creation_timeout_ms - time_since_updated.ToMilliseconds(); |
9204 | | |
9205 | 38.3k | if (tablet->LockForRead()->pb.has_split_parent_tablet_id()) { |
9206 | | // No need to recreate post-split tablets, since this is always done on source tablet replicas. |
9207 | 405 | VLOG(2) << "Post-split tablet " << AsString(tablet) << " still being created."0 ; |
9208 | 405 | return; |
9209 | 405 | } |
9210 | | // Skip the tablet if the assignment timeout is not yet expired. |
9211 | 37.9k | if (remaining_timeout_ms > 0) { |
9212 | 37.9k | VLOG(2) << "Tablet " << tablet->ToString() << " still being created. " |
9213 | 0 | << remaining_timeout_ms << "ms remain until timeout."; |
9214 | 37.9k | return; |
9215 | 37.9k | } |
9216 | | |
9217 | 10 | const PersistentTabletInfo& old_info = tablet->metadata().state(); |
9218 | | |
9219 | | // The "tablet creation" was already sent, but we didn't receive an answer |
9220 | | // within the timeout. So the tablet will be replaced by a new one. |
9221 | 10 | TabletInfoPtr replacement; |
9222 | 10 | { |
9223 | 10 | LockGuard lock(mutex_); |
9224 | 10 | replacement = CreateTabletInfo(tablet->table().get(), old_info.pb.partition()); |
9225 | 10 | } |
9226 | 10 | LOG(WARNING) << "Tablet " << tablet->ToString() << " was not created within " |
9227 | 10 | << "the allowed timeout. Replacing with a new tablet " |
9228 | 10 | << replacement->tablet_id(); |
9229 | | |
9230 | 10 | tablet->table()->ReplaceTablet(tablet, replacement); |
9231 | 10 | { |
9232 | 10 | LockGuard lock(mutex_); |
9233 | 10 | auto tablet_map_checkout = tablet_map_.CheckOut(); |
9234 | 10 | (*tablet_map_checkout)[replacement->tablet_id()] = replacement; |
9235 | 10 | } |
9236 | | |
9237 | | // Mark old tablet as replaced. |
9238 | 10 | tablet->mutable_metadata()->mutable_dirty()->set_state( |
9239 | 10 | SysTabletsEntryPB::REPLACED, |
9240 | 10 | Substitute("Replaced by $0 at $1", |
9241 | 10 | replacement->tablet_id(), LocalTimeAsString())); |
9242 | | |
9243 | | // Mark new tablet as being created. |
9244 | 10 | replacement->mutable_metadata()->mutable_dirty()->set_state( |
9245 | 10 | SysTabletsEntryPB::CREATING, |
9246 | 10 | Substitute("Replacement for $0", tablet->tablet_id())); |
9247 | | |
9248 | 10 | deferred->modified_tablets.push_back(tablet); |
9249 | 10 | deferred->modified_tablets.push_back(replacement.get()); |
9250 | 10 | deferred->needs_create_rpc.push_back(replacement.get()); |
9251 | 10 | VLOG(1) << "Replaced tablet " << tablet->tablet_id() |
9252 | 0 | << " with " << replacement->tablet_id() |
9253 | 0 | << " (table " << tablet->table()->ToString() << ")"; |
9254 | | |
9255 | 10 | new_tablets->push_back(replacement); |
9256 | 10 | } |
9257 | | |
9258 | | // TODO: we could batch the IO onto a background thread. |
9259 | | Status CatalogManager::HandleTabletSchemaVersionReport( |
9260 | 97.4k | TabletInfo *tablet, uint32_t version, const scoped_refptr<TableInfo>& table_info) { |
9261 | 97.4k | scoped_refptr<TableInfo> table; |
9262 | 97.4k | if (table_info) { |
9263 | 31.5k | table = table_info; |
9264 | 65.9k | } else { |
9265 | 65.9k | table = tablet->table(); |
9266 | 65.9k | } |
9267 | | |
9268 | | // Update the schema version if it's the latest. |
9269 | 97.4k | tablet->set_reported_schema_version(table->id(), version); |
9270 | 97.4k | VLOG_WITH_PREFIX_AND_FUNC156 (1) |
9271 | 156 | << "Tablet " << tablet->tablet_id() << " reported version " << version; |
9272 | | |
9273 | | // Verify if it's the last tablet report, and the alter completed. |
9274 | 97.4k | { |
9275 | 97.4k | auto l = table->LockForRead(); |
9276 | 97.4k | if (l->pb.state() != SysTablesEntryPB::ALTERING) { |
9277 | 69.7k | VLOG_WITH_PREFIX_AND_FUNC1 (2) << "Table " << table->ToString() << " is not altering"1 ; |
9278 | 69.7k | return Status::OK(); |
9279 | 69.7k | } |
9280 | | |
9281 | 27.6k | uint32_t current_version = l->pb.version(); |
9282 | 27.6k | if (table->IsAlterInProgress(current_version)) { |
9283 | 17.1k | VLOG_WITH_PREFIX_AND_FUNC0 (2) << "Table " << table->ToString() << " has IsAlterInProgress (" |
9284 | 0 | << current_version << ")"; |
9285 | 17.1k | return Status::OK(); |
9286 | 17.1k | } |
9287 | 27.6k | } |
9288 | | |
9289 | 10.5k | return MultiStageAlterTable::LaunchNextTableInfoVersionIfNecessary(this, table, version); |
9290 | 27.6k | } |
9291 | | |
9292 | | Status CatalogManager::ProcessPendingAssignmentsPerTable( |
9293 | 21.4k | const TableId& table_id, const TabletInfos& tablets, CMGlobalLoadState* global_load_state) { |
9294 | 21.4k | VLOG(1) << "Processing pending assignments"0 ; |
9295 | | |
9296 | 21.4k | TSDescriptorVector ts_descs = GetAllLiveNotBlacklistedTServers(); |
9297 | | |
9298 | | // Initialize this table load state. |
9299 | 21.4k | CMPerTableLoadState table_load_state(global_load_state); |
9300 | 21.4k | InitializeTableLoadState(table_id, ts_descs, &table_load_state); |
9301 | 21.4k | table_load_state.SortLoad(); |
9302 | | |
9303 | | // Take write locks on all tablets to be processed, and ensure that they are |
9304 | | // unlocked at the end of this scope. |
9305 | 86.9k | for (const scoped_refptr<TabletInfo>& tablet : tablets) { |
9306 | 86.9k | tablet->mutable_metadata()->StartMutation(); |
9307 | 86.9k | } |
9308 | 21.4k | ScopedInfoCommitter<TabletInfo> unlocker_in(&tablets); |
9309 | | |
9310 | | // Any tablets created by the helper functions will also be created in a |
9311 | | // locked state, so we must ensure they are unlocked before we return to |
9312 | | // avoid deadlocks. |
9313 | 21.4k | TabletInfos new_tablets; |
9314 | 21.4k | ScopedInfoCommitter<TabletInfo> unlocker_out(&new_tablets); |
9315 | | |
9316 | 21.4k | DeferredAssignmentActions deferred; |
9317 | | |
9318 | | // Iterate over each of the tablets and handle it, whatever state |
9319 | | // it may be in. The actions required for the tablet are collected |
9320 | | // into 'deferred'. |
9321 | 86.9k | for (const scoped_refptr<TabletInfo>& tablet : tablets) { |
9322 | 86.9k | SysTabletsEntryPB::State t_state = tablet->metadata().state().pb.state(); |
9323 | | |
9324 | 86.9k | switch (t_state) { |
9325 | 48.5k | case SysTabletsEntryPB::PREPARING: |
9326 | 48.5k | HandleAssignPreparingTablet(tablet.get(), &deferred); |
9327 | 48.5k | break; |
9328 | | |
9329 | 38.3k | case SysTabletsEntryPB::CREATING: |
9330 | 38.3k | HandleAssignCreatingTablet(tablet.get(), &deferred, &new_tablets); |
9331 | 38.3k | break; |
9332 | | |
9333 | 144 | default: |
9334 | 144 | VLOG(2) << "Nothing to do for tablet " << tablet->tablet_id() << ": state = " |
9335 | 0 | << SysTabletsEntryPB_State_Name(t_state); |
9336 | 144 | break; |
9337 | 86.9k | } |
9338 | 86.9k | } |
9339 | | |
9340 | | // Nothing to do. |
9341 | 21.4k | if (deferred.modified_tablets.empty() && |
9342 | 21.4k | deferred.needs_create_rpc.empty()13.1k ) { |
9343 | 13.1k | return Status::OK(); |
9344 | 13.1k | } |
9345 | | |
9346 | | // For those tablets which need to be created in this round, assign replicas. |
9347 | 8.30k | Status s; |
9348 | 8.30k | std::unordered_set<TableInfo*> ok_status_tables; |
9349 | 48.4k | for (TabletInfo *tablet : deferred.needs_create_rpc) { |
9350 | | // NOTE: if we fail to select replicas on the first pass (due to |
9351 | | // insufficient Tablet Servers being online), we will still try |
9352 | | // again unless the tablet/table creation is cancelled. |
9353 | 48.4k | LOG(INFO) << "Selecting replicas for tablet " << tablet->id(); |
9354 | 48.4k | s = SelectReplicasForTablet(ts_descs, tablet, &table_load_state, global_load_state); |
9355 | 48.4k | if (!s.ok()) { |
9356 | 21 | s = s.CloneAndPrepend(Substitute( |
9357 | 21 | "An error occurred while selecting replicas for tablet $0: $1", |
9358 | 21 | tablet->tablet_id(), s.ToString())); |
9359 | 21 | tablet->table()->SetCreateTableErrorStatus(s); |
9360 | 21 | break; |
9361 | 48.4k | } else { |
9362 | 48.4k | ok_status_tables.emplace(tablet->table().get()); |
9363 | 48.4k | } |
9364 | 48.4k | } |
9365 | | |
9366 | | // Update the sys catalog with the new set of tablets/metadata. |
9367 | 8.30k | if (s.ok()) { |
9368 | | // If any of the ok_status_tables had an error in the previous iterations, we |
9369 | | // need to clear up the error status to reflect that all the create tablets have now |
9370 | | // succeded. |
9371 | 8.28k | for (TableInfo* table : ok_status_tables) { |
9372 | 8.28k | table->SetCreateTableErrorStatus(Status::OK()); |
9373 | 8.28k | } |
9374 | | |
9375 | 8.28k | s = sys_catalog_->Upsert(leader_ready_term(), deferred.modified_tablets); |
9376 | 8.28k | if (!s.ok()) { |
9377 | 1 | s = s.CloneAndPrepend("An error occurred while persisting the updated tablet metadata"); |
9378 | 1 | } |
9379 | 8.28k | } |
9380 | | |
9381 | 8.30k | if (!s.ok()) { |
9382 | 22 | LOG(WARNING) << "Aborting the current task due to error: " << s.ToString(); |
9383 | | // If there was an error, abort any mutations started by the current task. |
9384 | | // NOTE: Lock order should be lock_ -> table -> tablet. |
9385 | | // We currently have a bunch of tablets locked and need to unlock first to ensure this holds. |
9386 | | |
9387 | 22 | std::sort(new_tablets.begin(), new_tablets.end(), [](const auto& lhs, const auto& rhs) { |
9388 | 0 | return lhs->table().get() < rhs->table().get(); |
9389 | 0 | }); |
9390 | 22 | { |
9391 | 22 | std::string current_table_name; |
9392 | 22 | TableInfoPtr current_table; |
9393 | 22 | for (auto& tablet_to_remove : new_tablets) { |
9394 | 0 | if (tablet_to_remove->table()->RemoveTablet(tablet_to_remove->tablet_id())) { |
9395 | 0 | if (VLOG_IS_ON(1)) { |
9396 | 0 | if (current_table != tablet_to_remove->table()) { |
9397 | 0 | current_table = tablet_to_remove->table(); |
9398 | 0 | current_table_name = current_table->name(); |
9399 | 0 | } |
9400 | 0 | LOG(INFO) << "Removed tablet " << tablet_to_remove->tablet_id() << " from table " |
9401 | 0 | << current_table_name; |
9402 | 0 | } |
9403 | 0 | } |
9404 | 0 | } |
9405 | 22 | } |
9406 | | |
9407 | 22 | unlocker_out.Abort(); // tablet.unlock |
9408 | 22 | unlocker_in.Abort(); |
9409 | | |
9410 | 22 | { |
9411 | 22 | LockGuard lock(mutex_); // lock_.lock |
9412 | 22 | auto tablet_map_checkout = tablet_map_.CheckOut(); |
9413 | 22 | for (auto& tablet_to_remove : new_tablets) { |
9414 | | // Potential race condition above, but it's okay if a background thread deleted this. |
9415 | 0 | tablet_map_checkout->erase(tablet_to_remove->tablet_id()); |
9416 | 0 | } |
9417 | 22 | } |
9418 | 22 | return s; |
9419 | 22 | } |
9420 | | |
9421 | | // Send DeleteTablet requests to tablet servers serving deleted tablets. |
9422 | | // This is asynchronous / non-blocking. |
9423 | 48.4k | for (auto* tablet : deferred.modified_tablets)8.28k { |
9424 | 48.4k | if (tablet->metadata().dirty().is_deleted()) { |
9425 | | // Actual delete, because we delete tablet replica. |
9426 | 10 | DeleteTabletReplicas(tablet, tablet->metadata().dirty().pb.state_msg(), HideOnly::kFalse); |
9427 | 10 | } |
9428 | 48.4k | } |
9429 | | // Send the CreateTablet() requests to the servers. This is asynchronous / non-blocking. |
9430 | 8.28k | return SendCreateTabletRequests(deferred.needs_create_rpc); |
9431 | 8.30k | } |
9432 | | |
9433 | | Status CatalogManager::SelectReplicasForTablet( |
9434 | | const TSDescriptorVector& ts_descs, TabletInfo* tablet, |
9435 | 48.4k | CMPerTableLoadState* per_table_state, CMGlobalLoadState* global_state) { |
9436 | 48.4k | auto table_guard = tablet->table()->LockForRead(); |
9437 | | |
9438 | 48.4k | if (!table_guard->pb.IsInitialized()) { |
9439 | 0 | return STATUS_SUBSTITUTE(InvalidArgument, |
9440 | 0 | "TableInfo for tablet $0 is not initialized (aborted CreateTable attempt?)", |
9441 | 0 | tablet->tablet_id()); |
9442 | 0 | } |
9443 | | |
9444 | 48.4k | const auto& replication_info = |
9445 | 48.4k | VERIFY_RESULT(GetTableReplicationInfo(table_guard->pb.replication_info(), |
9446 | 48.4k | tablet->table()->TablespaceIdForTableCreation())); |
9447 | | |
9448 | | // Select the set of replicas for the tablet. |
9449 | 0 | ConsensusStatePB* cstate = tablet->mutable_metadata()->mutable_dirty() |
9450 | 48.4k | ->pb.mutable_committed_consensus_state(); |
9451 | 48.4k | VLOG_WITH_FUNC0 (3) << "Committed consensus state: " << AsString(cstate)0 ; |
9452 | 48.4k | cstate->set_current_term(kMinimumTerm); |
9453 | 48.4k | consensus::RaftConfigPB *config = cstate->mutable_config(); |
9454 | 48.4k | config->set_opid_index(consensus::kInvalidOpIdIndex); |
9455 | | |
9456 | 48.4k | Status s = HandlePlacementUsingReplicationInfo( |
9457 | 48.4k | replication_info, ts_descs, config, per_table_state, global_state); |
9458 | 48.4k | if (!s.ok()) { |
9459 | 21 | return s; |
9460 | 21 | } |
9461 | | |
9462 | 48.4k | std::ostringstream out; |
9463 | 48.4k | out << "Initial tserver uuids for tablet " << tablet->tablet_id() << ": "; |
9464 | 140k | for (const RaftPeerPB& peer : config->peers()) { |
9465 | 140k | out << peer.permanent_uuid() << " "; |
9466 | 140k | } |
9467 | | |
9468 | 48.4k | if (VLOG_IS_ON(0)) { |
9469 | 48.4k | out.str(); |
9470 | 48.4k | } |
9471 | | |
9472 | 48.4k | VLOG_WITH_FUNC1 (3) << "Committed consensus state has been updated to: " << AsString(cstate)1 ; |
9473 | | |
9474 | 48.4k | return Status::OK(); |
9475 | 48.4k | } |
9476 | | |
9477 | | void CatalogManager::GetTsDescsFromPlacementInfo(const PlacementInfoPB& placement_info, |
9478 | | const TSDescriptorVector& all_ts_descs, |
9479 | 105k | TSDescriptorVector* ts_descs) { |
9480 | 105k | ts_descs->clear(); |
9481 | 312k | for (const auto& ts_desc : all_ts_descs) { |
9482 | 312k | if (placement_info.has_placement_uuid()) { |
9483 | 5.47k | string placement_uuid = placement_info.placement_uuid(); |
9484 | 5.47k | if (ts_desc->placement_uuid() == placement_uuid) { |
9485 | 3.51k | ts_descs->push_back(ts_desc); |
9486 | 3.51k | } |
9487 | 306k | } else if (ts_desc->placement_uuid() == "") { |
9488 | | // Since the placement info has no placement id, we know it is live, so we add this ts. |
9489 | 306k | ts_descs->push_back(ts_desc); |
9490 | 306k | } |
9491 | 312k | } |
9492 | 105k | } |
9493 | | |
9494 | | Status CatalogManager::HandlePlacementUsingReplicationInfo( |
9495 | | const ReplicationInfoPB& replication_info, |
9496 | | const TSDescriptorVector& all_ts_descs, |
9497 | | consensus::RaftConfigPB* config, |
9498 | | CMPerTableLoadState* per_table_state, |
9499 | 48.4k | CMGlobalLoadState* global_state) { |
9500 | | // Validate if we have enough tservers to put the replicas. |
9501 | 48.4k | ValidateReplicationInfoRequestPB req; |
9502 | 48.4k | req.mutable_replication_info()->CopyFrom(replication_info); |
9503 | 48.4k | ValidateReplicationInfoResponsePB resp; |
9504 | 48.4k | RETURN_NOT_OK(ValidateReplicationInfo(&req, &resp)); |
9505 | | |
9506 | 48.4k | TSDescriptorVector ts_descs; |
9507 | 48.4k | GetTsDescsFromPlacementInfo(replication_info.live_replicas(), all_ts_descs, &ts_descs); |
9508 | 48.4k | RETURN_NOT_OK(HandlePlacementUsingPlacementInfo( |
9509 | 48.4k | replication_info.live_replicas(), ts_descs, PeerMemberType::VOTER, |
9510 | 48.4k | config, per_table_state, global_state)); |
9511 | 48.5k | for (int i = 0; 48.4k i < replication_info.read_replicas_size(); i++148 ) { |
9512 | 148 | GetTsDescsFromPlacementInfo(replication_info.read_replicas(i), all_ts_descs, &ts_descs); |
9513 | 148 | RETURN_NOT_OK(HandlePlacementUsingPlacementInfo( |
9514 | 148 | replication_info.read_replicas(i), ts_descs, PeerMemberType::OBSERVER, |
9515 | 148 | config, per_table_state, global_state)); |
9516 | 148 | } |
9517 | 48.4k | return Status::OK(); |
9518 | 48.4k | } |
9519 | | |
9520 | | Status CatalogManager::HandlePlacementUsingPlacementInfo(const PlacementInfoPB& placement_info, |
9521 | | const TSDescriptorVector& ts_descs, |
9522 | | PeerMemberType member_type, |
9523 | | consensus::RaftConfigPB* config, |
9524 | | CMPerTableLoadState* per_table_state, |
9525 | 48.5k | CMGlobalLoadState* global_state) { |
9526 | 48.5k | size_t nreplicas = GetNumReplicasFromPlacementInfo(placement_info); |
9527 | 48.5k | size_t ntservers = ts_descs.size(); |
9528 | | // Keep track of servers we've already selected, so that we don't attempt to |
9529 | | // put two replicas on the same host. |
9530 | 48.5k | set<TabletServerId> already_selected_ts; |
9531 | 48.5k | if (placement_info.placement_blocks().empty()) { |
9532 | | // If we don't have placement info, just place the replicas as before, distributed across the |
9533 | | // whole cluster. |
9534 | | // We cannot put more than ntservers replicas. |
9535 | 47.7k | nreplicas = min(nreplicas, ntservers); |
9536 | 47.7k | SelectReplicas(ts_descs, nreplicas, config, &already_selected_ts, member_type, |
9537 | 47.7k | per_table_state, global_state); |
9538 | 47.7k | } else { |
9539 | | // TODO(bogdan): move to separate function |
9540 | | // |
9541 | | // If we do have placement info, we'll try to use the same power of two algorithm, but also |
9542 | | // match the requested policies. We'll assign the minimum requested replicas in each combination |
9543 | | // of cloud.region.zone and then if we still have leftover replicas, we'll assign those |
9544 | | // in any of the allowed areas. |
9545 | 799 | auto all_allowed_ts = VERIFY_RESULT(FindTServersForPlacementInfo(placement_info, ts_descs)); |
9546 | | |
9547 | | // Loop through placements and assign to respective available TSs. |
9548 | 0 | size_t min_replica_count_sum = 0; |
9549 | 1.05k | for (const auto& pb : placement_info.placement_blocks()) { |
9550 | | // This works because currently we don't allow placement blocks to overlap. |
9551 | 1.05k | auto available_ts_descs = VERIFY_RESULT(FindTServersForPlacementBlock(pb, ts_descs)); |
9552 | 0 | size_t available_ts_descs_size = available_ts_descs.size(); |
9553 | 1.05k | size_t min_num_replicas = pb.min_num_replicas(); |
9554 | | // We cannot put more than the available tablet servers in that placement block. |
9555 | 1.05k | size_t num_replicas = min(min_num_replicas, available_ts_descs_size); |
9556 | 1.05k | min_replica_count_sum += min_num_replicas; |
9557 | 1.05k | SelectReplicas(available_ts_descs, num_replicas, config, &already_selected_ts, member_type, |
9558 | 1.05k | per_table_state, global_state); |
9559 | 1.05k | } |
9560 | | |
9561 | 799 | size_t replicas_left = nreplicas - min_replica_count_sum; |
9562 | 799 | size_t max_tservers_left = all_allowed_ts.size() - already_selected_ts.size(); |
9563 | | // Upper bounded by the tservers left. |
9564 | 799 | replicas_left = min(replicas_left, max_tservers_left); |
9565 | 799 | DCHECK_GE(replicas_left, 0); |
9566 | 799 | if (replicas_left > 0) { |
9567 | | // No need to do an extra check here, as we checked early if we have enough to cover all |
9568 | | // requested placements and checked individually per placement info, if we could cover the |
9569 | | // minimums. |
9570 | 12 | SelectReplicas(all_allowed_ts, replicas_left, config, &already_selected_ts, member_type, |
9571 | 12 | per_table_state, global_state); |
9572 | 12 | } |
9573 | 799 | } |
9574 | 48.5k | return Status::OK(); |
9575 | 48.5k | } |
9576 | | |
9577 | | Result<vector<shared_ptr<TSDescriptor>>> CatalogManager::FindTServersForPlacementInfo( |
9578 | | const PlacementInfoPB& placement_info, |
9579 | 1.67k | const TSDescriptorVector& ts_descs) const { |
9580 | | |
9581 | 1.67k | vector<shared_ptr<TSDescriptor>> all_allowed_ts; |
9582 | 6.38k | for (const auto& ts : ts_descs) { |
9583 | 7.84k | for (const auto& pb : placement_info.placement_blocks()) { |
9584 | 7.84k | if (ts->MatchesCloudInfo(pb.cloud_info())) { |
9585 | 3.32k | all_allowed_ts.push_back(ts); |
9586 | 3.32k | break; |
9587 | 3.32k | } |
9588 | 7.84k | } |
9589 | 6.38k | } |
9590 | | |
9591 | 1.67k | return all_allowed_ts; |
9592 | 1.67k | } |
9593 | | |
9594 | | Result<vector<shared_ptr<TSDescriptor>>> CatalogManager::FindTServersForPlacementBlock( |
9595 | | const PlacementBlockPB& placement_block, |
9596 | 2.19k | const TSDescriptorVector& ts_descs) { |
9597 | | |
9598 | 2.19k | vector<shared_ptr<TSDescriptor>> allowed_ts; |
9599 | 2.19k | const auto& cloud_info = placement_block.cloud_info(); |
9600 | 8.85k | for (const auto& ts : ts_descs) { |
9601 | 8.85k | if (ts->MatchesCloudInfo(cloud_info)) { |
9602 | 3.29k | allowed_ts.push_back(ts); |
9603 | 3.29k | } |
9604 | 8.85k | } |
9605 | | |
9606 | 2.19k | return allowed_ts; |
9607 | 2.19k | } |
9608 | | |
9609 | 8.27k | Status CatalogManager::SendCreateTabletRequests(const vector<TabletInfo*>& tablets) { |
9610 | 8.27k | auto schedules_to_tablets_map = VERIFY_RESULT(MakeSnapshotSchedulesToObjectIdsMap( |
9611 | 8.27k | SysRowEntryType::TABLET)); |
9612 | 48.3k | for (TabletInfo *tablet : tablets) { |
9613 | 48.3k | const consensus::RaftConfigPB& config = |
9614 | 48.3k | tablet->metadata().dirty().pb.committed_consensus_state().config(); |
9615 | 48.3k | tablet->set_last_update_time(MonoTime::Now()); |
9616 | 48.3k | std::vector<SnapshotScheduleId> schedules; |
9617 | 48.3k | for (const auto& pair : schedules_to_tablets_map) { |
9618 | 39 | if (std::binary_search(pair.second.begin(), pair.second.end(), tablet->id())) { |
9619 | 24 | schedules.push_back(pair.first); |
9620 | 24 | } |
9621 | 39 | } |
9622 | 140k | for (const RaftPeerPB& peer : config.peers()) { |
9623 | 140k | auto task = std::make_shared<AsyncCreateReplica>(master_, AsyncTaskPool(), |
9624 | 140k | peer.permanent_uuid(), tablet, schedules); |
9625 | 140k | tablet->table()->AddTask(task); |
9626 | 140k | WARN_NOT_OK(ScheduleTask(task), "Failed to send new tablet request"); |
9627 | 140k | } |
9628 | 48.3k | } |
9629 | | |
9630 | 8.27k | return Status::OK(); |
9631 | 8.27k | } |
9632 | | |
9633 | | // If responses have been received from sufficient replicas (including hinted leader), |
9634 | | // pick proposed leader and start election. |
9635 | | void CatalogManager::StartElectionIfReady( |
9636 | 153k | const consensus::ConsensusStatePB& cstate, TabletInfo* tablet) { |
9637 | 153k | auto replicas = tablet->GetReplicaLocations(); |
9638 | 153k | int num_voters = 0; |
9639 | 459k | for (const auto& peer : cstate.config().peers()) { |
9640 | 459k | if (peer.member_type() == PeerMemberType::VOTER) { |
9641 | 457k | ++num_voters; |
9642 | 457k | } |
9643 | 459k | } |
9644 | 153k | int majority_size = num_voters / 2 + 1; |
9645 | 153k | int running_voters = 0; |
9646 | 459k | for (const auto& replica : *replicas) { |
9647 | 459k | if (replica.second.member_type == PeerMemberType::VOTER) { |
9648 | 457k | ++running_voters; |
9649 | 457k | } |
9650 | 459k | } |
9651 | | |
9652 | 153k | VLOG_WITH_PREFIX0 (4) |
9653 | 0 | << __func__ << ": T " << tablet->tablet_id() << ": " << AsString(*replicas) << ", voters: " |
9654 | 0 | << running_voters << "/" << majority_size; |
9655 | | |
9656 | 153k | if (running_voters < majority_size) { |
9657 | 0 | VLOG_WITH_PREFIX(4) << __func__ << ": Not enough voters"; |
9658 | 0 | return; |
9659 | 0 | } |
9660 | | |
9661 | 153k | ReplicationInfoPB replication_info; |
9662 | 153k | { |
9663 | 153k | auto l = ClusterConfig()->LockForRead(); |
9664 | 153k | replication_info = l->pb.replication_info(); |
9665 | 153k | } |
9666 | | |
9667 | | // Find tservers that can be leaders for a tablet. |
9668 | 153k | TSDescriptorVector ts_descs = GetAllLiveNotBlacklistedTServers(); |
9669 | | |
9670 | 153k | std::vector<std::string> possible_leaders; |
9671 | 459k | for (const auto& replica : *replicas) { |
9672 | 935k | for (const auto& ts_desc : ts_descs) { |
9673 | 935k | if (ts_desc->permanent_uuid() == replica.first) { |
9674 | 459k | if (ts_desc->IsAcceptingLeaderLoad(replication_info)) { |
9675 | 457k | possible_leaders.push_back(replica.first); |
9676 | 457k | } |
9677 | 459k | break; |
9678 | 459k | } |
9679 | 935k | } |
9680 | 459k | } |
9681 | | |
9682 | 153k | if (FLAGS_TEST_create_table_leader_hint_min_lexicographic) { |
9683 | 6 | std::string min_lexicographic; |
9684 | 18 | for (const auto& peer : cstate.config().peers()) { |
9685 | 18 | if (peer.member_type() == PeerMemberType::VOTER) { |
9686 | 18 | if (min_lexicographic.empty() || peer.permanent_uuid() < min_lexicographic12 ) { |
9687 | 6 | min_lexicographic = peer.permanent_uuid(); |
9688 | 6 | } |
9689 | 18 | } |
9690 | 18 | } |
9691 | 6 | if (min_lexicographic.empty() || !replicas->count(min_lexicographic)) { |
9692 | 0 | LOG_WITH_PREFIX(INFO) |
9693 | 0 | << __func__ << ": Min lexicographic is not yet ready: " << min_lexicographic; |
9694 | 0 | return; |
9695 | 0 | } |
9696 | 6 | possible_leaders = { min_lexicographic }; |
9697 | 6 | } |
9698 | | |
9699 | 153k | if (possible_leaders.empty()) { |
9700 | 124 | VLOG_WITH_PREFIX0 (4) << __func__ << ": Cannot pick candidate"0 ; |
9701 | 124 | return; |
9702 | 124 | } |
9703 | | |
9704 | 153k | if (!tablet->InitiateElection()) { |
9705 | 106k | VLOG_WITH_PREFIX0 (4) << __func__ << ": Already initiated"0 ; |
9706 | 106k | return; |
9707 | 106k | } |
9708 | | |
9709 | 47.8k | const auto& protege = RandomElement(possible_leaders); |
9710 | | |
9711 | 47.8k | LOG_WITH_PREFIX(INFO) |
9712 | 47.8k | << "Starting election at " << tablet->tablet_id() << " in favor of " << protege; |
9713 | | |
9714 | 47.8k | auto task = std::make_shared<AsyncStartElection>(master_, AsyncTaskPool(), protege, tablet); |
9715 | 47.8k | tablet->table()->AddTask(task); |
9716 | 47.8k | WARN_NOT_OK(task->Run(), "Failed to send new tablet start election request"); |
9717 | 47.8k | } |
9718 | | |
9719 | | shared_ptr<TSDescriptor> CatalogManager::SelectReplica( |
9720 | | const TSDescriptorVector& ts_descs, |
9721 | | set<TabletServerId>* excluded, |
9722 | 140k | CMPerTableLoadState* per_table_state, CMGlobalLoadState* global_state) { |
9723 | 140k | shared_ptr<TSDescriptor> found_ts; |
9724 | 144k | for (const auto& sorted_load : per_table_state->sorted_load_) { |
9725 | | // Don't consider a tserver that has already been considered for this tablet. |
9726 | 144k | if (excluded->count(sorted_load)) { |
9727 | 3 | continue; |
9728 | 3 | } |
9729 | | // Only choose from the set of allowed tservers for this tablet. |
9730 | 290k | auto it = std::find_if(ts_descs.begin(), ts_descs.end(), [&sorted_load](const auto& ts) 144k { |
9731 | 290k | return ts->permanent_uuid() == sorted_load; |
9732 | 290k | }); |
9733 | | |
9734 | 144k | if (it != ts_descs.end()) { |
9735 | 140k | found_ts = *it; |
9736 | 140k | break; |
9737 | 140k | } |
9738 | 144k | } |
9739 | | |
9740 | 140k | return found_ts; |
9741 | 140k | } |
9742 | | |
9743 | | void CatalogManager::SelectReplicas( |
9744 | | const TSDescriptorVector& ts_descs, size_t nreplicas, consensus::RaftConfigPB* config, |
9745 | | set<TabletServerId>* already_selected_ts, PeerMemberType member_type, |
9746 | 48.8k | CMPerTableLoadState* per_table_state, CMGlobalLoadState* global_state) { |
9747 | 48.8k | DCHECK_LE(nreplicas, ts_descs.size()); |
9748 | | |
9749 | 189k | for (size_t i = 0; i < nreplicas; ++i140k ) { |
9750 | 140k | shared_ptr<TSDescriptor> ts = SelectReplica( |
9751 | 140k | ts_descs, already_selected_ts, per_table_state, global_state); |
9752 | 140k | InsertOrDie(already_selected_ts, ts->permanent_uuid()); |
9753 | | // Update the load state at global and table level. |
9754 | 140k | per_table_state->per_ts_load_[ts->permanent_uuid()]++; |
9755 | 140k | global_state->per_ts_load_[ts->permanent_uuid()]++; |
9756 | 140k | per_table_state->SortLoad(); |
9757 | | |
9758 | | // Increment the number of pending replicas so that we take this selection into |
9759 | | // account when assigning replicas for other tablets of the same table. This |
9760 | | // value decays back to 0 over time. |
9761 | 140k | ts->IncrementRecentReplicaCreations(); |
9762 | | |
9763 | 140k | TSRegistrationPB reg = ts->GetRegistration(); |
9764 | | |
9765 | 140k | RaftPeerPB *peer = config->add_peers(); |
9766 | 140k | peer->set_permanent_uuid(ts->permanent_uuid()); |
9767 | | |
9768 | | // TODO: This is temporary, we will use only UUIDs. |
9769 | 140k | TakeRegistration(reg.mutable_common(), peer); |
9770 | 140k | peer->set_member_type(member_type); |
9771 | 140k | } |
9772 | 48.8k | } |
9773 | | |
9774 | | Status CatalogManager::ConsensusStateToTabletLocations(const consensus::ConsensusStatePB& cstate, |
9775 | 216k | TabletLocationsPB* locs_pb) { |
9776 | 559k | for (const consensus::RaftPeerPB& peer : cstate.config().peers()) { |
9777 | 559k | TabletLocationsPB_ReplicaPB* replica_pb = locs_pb->add_replicas(); |
9778 | 559k | if (!peer.has_permanent_uuid()) { |
9779 | 0 | return STATUS_SUBSTITUTE(IllegalState, "Missing UUID $0", peer.ShortDebugString()); |
9780 | 0 | } |
9781 | 559k | replica_pb->set_role(GetConsensusRole(peer.permanent_uuid(), cstate)); |
9782 | 559k | if (peer.has_member_type()559k ) { |
9783 | 559k | replica_pb->set_member_type(peer.member_type()); |
9784 | 18.4E | } else { |
9785 | 18.4E | replica_pb->set_member_type(PeerMemberType::UNKNOWN_MEMBER_TYPE); |
9786 | 18.4E | } |
9787 | 559k | TSInfoPB* tsinfo_pb = replica_pb->mutable_ts_info(); |
9788 | 559k | tsinfo_pb->set_permanent_uuid(peer.permanent_uuid()); |
9789 | 559k | CopyRegistration(peer, tsinfo_pb); |
9790 | 559k | } |
9791 | 216k | return Status::OK(); |
9792 | 216k | } |
9793 | | |
9794 | | Status CatalogManager::BuildLocationsForTablet(const scoped_refptr<TabletInfo>& tablet, |
9795 | | TabletLocationsPB* locs_pb, |
9796 | 516k | IncludeInactive include_inactive) { |
9797 | 516k | { |
9798 | 516k | auto l_tablet = tablet->LockForRead(); |
9799 | 516k | if (l_tablet->is_hidden() && !include_inactive0 ) { |
9800 | 0 | return STATUS_FORMAT(NotFound, "Tablet hidden", tablet->id()); |
9801 | 0 | } |
9802 | 516k | locs_pb->set_table_id(l_tablet->pb.table_id()); |
9803 | 516k | *locs_pb->mutable_table_ids() = l_tablet->pb.table_ids(); |
9804 | 516k | } |
9805 | | |
9806 | | // For system tables, the set of replicas is always the set of masters. |
9807 | 516k | if (system_tablets_.find(tablet->id()) != system_tablets_.end()) { |
9808 | 216k | consensus::ConsensusStatePB master_consensus; |
9809 | 216k | RETURN_NOT_OK(GetCurrentConfig(&master_consensus)); |
9810 | 216k | locs_pb->set_tablet_id(tablet->tablet_id()); |
9811 | 216k | locs_pb->set_stale(false); |
9812 | 216k | const auto initial_size = locs_pb->replicas_size(); |
9813 | 216k | RETURN_NOT_OK(ConsensusStateToTabletLocations(master_consensus, locs_pb)); |
9814 | 216k | const auto capabilities = Capabilities(); |
9815 | | // Set capabilities of master node for all newly created system table locations. |
9816 | 216k | for (auto i = locs_pb->mutable_replicas()->begin() + initial_size, |
9817 | 775k | end = locs_pb->mutable_replicas()->end(); i != end; ++i558k ) { |
9818 | 558k | *i->mutable_ts_info()->mutable_capabilities() = google::protobuf::RepeatedField<CapabilityId>( |
9819 | 558k | capabilities.begin(), capabilities.end()); |
9820 | 558k | } |
9821 | 216k | return Status::OK(); |
9822 | 216k | } |
9823 | | |
9824 | 300k | TSRegistrationPB reg; |
9825 | | |
9826 | 300k | std::shared_ptr<const TabletReplicaMap> locs; |
9827 | 300k | consensus::ConsensusStatePB cstate; |
9828 | 300k | { |
9829 | 300k | auto l_tablet = tablet->LockForRead(); |
9830 | 300k | if (PREDICT_FALSE(l_tablet->is_deleted())) { |
9831 | 358 | std::vector<TabletId> split_tablet_ids; |
9832 | 358 | for (const auto& split_tablet_id : l_tablet->pb.split_tablet_ids()) { |
9833 | 4 | split_tablet_ids.push_back(split_tablet_id); |
9834 | 4 | } |
9835 | 358 | return STATUS( |
9836 | 358 | NotFound, "Tablet deleted", l_tablet->pb.state_msg(), |
9837 | 358 | SplitChildTabletIdsData(split_tablet_ids)); |
9838 | 358 | } |
9839 | | |
9840 | 299k | if (PREDICT_FALSE(!l_tablet->is_running())) { |
9841 | 9.69k | return STATUS_FORMAT(ServiceUnavailable, "Tablet $0 not running", tablet->id()); |
9842 | 9.69k | } |
9843 | | |
9844 | 290k | locs = tablet->GetReplicaLocations(); |
9845 | 290k | if (locs->empty() && l_tablet->pb.has_committed_consensus_state()398 ) { |
9846 | 398 | cstate = l_tablet->pb.committed_consensus_state(); |
9847 | 398 | } |
9848 | | |
9849 | 290k | const auto& metadata = tablet->metadata().state().pb; |
9850 | 290k | locs_pb->mutable_partition()->CopyFrom(metadata.partition()); |
9851 | 290k | locs_pb->set_split_depth(metadata.split_depth()); |
9852 | 290k | locs_pb->set_split_parent_tablet_id(metadata.split_parent_tablet_id()); |
9853 | 290k | for (const auto& split_tablet_id : metadata.split_tablet_ids()) { |
9854 | 148 | *locs_pb->add_split_tablet_ids() = split_tablet_id; |
9855 | 148 | } |
9856 | 290k | } |
9857 | | |
9858 | 0 | locs_pb->set_tablet_id(tablet->tablet_id()); |
9859 | 290k | locs_pb->set_stale(locs->empty()); |
9860 | | |
9861 | | // If the locations are cached. |
9862 | 290k | if (!locs->empty()) { |
9863 | 289k | if (cstate.IsInitialized() && |
9864 | 289k | locs->size() != implicit_cast<size_t>(cstate.config().peers_size())0 ) { |
9865 | 0 | LOG(WARNING) << "Cached tablet replicas " << locs->size() << " does not match consensus " |
9866 | 0 | << cstate.config().peers_size(); |
9867 | 0 | } |
9868 | | |
9869 | 853k | for (const auto& replica : *locs) { |
9870 | 853k | TabletLocationsPB_ReplicaPB* replica_pb = locs_pb->add_replicas(); |
9871 | 853k | replica_pb->set_role(replica.second.role); |
9872 | 853k | replica_pb->set_member_type(replica.second.member_type); |
9873 | 853k | auto tsinfo_pb = replica.second.ts_desc->GetTSInformationPB(); |
9874 | | |
9875 | 853k | TSInfoPB* out_ts_info = replica_pb->mutable_ts_info(); |
9876 | 853k | out_ts_info->set_permanent_uuid(tsinfo_pb->tserver_instance().permanent_uuid()); |
9877 | 853k | CopyRegistration(tsinfo_pb->registration().common(), out_ts_info); |
9878 | 853k | out_ts_info->set_placement_uuid(tsinfo_pb->registration().common().placement_uuid()); |
9879 | 853k | *out_ts_info->mutable_capabilities() = tsinfo_pb->registration().capabilities(); |
9880 | 853k | } |
9881 | 289k | return Status::OK(); |
9882 | 289k | } |
9883 | | |
9884 | | // If the locations were not cached. |
9885 | | // TODO: Why would this ever happen? See KUDU-759. |
9886 | 480 | if (cstate.IsInitialized()) { |
9887 | 398 | RETURN_NOT_OK(ConsensusStateToTabletLocations(cstate, locs_pb)); |
9888 | 398 | } |
9889 | | |
9890 | 480 | return Status::OK(); |
9891 | 480 | } |
9892 | | |
9893 | 1.51M | Result<shared_ptr<tablet::AbstractTablet>> CatalogManager::GetSystemTablet(const TabletId& id) { |
9894 | 1.51M | const auto iter = system_tablets_.find(id); |
9895 | 1.51M | if (iter == system_tablets_.end()) { |
9896 | 0 | return STATUS_SUBSTITUTE(InvalidArgument, "$0 is not a valid system tablet id", id); |
9897 | 0 | } |
9898 | 1.51M | return iter->second; |
9899 | 1.51M | } |
9900 | | |
9901 | | Status CatalogManager::GetTabletLocations( |
9902 | 19.7k | const TabletId& tablet_id, TabletLocationsPB* locs_pb, IncludeInactive include_inactive) { |
9903 | 19.7k | scoped_refptr<TabletInfo> tablet_info; |
9904 | 19.7k | { |
9905 | 19.7k | SharedLock lock(mutex_); |
9906 | 19.7k | if (!FindCopy(*tablet_map_, tablet_id, &tablet_info)) { |
9907 | 3.35k | return STATUS_SUBSTITUTE(NotFound, "Unknown tablet $0", tablet_id); |
9908 | 3.35k | } |
9909 | 19.7k | } |
9910 | 16.3k | Status s = GetTabletLocations(tablet_info, locs_pb, include_inactive); |
9911 | | |
9912 | 16.3k | auto num_replicas = GetReplicationFactorForTablet(tablet_info); |
9913 | 16.3k | if (num_replicas.ok()16.3k && *num_replicas > 0 && |
9914 | 16.3k | implicit_cast<size_t>(locs_pb->replicas().size()) != *num_replicas) { |
9915 | 1.05k | YB_LOG_EVERY_N_SECS(WARNING, 1) |
9916 | 405 | << "Expected replicas " << num_replicas << " but found " |
9917 | 405 | << locs_pb->replicas().size() << " for tablet " << tablet_info->id() << ": " |
9918 | 405 | << locs_pb->ShortDebugString() << THROTTLE_MSG; |
9919 | 1.05k | } |
9920 | 16.3k | return s; |
9921 | 19.7k | } |
9922 | | |
9923 | | Status CatalogManager::GetTabletLocations( |
9924 | | scoped_refptr<TabletInfo> tablet_info, |
9925 | | TabletLocationsPB* locs_pb, |
9926 | 86.0k | IncludeInactive include_inactive) { |
9927 | 86.0k | DCHECK_EQ(locs_pb->replicas().size(), 0); |
9928 | 86.0k | locs_pb->mutable_replicas()->Clear(); |
9929 | 86.0k | return BuildLocationsForTablet(tablet_info, locs_pb, include_inactive); |
9930 | 86.0k | } |
9931 | | |
9932 | | Status CatalogManager::GetTableLocations( |
9933 | | const GetTableLocationsRequestPB* req, |
9934 | 228k | GetTableLocationsResponsePB* resp) { |
9935 | 228k | VLOG(4) << "GetTableLocations: " << req->ShortDebugString()3 ; |
9936 | | |
9937 | | // If start-key is > end-key report an error instead of swap the two |
9938 | | // since probably there is something wrong app-side. |
9939 | 228k | if (req->has_partition_key_start() && req->has_partition_key_end()72.7k |
9940 | 228k | && req->partition_key_start() > req->partition_key_end()1 ) { |
9941 | 1 | return STATUS(InvalidArgument, "start partition key is greater than the end partition key"); |
9942 | 1 | } |
9943 | | |
9944 | 228k | if (req->max_returned_locations() <= 0) { |
9945 | 0 | return STATUS(InvalidArgument, "max_returned_locations must be greater than 0"); |
9946 | 0 | } |
9947 | | |
9948 | 228k | scoped_refptr<TableInfo> table = VERIFY_RESULT228k (FindTable(req->table()));228k |
9949 | | |
9950 | 228k | if (table->IsCreateInProgress()) { |
9951 | 7.31k | resp->set_creating(true); |
9952 | 7.31k | } |
9953 | | |
9954 | 228k | auto l = table->LockForRead(); |
9955 | 228k | RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp)); |
9956 | | |
9957 | 228k | vector<scoped_refptr<TabletInfo>> tablets; |
9958 | 228k | table->GetTabletsInRange(req, &tablets); |
9959 | | |
9960 | 228k | IncludeInactive include_inactive(req->has_include_inactive() && req->include_inactive()492 ); |
9961 | 228k | bool require_tablets_runnings = req->require_tablets_running(); |
9962 | | |
9963 | 228k | int expected_live_replicas = 0; |
9964 | 228k | int expected_read_replicas = 0; |
9965 | 228k | GetExpectedNumberOfReplicas(&expected_live_replicas, &expected_read_replicas); |
9966 | 366k | for (const scoped_refptr<TabletInfo>& tablet : tablets) { |
9967 | 366k | TabletLocationsPB* locs_pb = resp->add_tablet_locations(); |
9968 | 366k | locs_pb->set_expected_live_replicas(expected_live_replicas); |
9969 | 366k | locs_pb->set_expected_read_replicas(expected_read_replicas); |
9970 | 366k | auto status = BuildLocationsForTablet(tablet, locs_pb, include_inactive); |
9971 | 366k | if (!status.ok()) { |
9972 | | // Not running. |
9973 | 9.67k | if (require_tablets_runnings) { |
9974 | 9.53k | resp->mutable_tablet_locations()->Clear(); |
9975 | 9.53k | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, status); |
9976 | 9.53k | } |
9977 | 138 | resp->mutable_tablet_locations()->RemoveLast(); |
9978 | 138 | } |
9979 | 366k | } |
9980 | | |
9981 | 219k | resp->set_table_type(l->pb.table_type()); |
9982 | 219k | resp->set_partition_list_version(l->pb.partition_list_version()); |
9983 | | |
9984 | 219k | return Status::OK(); |
9985 | 228k | } |
9986 | | |
9987 | 5.43M | Status CatalogManager::GetCurrentConfig(consensus::ConsensusStatePB* cpb) const { |
9988 | 5.43M | auto tablet_peer = sys_catalog_->tablet_peer(); |
9989 | 5.43M | auto consensus = tablet_peer ? tablet_peer->shared_consensus()5.41M : nullptr18.0k ; |
9990 | 5.43M | if (!consensus) { |
9991 | 17.1k | std::string uuid = master_->fs_manager()->uuid(); |
9992 | 17.1k | return STATUS_FORMAT(IllegalState, "Node $0 peer not initialized.", uuid); |
9993 | 17.1k | } |
9994 | | |
9995 | 5.41M | *cpb = consensus->ConsensusState(CONSENSUS_CONFIG_COMMITTED); |
9996 | | |
9997 | 5.41M | return Status::OK(); |
9998 | 5.43M | } |
9999 | | |
10000 | 0 | void CatalogManager::DumpState(std::ostream* out, bool on_disk_dump) const { |
10001 | 0 | NamespaceInfoMap namespace_ids_copy; |
10002 | 0 | TableInfoMap ids_copy; |
10003 | 0 | TableInfoByNameMap names_copy; |
10004 | 0 | TabletInfoMap tablets_copy; |
10005 | | |
10006 | | // Copy the internal state so that, if the output stream blocks, |
10007 | | // we don't end up holding the lock for a long time. |
10008 | 0 | { |
10009 | 0 | SharedLock lock(mutex_); |
10010 | 0 | namespace_ids_copy = namespace_ids_map_; |
10011 | 0 | ids_copy = *table_ids_map_; |
10012 | 0 | names_copy = table_names_map_; |
10013 | 0 | tablets_copy = *tablet_map_; |
10014 | 0 | } |
10015 | |
|
10016 | 0 | *out << "Dumping current state of master.\nNamespaces:\n"; |
10017 | 0 | for (const NamespaceInfoMap::value_type& e : namespace_ids_copy) { |
10018 | 0 | NamespaceInfo* t = e.second.get(); |
10019 | 0 | auto l = t->LockForRead(); |
10020 | 0 | const NamespaceName& name = l->name(); |
10021 | |
|
10022 | 0 | *out << t->id() << ":\n"; |
10023 | 0 | *out << " name: \"" << strings::CHexEscape(name) << "\"\n"; |
10024 | 0 | *out << " metadata: " << l->pb.ShortDebugString() << "\n"; |
10025 | 0 | } |
10026 | |
|
10027 | 0 | *out << "Tables:\n"; |
10028 | 0 | for (const TableInfoMap::value_type& e : ids_copy) { |
10029 | 0 | TableInfo* t = e.second.get(); |
10030 | 0 | TabletInfos table_tablets; |
10031 | 0 | { |
10032 | 0 | auto l = t->LockForRead(); |
10033 | 0 | const TableName& name = l->name(); |
10034 | 0 | const NamespaceId& namespace_id = l->namespace_id(); |
10035 | | // Find namespace by its ID. |
10036 | 0 | scoped_refptr<NamespaceInfo> ns = FindPtrOrNull(namespace_ids_copy, namespace_id); |
10037 | |
|
10038 | 0 | *out << t->id() << ":\n"; |
10039 | 0 | *out << " namespace id: \"" << strings::CHexEscape(namespace_id) << "\"\n"; |
10040 | |
|
10041 | 0 | if (ns != nullptr) { |
10042 | 0 | *out << " namespace name: \"" << strings::CHexEscape(ns->name()) << "\"\n"; |
10043 | 0 | } |
10044 | |
|
10045 | 0 | *out << " name: \"" << strings::CHexEscape(name) << "\"\n"; |
10046 | | // Erase from the map, so later we can check that we don't have |
10047 | | // any orphaned tables in the by-name map that aren't in the |
10048 | | // by-id map. |
10049 | 0 | if (names_copy.erase({namespace_id, name}) != 1) { |
10050 | 0 | *out << " [not present in by-name map]\n"; |
10051 | 0 | } |
10052 | 0 | *out << " metadata: " << l->pb.ShortDebugString() << "\n"; |
10053 | |
|
10054 | 0 | *out << " tablets:\n"; |
10055 | 0 | table_tablets = t->GetTablets(); |
10056 | 0 | } |
10057 | 0 | for (const scoped_refptr<TabletInfo>& tablet : table_tablets) { |
10058 | 0 | auto l_tablet = tablet->LockForRead(); |
10059 | 0 | *out << " " << tablet->tablet_id() << ": " |
10060 | 0 | << l_tablet->pb.ShortDebugString() << "\n"; |
10061 | |
|
10062 | 0 | if (tablets_copy.erase(tablet->tablet_id()) != 1) { |
10063 | 0 | *out << " [ERROR: not present in CM tablet map!]\n"; |
10064 | 0 | } |
10065 | 0 | } |
10066 | 0 | } |
10067 | |
|
10068 | 0 | if (!tablets_copy.empty()) { |
10069 | 0 | *out << "Orphaned tablets (not referenced by any table):\n"; |
10070 | 0 | for (const TabletInfoMap::value_type& entry : tablets_copy) { |
10071 | 0 | const scoped_refptr<TabletInfo>& tablet = entry.second; |
10072 | 0 | auto l_tablet = tablet->LockForRead(); |
10073 | 0 | *out << " " << tablet->tablet_id() << ": " |
10074 | 0 | << l_tablet->pb.ShortDebugString() << "\n"; |
10075 | 0 | } |
10076 | 0 | } |
10077 | |
|
10078 | 0 | if (!names_copy.empty()) { |
10079 | 0 | *out << "Orphaned tables (in by-name map, but not id map):\n"; |
10080 | 0 | for (const TableInfoByNameMap::value_type& e : names_copy) { |
10081 | 0 | *out << e.second->id() << ":\n"; |
10082 | 0 | *out << " namespace id: \"" << strings::CHexEscape(e.first.first) << "\"\n"; |
10083 | 0 | *out << " name: \"" << CHexEscape(e.first.second) << "\"\n"; |
10084 | 0 | } |
10085 | 0 | } |
10086 | |
|
10087 | 0 | master_->DumpMasterOptionsInfo(out); |
10088 | |
|
10089 | 0 | if (on_disk_dump) { |
10090 | 0 | consensus::ConsensusStatePB cur_consensus_state; |
10091 | | // TODO: proper error handling below. |
10092 | 0 | CHECK_OK(GetCurrentConfig(&cur_consensus_state)); |
10093 | 0 | *out << "Current raft config: " << cur_consensus_state.ShortDebugString() << "\n"; |
10094 | 0 | } |
10095 | 0 | } |
10096 | | |
10097 | | Status CatalogManager::PeerStateDump(const vector<RaftPeerPB>& peers, |
10098 | | const DumpMasterStateRequestPB* req, |
10099 | 0 | DumpMasterStateResponsePB* resp) { |
10100 | 0 | std::unique_ptr<MasterClusterProxy> peer_proxy; |
10101 | 0 | Endpoint sockaddr; |
10102 | 0 | MonoTime timeout = MonoTime::Now(); |
10103 | 0 | DumpMasterStateRequestPB peer_req; |
10104 | 0 | rpc::RpcController rpc; |
10105 | |
|
10106 | 0 | timeout.AddDelta(MonoDelta::FromMilliseconds(FLAGS_master_ts_rpc_timeout_ms)); |
10107 | 0 | rpc.set_deadline(timeout); |
10108 | 0 | peer_req.set_on_disk(req->on_disk()); |
10109 | 0 | peer_req.set_return_dump_as_string(req->return_dump_as_string()); |
10110 | 0 | string dump; |
10111 | |
|
10112 | 0 | for (const RaftPeerPB& peer : peers) { |
10113 | 0 | HostPort hostport = HostPortFromPB(DesiredHostPort(peer, master_->MakeCloudInfoPB())); |
10114 | 0 | peer_proxy = std::make_unique<MasterClusterProxy>(&master_->proxy_cache(), hostport); |
10115 | |
|
10116 | 0 | DumpMasterStateResponsePB peer_resp; |
10117 | 0 | rpc.Reset(); |
10118 | |
|
10119 | 0 | RETURN_NOT_OK(peer_proxy->DumpState(peer_req, &peer_resp, &rpc)); |
10120 | | |
10121 | 0 | if (peer_resp.has_error()) { |
10122 | 0 | LOG(WARNING) << "Hit err " << peer_resp.ShortDebugString() << " during peer " |
10123 | 0 | << peer.ShortDebugString() << " state dump."; |
10124 | 0 | return StatusFromPB(peer_resp.error().status()); |
10125 | 0 | } else if (req->return_dump_as_string()) { |
10126 | 0 | dump += peer_resp.dump(); |
10127 | 0 | } |
10128 | 0 | } |
10129 | | |
10130 | 0 | if (req->return_dump_as_string()) { |
10131 | 0 | resp->set_dump(resp->dump() + dump); |
10132 | 0 | } |
10133 | 0 | return Status::OK(); |
10134 | 0 | } |
10135 | | |
10136 | 1.56M | void CatalogManager::ReportMetrics() { |
10137 | | // Report metrics on how many tservers are alive. |
10138 | 1.56M | TSDescriptorVector ts_descs; |
10139 | 1.56M | master_->ts_manager()->GetAllLiveDescriptors(&ts_descs); |
10140 | 1.56M | const auto num_live_servers = ts_descs.size(); |
10141 | 1.56M | metric_num_tablet_servers_live_->set_value(narrow_cast<uint32_t>(num_live_servers)); |
10142 | | |
10143 | 1.56M | master_->ts_manager()->GetAllDescriptors(&ts_descs); |
10144 | 1.56M | metric_num_tablet_servers_dead_->set_value( |
10145 | 1.56M | narrow_cast<uint32_t>(ts_descs.size() - num_live_servers)); |
10146 | 1.56M | } |
10147 | | |
10148 | 994k | void CatalogManager::ResetMetrics() { |
10149 | 994k | metric_num_tablet_servers_live_->set_value(0); |
10150 | 994k | metric_num_tablet_servers_dead_->set_value(0); |
10151 | 994k | } |
10152 | | |
10153 | | |
10154 | 377k | std::string CatalogManager::LogPrefix() const { |
10155 | 377k | if (tablet_peer()) { |
10156 | 377k | return consensus::MakeTabletLogPrefix( |
10157 | 377k | tablet_peer()->tablet_id(), tablet_peer()->permanent_uuid()); |
10158 | 377k | } else { |
10159 | 58 | return consensus::MakeTabletLogPrefix( |
10160 | 58 | kSysCatalogTabletId, master_->fs_manager()->uuid()); |
10161 | 58 | } |
10162 | 377k | } |
10163 | | |
10164 | 0 | void CatalogManager::SetLoadBalancerEnabled(bool is_enabled) { |
10165 | 0 | load_balance_policy_->SetLoadBalancerEnabled(is_enabled); |
10166 | 0 | } |
10167 | | |
10168 | 1 | bool CatalogManager::IsLoadBalancerEnabled() { |
10169 | 1 | return load_balance_policy_->IsLoadBalancerEnabled(); |
10170 | 1 | } |
10171 | | |
10172 | 1.54M | MonoDelta CatalogManager::TimeSinceElectedLeader() { |
10173 | 1.54M | return MonoTime::Now() - time_elected_leader_.load(); |
10174 | 1.54M | } |
10175 | | |
10176 | 37 | Status CatalogManager::GoIntoShellMode() { |
10177 | 37 | if (master_->IsShellMode()) { |
10178 | 0 | return STATUS(IllegalState, "Master is already in shell mode."); |
10179 | 0 | } |
10180 | | |
10181 | 37 | LOG(INFO) << "Starting going into shell mode."; |
10182 | 37 | master_->SetShellMode(true); |
10183 | | |
10184 | 37 | { |
10185 | 37 | LockGuard lock(mutex_); |
10186 | 37 | RETURN_NOT_OK(sys_catalog_->GoIntoShellMode()); |
10187 | 37 | background_tasks_->Shutdown(); |
10188 | 37 | background_tasks_.reset(); |
10189 | 37 | } |
10190 | 0 | { |
10191 | 37 | std::lock_guard<std::mutex> l(remote_bootstrap_mtx_); |
10192 | 37 | tablet_exists_ = false; |
10193 | 37 | } |
10194 | | |
10195 | 37 | LOG(INFO) << "Done going into shell mode."; |
10196 | | |
10197 | 37 | return Status::OK(); |
10198 | 37 | } |
10199 | | |
10200 | 334 | Status CatalogManager::GetClusterConfig(GetMasterClusterConfigResponsePB* resp) { |
10201 | 334 | return GetClusterConfig(resp->mutable_cluster_config()); |
10202 | 334 | } |
10203 | | |
10204 | 7.14M | Status CatalogManager::GetClusterConfig(SysClusterConfigEntryPB* config) { |
10205 | 7.14M | auto cluster_config = ClusterConfig(); |
10206 | 7.14M | DCHECK(cluster_config) << "Missing cluster config for master!"934 ; |
10207 | 7.14M | auto l = cluster_config->LockForRead(); |
10208 | 7.14M | *config = l->pb; |
10209 | 7.14M | return Status::OK(); |
10210 | 7.14M | } |
10211 | | |
10212 | | Status CatalogManager::SetClusterConfig( |
10213 | 172 | const ChangeMasterClusterConfigRequestPB* req, ChangeMasterClusterConfigResponsePB* resp) { |
10214 | 172 | SysClusterConfigEntryPB config(req->cluster_config()); |
10215 | | |
10216 | 172 | if (config.has_server_blacklist()) { |
10217 | 27 | config.mutable_server_blacklist()->set_initial_replica_load(narrow_cast<int32_t>( |
10218 | 27 | GetNumRelevantReplicas(config.server_blacklist(), false /* leaders_only */))); |
10219 | 27 | LOG(INFO) << Format("Set blacklist of total tservers: $0, with initial load: $1", |
10220 | 27 | config.server_blacklist().hosts().size(), |
10221 | 27 | config.server_blacklist().initial_replica_load()); |
10222 | 27 | } |
10223 | 172 | if (config.has_leader_blacklist()) { |
10224 | 22 | config.mutable_leader_blacklist()->set_initial_leader_load(narrow_cast<int32_t>( |
10225 | 22 | GetNumRelevantReplicas(config.leader_blacklist(), true /* leaders_only */))); |
10226 | 22 | LOG(INFO) << Format("Set leader blacklist of total tservers: $0, with initial load: $1", |
10227 | 22 | config.leader_blacklist().hosts().size(), |
10228 | 22 | config.leader_blacklist().initial_leader_load()); |
10229 | 22 | } |
10230 | | |
10231 | 172 | auto cluster_config = ClusterConfig(); |
10232 | 172 | auto l = cluster_config->LockForWrite(); |
10233 | | // We should only set the config, if the caller provided us with a valid update to the |
10234 | | // existing config. |
10235 | 172 | if (l->pb.version() != config.version()) { |
10236 | 0 | Status s = STATUS_SUBSTITUTE(IllegalState, |
10237 | 0 | "Config version does not match, got $0, but most recent one is $1. Should call Get again", |
10238 | 0 | config.version(), l->pb.version()); |
10239 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::CONFIG_VERSION_MISMATCH, s); |
10240 | 0 | } |
10241 | | |
10242 | 172 | if (config.cluster_uuid() != l->pb.cluster_uuid()) { |
10243 | 1 | Status s = STATUS(InvalidArgument, "Config cluster UUID cannot be updated"); |
10244 | 1 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s); |
10245 | 1 | } |
10246 | | |
10247 | | // TODO(bogdan): should this live here? |
10248 | 171 | const ReplicationInfoPB& replication_info = config.replication_info(); |
10249 | 188 | for (int i = 0; i < replication_info.read_replicas_size(); i++17 ) { |
10250 | 17 | if (!replication_info.read_replicas(i).has_placement_uuid()) { |
10251 | 0 | Status s = STATUS(IllegalState, |
10252 | 0 | "All read-only clusters must have a placement uuid specified"); |
10253 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s); |
10254 | 0 | } |
10255 | 17 | } |
10256 | | |
10257 | | // Validate placement information according to rules defined. |
10258 | 171 | if (replication_info.has_live_replicas()) { |
10259 | 134 | Status s = CatalogManagerUtil::IsPlacementInfoValid(replication_info.live_replicas()); |
10260 | 134 | if (!s.ok()) { |
10261 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s); |
10262 | 0 | } |
10263 | 134 | } |
10264 | | |
10265 | 171 | l.mutable_data()->pb.CopyFrom(config); |
10266 | | // Bump the config version, to indicate an update. |
10267 | 171 | l.mutable_data()->pb.set_version(config.version() + 1); |
10268 | | |
10269 | 171 | LOG(INFO) << "Updating cluster config to " << config.version() + 1; |
10270 | | |
10271 | 171 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), cluster_config.get())); |
10272 | | |
10273 | 171 | l.Commit(); |
10274 | | |
10275 | 171 | return Status::OK(); |
10276 | 171 | } |
10277 | | |
10278 | | Status CatalogManager::ValidateReplicationInfo( |
10279 | 56.8k | const ValidateReplicationInfoRequestPB* req, ValidateReplicationInfoResponsePB* resp) { |
10280 | 56.8k | TSDescriptorVector all_ts_descs; |
10281 | 56.8k | { |
10282 | 56.8k | BlacklistSet blacklist = VERIFY_RESULT(BlacklistSetFromPB()); |
10283 | 0 | master_->ts_manager()->GetAllLiveDescriptors(&all_ts_descs, blacklist); |
10284 | 56.8k | } |
10285 | | // We don't need any validation checks for read replica placements |
10286 | | // because they aren't a part of any raft quorum underneath. |
10287 | | // Technically, it is ok to have even 0 read replica nodes for them upfront. |
10288 | | // We only need it for the primary cluster replicas. |
10289 | 0 | TSDescriptorVector ts_descs; |
10290 | 56.8k | GetTsDescsFromPlacementInfo(req->replication_info().live_replicas(), all_ts_descs, &ts_descs); |
10291 | 56.8k | Status s = CheckValidPlacementInfo(req->replication_info().live_replicas(), all_ts_descs, resp); |
10292 | 56.8k | if (!s.ok()) { |
10293 | 34 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_TABLE_REPLICATION_INFO, s); |
10294 | 34 | } |
10295 | 56.8k | return Status::OK(); |
10296 | 56.8k | } |
10297 | | |
10298 | | Status CatalogManager::SetPreferredZones( |
10299 | 3 | const SetPreferredZonesRequestPB* req, SetPreferredZonesResponsePB* resp) { |
10300 | 3 | auto cluster_config = ClusterConfig(); |
10301 | 3 | auto l = cluster_config->LockForWrite(); |
10302 | 3 | auto replication_info = l.mutable_data()->pb.mutable_replication_info(); |
10303 | 3 | replication_info->clear_affinitized_leaders(); |
10304 | | |
10305 | 5 | for (const auto& cloud_info : req->preferred_zones()) { |
10306 | 5 | const auto& placement_info = replication_info->live_replicas(); |
10307 | 5 | if (!CatalogManagerUtil::DoesPlacementInfoContainCloudInfo(placement_info, cloud_info)) { |
10308 | 0 | Status s = STATUS_FORMAT(InvalidArgument, "Placement info $0 does not contain cloud info $1", |
10309 | 0 | placement_info, TSDescriptor::generate_placement_id(cloud_info)); |
10310 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s); |
10311 | 0 | } |
10312 | 5 | *replication_info->add_affinitized_leaders() = cloud_info; |
10313 | 5 | } |
10314 | | |
10315 | 3 | l.mutable_data()->pb.set_version(l.mutable_data()->pb.version() + 1); |
10316 | | |
10317 | 3 | LOG(INFO) << "Updating cluster config to " << l.mutable_data()->pb.version(); |
10318 | | |
10319 | 3 | Status s = sys_catalog_->Upsert(leader_ready_term(), cluster_config.get()); |
10320 | 3 | if (!s.ok()) { |
10321 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s); |
10322 | 0 | } |
10323 | | |
10324 | 3 | l.Commit(); |
10325 | | |
10326 | 3 | return Status::OK(); |
10327 | 3 | } |
10328 | | |
10329 | 55.7k | Result<size_t> CatalogManager::GetReplicationFactor() { |
10330 | 55.7k | auto cluster_config = ClusterConfig(); |
10331 | 55.7k | DCHECK(cluster_config) << "Missing cluster config for master!"0 ; |
10332 | 55.7k | auto l = cluster_config->LockForRead(); |
10333 | 55.7k | const ReplicationInfoPB& replication_info = l->pb.replication_info(); |
10334 | 55.7k | return GetNumReplicasFromPlacementInfo(replication_info.live_replicas()); |
10335 | 55.7k | } |
10336 | | |
10337 | | Result<size_t> CatalogManager::GetReplicationFactorForTablet( |
10338 | 16.3k | const scoped_refptr<TabletInfo>& tablet) { |
10339 | | // For system tables, the set of replicas is always the set of masters. |
10340 | 16.3k | if (system_tablets_.find(tablet->id()) != system_tablets_.end()) { |
10341 | 481 | consensus::ConsensusStatePB master_consensus; |
10342 | 481 | RETURN_NOT_OK(GetCurrentConfig(&master_consensus)); |
10343 | 481 | return master_consensus.config().peers().size(); |
10344 | 481 | } |
10345 | 15.9k | int num_live_replicas = 0, num_read_replicas = 0; |
10346 | 15.9k | GetExpectedNumberOfReplicas(&num_live_replicas, &num_read_replicas); |
10347 | 15.9k | return num_live_replicas + num_read_replicas; |
10348 | 16.3k | } |
10349 | | |
10350 | 259k | void CatalogManager::GetExpectedNumberOfReplicas(int* num_live_replicas, int* num_read_replicas) { |
10351 | 259k | auto l = ClusterConfig()->LockForRead(); |
10352 | 259k | const ReplicationInfoPB& replication_info = l->pb.replication_info(); |
10353 | 259k | *num_live_replicas = narrow_cast<int>(GetNumReplicasFromPlacementInfo( |
10354 | 259k | replication_info.live_replicas())); |
10355 | 259k | for (const auto& read_replica_placement_info : replication_info.read_replicas()) { |
10356 | 1.93k | *num_read_replicas += read_replica_placement_info.num_replicas(); |
10357 | 1.93k | } |
10358 | 259k | } |
10359 | | |
10360 | 6.81k | Result<string> CatalogManager::placement_uuid() const { |
10361 | 6.81k | auto cluster_config = ClusterConfig(); |
10362 | 6.81k | if (!cluster_config) { |
10363 | 0 | return STATUS(IllegalState, "Missing cluster config for master!"); |
10364 | 0 | } |
10365 | 6.81k | auto l = cluster_config->LockForRead(); |
10366 | 6.81k | const ReplicationInfoPB& replication_info = l->pb.replication_info(); |
10367 | 6.81k | return replication_info.live_replicas().placement_uuid(); |
10368 | 6.81k | } |
10369 | | |
10370 | | Status CatalogManager::IsLoadBalanced(const IsLoadBalancedRequestPB* req, |
10371 | 239 | IsLoadBalancedResponsePB* resp) { |
10372 | 239 | if (req->has_expected_num_servers()) { |
10373 | 238 | TSDescriptorVector ts_descs; |
10374 | 238 | master_->ts_manager()->GetAllLiveDescriptors(&ts_descs); |
10375 | | |
10376 | 238 | if (implicit_cast<size_t>(req->expected_num_servers()) > ts_descs.size()) { |
10377 | 9 | Status s = STATUS_SUBSTITUTE(IllegalState, |
10378 | 9 | "Found $0, which is below the expected number of servers $1.", |
10379 | 9 | ts_descs.size(), req->expected_num_servers()); |
10380 | 9 | return SetupError(resp->mutable_error(), MasterErrorPB::CAN_RETRY_LOAD_BALANCE_CHECK, s); |
10381 | 9 | } |
10382 | 238 | } |
10383 | | |
10384 | 230 | Status s = load_balance_policy_->IsIdle(); |
10385 | 230 | if (!s.ok()) { |
10386 | 194 | return SetupError(resp->mutable_error(), MasterErrorPB::CAN_RETRY_LOAD_BALANCE_CHECK, s); |
10387 | 194 | } |
10388 | | |
10389 | 36 | return Status::OK(); |
10390 | 230 | } |
10391 | | |
10392 | | Status CatalogManager::IsLoadBalancerIdle(const IsLoadBalancerIdleRequestPB* req, |
10393 | 2.48k | IsLoadBalancerIdleResponsePB* resp) { |
10394 | 2.48k | Status s = load_balance_policy_->IsIdle(); |
10395 | 2.48k | if (!s.ok()) { |
10396 | 1.98k | return SetupError(resp->mutable_error(), MasterErrorPB::LOAD_BALANCER_RECENTLY_ACTIVE, s); |
10397 | 1.98k | } |
10398 | | |
10399 | 502 | return Status::OK(); |
10400 | 2.48k | } |
10401 | | |
10402 | | Status CatalogManager::AreLeadersOnPreferredOnly(const AreLeadersOnPreferredOnlyRequestPB* req, |
10403 | 147 | AreLeadersOnPreferredOnlyResponsePB* resp) { |
10404 | | // If we have cluster replication info, then only fetch live tservers (ignore read replicas). |
10405 | 147 | TSDescriptorVector ts_descs; |
10406 | 147 | string live_replicas_placement_uuid = ""; |
10407 | 147 | { |
10408 | 147 | auto l = ClusterConfig()->LockForRead(); |
10409 | 147 | const ReplicationInfoPB& cluster_replication_info = l->pb.replication_info(); |
10410 | 147 | if (cluster_replication_info.has_live_replicas()) { |
10411 | 109 | live_replicas_placement_uuid = cluster_replication_info.live_replicas().placement_uuid(); |
10412 | 109 | } |
10413 | 147 | } |
10414 | | |
10415 | 147 | { |
10416 | 147 | BlacklistSet blacklist = VERIFY_RESULT(BlacklistSetFromPB()); |
10417 | 147 | if (live_replicas_placement_uuid.empty()) { |
10418 | 146 | master_->ts_manager()->GetAllLiveDescriptors(&ts_descs, blacklist); |
10419 | 146 | } else { |
10420 | 1 | master_->ts_manager()->GetAllLiveDescriptorsInCluster( |
10421 | 1 | &ts_descs, live_replicas_placement_uuid, |
10422 | 1 | blacklist); |
10423 | 1 | } |
10424 | 147 | } |
10425 | | |
10426 | | // Only need to fetch if txn tables are not using preferred zones. |
10427 | 0 | vector<TableInfoPtr> tables; |
10428 | 147 | if (!FLAGS_transaction_tables_use_preferred_zones) { |
10429 | 147 | tables = master_->catalog_manager()->GetTables(GetTablesMode::kRunning); |
10430 | 147 | } |
10431 | | |
10432 | 147 | auto l = ClusterConfig()->LockForRead(); |
10433 | 147 | Status s = CatalogManagerUtil::AreLeadersOnPreferredOnly( |
10434 | 147 | ts_descs, l->pb.replication_info(), tables); |
10435 | 147 | if (!s.ok()) { |
10436 | 132 | return SetupError( |
10437 | 132 | resp->mutable_error(), MasterErrorPB::CAN_RETRY_ARE_LEADERS_ON_PREFERRED_ONLY_CHECK, s); |
10438 | 132 | } |
10439 | | |
10440 | 15 | return Status::OK(); |
10441 | 147 | } |
10442 | | |
10443 | 2.49k | int64_t CatalogManager::GetNumRelevantReplicas(const BlacklistPB& blacklist, bool leaders_only) { |
10444 | 2.49k | int64_t res = 0; |
10445 | 2.49k | SharedLock lock(mutex_); |
10446 | 87.1k | for (const TabletInfoMap::value_type& entry : *tablet_map_) { |
10447 | 87.1k | scoped_refptr<TabletInfo> tablet = entry.second; |
10448 | 87.1k | auto l = tablet->LockForRead(); |
10449 | | // Not checking being created on purpose as we do not want initial load to be under accounted. |
10450 | 87.1k | if (!tablet->table() || |
10451 | 87.1k | PREDICT_FALSE(l->is_deleted())) { |
10452 | 0 | continue; |
10453 | 0 | } |
10454 | | |
10455 | 87.1k | auto locs = tablet->GetReplicaLocations(); |
10456 | 134k | for (const auto& replica : *locs) { |
10457 | 134k | if (leaders_only && replica.second.role != PeerRole::LEADER21.9k ) { |
10458 | 14.6k | continue; |
10459 | 14.6k | } |
10460 | 368k | for (int i = 0; 119k i < blacklist.hosts_size(); i++248k ) { |
10461 | 295k | if (replica.second.ts_desc->IsRunningOn(blacklist.hosts(i))) { |
10462 | 46.9k | ++res; |
10463 | 46.9k | break; |
10464 | 46.9k | } |
10465 | 295k | } |
10466 | 119k | } |
10467 | 87.1k | } |
10468 | | |
10469 | 2.49k | return res; |
10470 | 2.49k | } |
10471 | | |
10472 | | Status CatalogManager::FillHeartbeatResponse(const TSHeartbeatRequestPB* req, |
10473 | 0 | TSHeartbeatResponsePB* resp) { |
10474 | 0 | return Status::OK(); |
10475 | 0 | } |
10476 | | |
10477 | 2.06k | Status CatalogManager::GetLoadMoveCompletionPercent(GetLoadMovePercentResponsePB* resp) { |
10478 | 2.06k | return GetLoadMoveCompletionPercent(resp, false); |
10479 | 2.06k | } |
10480 | | |
10481 | 388 | Status CatalogManager::GetLeaderBlacklistCompletionPercent(GetLoadMovePercentResponsePB* resp) { |
10482 | 388 | return GetLoadMoveCompletionPercent(resp, true); |
10483 | 388 | } |
10484 | | |
10485 | | Status CatalogManager::GetLoadMoveCompletionPercent(GetLoadMovePercentResponsePB* resp, |
10486 | 2.44k | bool blacklist_leader) { |
10487 | 2.44k | auto l = ClusterConfig()->LockForRead(); |
10488 | | |
10489 | | // Fine to pass in empty defaults if server_blacklist or leader_blacklist is not filled. |
10490 | 2.44k | const BlacklistPB& state = blacklist_leader ? l->pb.leader_blacklist()388 : l->pb.server_blacklist()2.06k ; |
10491 | 2.44k | int64_t blacklist_replicas = GetNumRelevantReplicas(state, blacklist_leader); |
10492 | 2.44k | int64_t initial_load = (blacklist_leader) ? |
10493 | 2.06k | state.initial_leader_load()388 : state.initial_replica_load(); |
10494 | | // If we are starting up and don't find any load on the tservers, return progress as 0. |
10495 | | // We expect that by blacklist_progress_initial_delay_secs time, this should go away and if the |
10496 | | // load is reported as 0 on the blacklisted tservers after this time then it means that |
10497 | | // the transfer is successfully complete. |
10498 | 2.44k | if (blacklist_replicas == 0 && |
10499 | 2.44k | TimeSinceElectedLeader() <= MonoDelta::FromSeconds(FLAGS_blacklist_progress_initial_delay_secs)1.19k ) { |
10500 | 970 | LOG(INFO) << "Master leadership has changed. Reporting progress as 0 until the catalog " << |
10501 | 970 | "manager gets the correct estimates of the remaining load on the blacklisted" << |
10502 | 970 | "tservers."; |
10503 | 970 | resp->set_percent(0); |
10504 | 970 | resp->set_total(initial_load); |
10505 | 970 | resp->set_remaining(initial_load); |
10506 | 970 | return Status::OK(); |
10507 | 970 | } |
10508 | | |
10509 | | // On change of master leader, initial_load_ information may be lost temporarily. Reset to |
10510 | | // current value to avoid reporting progress percent as 100. Note that doing so will report |
10511 | | // progress percent as 0 instead. |
10512 | | // TODO(Sanket): This might be no longer relevant after we persist and load the initial load |
10513 | | // on failover. Need to investigate. |
10514 | 1.47k | if (initial_load < blacklist_replicas) { |
10515 | 0 | LOG(INFO) << Format("Initial load: $0, current load: $1." |
10516 | 0 | " Initial load is less than the current load. Probably a master leader change." |
10517 | 0 | " Reporting progress as 0", state.initial_replica_load(), |
10518 | 0 | blacklist_replicas); |
10519 | 0 | initial_load = blacklist_replicas; |
10520 | 0 | } |
10521 | | |
10522 | 1.47k | LOG(INFO) << "Blacklisted count " << blacklist_replicas |
10523 | 1.47k | << " across " << state.hosts_size() |
10524 | 1.47k | << " servers, with initial load " << initial_load; |
10525 | | |
10526 | | // Case when a blacklisted servers did not have any starting load. |
10527 | 1.47k | if (initial_load == 0) { |
10528 | 64 | resp->set_percent(100); |
10529 | 64 | return Status::OK(); |
10530 | 64 | } |
10531 | | |
10532 | 1.41k | resp->set_percent( |
10533 | 1.41k | 100 - (static_cast<double>(blacklist_replicas) * 100 / initial_load)); |
10534 | 1.41k | resp->set_remaining(blacklist_replicas); |
10535 | 1.41k | resp->set_total(initial_load); |
10536 | | |
10537 | 1.41k | return Status::OK(); |
10538 | 1.47k | } |
10539 | | |
10540 | 3.10k | void CatalogManager::AbortAndWaitForAllTasks(const vector<scoped_refptr<TableInfo>>& tables) { |
10541 | 3.61k | for (const auto& t : tables) { |
10542 | 3.61k | VLOG(1) << "Aborting tasks for table " << t->ToString()0 ; |
10543 | 3.61k | t->AbortTasksAndClose(); |
10544 | 3.61k | } |
10545 | 3.61k | for (const auto& t : tables) { |
10546 | 3.61k | VLOG(1) << "Waiting on Aborting tasks for table " << t->ToString()0 ; |
10547 | 3.61k | t->WaitTasksCompletion(); |
10548 | 3.61k | } |
10549 | 3.10k | VLOG(1) << "Waiting on Aborting tasks done"0 ; |
10550 | 3.10k | } |
10551 | | |
10552 | 497k | void CatalogManager::HandleNewTableId(const TableId& table_id) { |
10553 | 497k | if (table_id == kPgProcTableId) { |
10554 | | // Needed to track whether initdb has started running. |
10555 | 763 | pg_proc_exists_.store(true, std::memory_order_release); |
10556 | 763 | } |
10557 | 497k | } |
10558 | | |
10559 | 499k | scoped_refptr<TableInfo> CatalogManager::NewTableInfo(TableId id) { |
10560 | 499k | return make_scoped_refptr<TableInfo>(id, tasks_tracker_); |
10561 | 499k | } |
10562 | | |
10563 | 363k | Status CatalogManager::ScheduleTask(std::shared_ptr<RetryingTSRpcTask> task) { |
10564 | 363k | Status s = async_task_pool_->SubmitFunc([task]() { |
10565 | 363k | WARN_NOT_OK(task->Run(), "Failed task"); |
10566 | 363k | }); |
10567 | | // If we are not able to enqueue, abort the task. |
10568 | 363k | if (!s.ok()) { |
10569 | 0 | task->AbortAndReturnPrevState(s); |
10570 | 0 | } |
10571 | 363k | return s; |
10572 | 363k | } |
10573 | | |
10574 | | Status CatalogManager::CollectTable( |
10575 | | const TableDescription& table_description, |
10576 | | CollectFlags flags, |
10577 | | std::vector<TableDescription>* all_tables, |
10578 | 45 | std::unordered_set<NamespaceId>* parent_colocated_table_ids) { |
10579 | 45 | auto lock = table_description.table_info->LockForRead(); |
10580 | 45 | if (lock->started_hiding()) { |
10581 | 4 | VLOG_WITH_PREFIX_AND_FUNC0 (4) |
10582 | 0 | << "Rejected hidden table: " << AsString(table_description.table_info); |
10583 | 4 | return Status::OK(); |
10584 | 4 | } |
10585 | 41 | if (lock->started_deleting()) { |
10586 | 0 | VLOG_WITH_PREFIX_AND_FUNC(4) |
10587 | 0 | << "Rejected deleted table: " << AsString(table_description.table_info); |
10588 | 0 | return Status::OK(); |
10589 | 0 | } |
10590 | 41 | if (flags.Test(CollectFlag::kIncludeParentColocatedTable) && lock->pb.colocated()30 ) { |
10591 | | // If a table is colocated, add its parent colocated table as well. |
10592 | 0 | const auto parent_table_id = |
10593 | 0 | table_description.namespace_info->id() + kColocatedParentTableIdSuffix; |
10594 | 0 | auto result = parent_colocated_table_ids->insert(parent_table_id); |
10595 | 0 | if (result.second) { |
10596 | | // We have not processed this parent table id yet, so do that now. |
10597 | 0 | TableIdentifierPB parent_table_pb; |
10598 | 0 | parent_table_pb.set_table_id(parent_table_id); |
10599 | 0 | parent_table_pb.mutable_namespace_()->set_id(table_description.namespace_info->id()); |
10600 | 0 | all_tables->push_back(VERIFY_RESULT(DescribeTable( |
10601 | 0 | parent_table_pb, flags.Test(CollectFlag::kSucceedIfCreateInProgress)))); |
10602 | 0 | } |
10603 | 0 | } |
10604 | 41 | all_tables->push_back(table_description); |
10605 | | |
10606 | 41 | if (flags.Test(CollectFlag::kAddIndexes)) { |
10607 | 6 | TRACE(Substitute("Locking object with id $0", table_description.table_info->id())); |
10608 | | |
10609 | 6 | if (lock->is_index()) { |
10610 | 0 | return STATUS(InvalidArgument, "Expected table, but found index", |
10611 | 0 | table_description.table_info->id(), |
10612 | 0 | MasterError(MasterErrorPB::INVALID_TABLE_TYPE)); |
10613 | 0 | } |
10614 | | |
10615 | 6 | if (lock->table_type() == PGSQL_TABLE_TYPE) { |
10616 | 0 | return STATUS(InvalidArgument, "Getting indexes for YSQL table is not supported", |
10617 | 0 | table_description.table_info->id(), |
10618 | 0 | MasterError(MasterErrorPB::INVALID_TABLE_TYPE)); |
10619 | 0 | } |
10620 | | |
10621 | 6 | auto collect_index_flags = flags; |
10622 | | // Don't need to collect indexes for index. |
10623 | 6 | collect_index_flags.Reset(CollectFlag::kAddIndexes); |
10624 | 6 | for (const auto& index_info : lock->pb.indexes()) { |
10625 | 0 | LOG_IF(DFATAL, table_description.table_info->id() != index_info.indexed_table_id()) |
10626 | 0 | << "Wrong indexed table id in index descriptor"; |
10627 | 0 | TableIdentifierPB index_id_pb; |
10628 | 0 | index_id_pb.set_table_id(index_info.table_id()); |
10629 | 0 | index_id_pb.mutable_namespace_()->set_id(table_description.namespace_info->id()); |
10630 | 0 | auto index_description = VERIFY_RESULT(DescribeTable( |
10631 | 0 | index_id_pb, flags.Test(CollectFlag::kSucceedIfCreateInProgress))); |
10632 | 0 | RETURN_NOT_OK(CollectTable( |
10633 | 0 | index_description, collect_index_flags, all_tables, parent_colocated_table_ids)); |
10634 | 0 | } |
10635 | 6 | } |
10636 | | |
10637 | 41 | return Status::OK(); |
10638 | 41 | } |
10639 | | |
10640 | | Result<vector<TableDescription>> CatalogManager::CollectTables( |
10641 | | const google::protobuf::RepeatedPtrField<TableIdentifierPB>& table_identifiers, |
10642 | | CollectFlags flags, |
10643 | 51 | std::unordered_set<NamespaceId>* namespaces) { |
10644 | 51 | std::vector<std::pair<TableInfoPtr, CollectFlags>> table_with_flags; |
10645 | | |
10646 | 51 | { |
10647 | 51 | SharedLock lock(mutex_); |
10648 | 53 | for (const auto& table_id_pb : table_identifiers) { |
10649 | 53 | if (table_id_pb.table_name().empty() && table_id_pb.table_id().empty()47 && |
10650 | 53 | table_id_pb.has_namespace_()39 ) { |
10651 | 39 | auto namespace_info = FindNamespaceUnlocked(table_id_pb.namespace_()); |
10652 | 39 | if (!namespace_info.ok()) { |
10653 | 0 | if (namespace_info.status().IsNotFound()) { |
10654 | 0 | continue; |
10655 | 0 | } |
10656 | 0 | return namespace_info.status(); |
10657 | 0 | } |
10658 | 39 | if (namespaces) { |
10659 | 39 | namespaces->insert((**namespace_info).id()); |
10660 | 39 | } |
10661 | | |
10662 | | |
10663 | 39 | auto ns_collect_flags = flags; |
10664 | | // Don't collect indexes, since they should be in the same namespace and will be collected |
10665 | | // as regular tables. |
10666 | | // It is necessary because we don't support kAddIndexes for YSQL tables. |
10667 | 39 | ns_collect_flags.Reset(CollectFlag::kAddIndexes); |
10668 | 39 | VLOG_WITH_PREFIX_AND_FUNC0 (1) |
10669 | 0 | << "Collecting all tables from: " << (**namespace_info).ToString() << ", specified as: " |
10670 | 0 | << table_id_pb.namespace_().ShortDebugString(); |
10671 | 8.74k | for (const auto& id_and_table : *table_ids_map_) { |
10672 | 8.74k | if (id_and_table.second->is_system()) { |
10673 | 8.67k | VLOG_WITH_PREFIX_AND_FUNC0 (4) << "Rejected system table: " << AsString(id_and_table)0 ; |
10674 | 8.67k | continue; |
10675 | 8.67k | } |
10676 | 64 | auto lock = id_and_table.second->LockForRead(); |
10677 | 64 | if (lock->namespace_id() != (**namespace_info).id()) { |
10678 | 33 | VLOG_WITH_PREFIX_AND_FUNC0 (4) |
10679 | 0 | << "Rejected table from other namespace: " << AsString(id_and_table); |
10680 | 33 | continue; |
10681 | 33 | } |
10682 | 31 | VLOG_WITH_PREFIX_AND_FUNC0 (4) << "Accepted: " << AsString(id_and_table)0 ; |
10683 | 31 | table_with_flags.emplace_back(id_and_table.second, ns_collect_flags); |
10684 | 31 | } |
10685 | 39 | } else { |
10686 | 14 | auto table = VERIFY_RESULT(FindTableUnlocked(table_id_pb)); |
10687 | 0 | VLOG_WITH_PREFIX_AND_FUNC(1) << "Collecting table: " << table->ToString(); |
10688 | 14 | table_with_flags.emplace_back(table, flags); |
10689 | 14 | } |
10690 | 53 | } |
10691 | 51 | } |
10692 | | |
10693 | 51 | std::sort(table_with_flags.begin(), table_with_flags.end(), [](const auto& p1, const auto& p2) { |
10694 | 7 | return p1.first->id() < p2.first->id(); |
10695 | 7 | }); |
10696 | 51 | std::vector<TableDescription> all_tables; |
10697 | 51 | std::unordered_set<NamespaceId> parent_colocated_table_ids; |
10698 | 51 | const TableId* table_id = nullptr; |
10699 | 51 | for (auto& table_and_flags : table_with_flags) { |
10700 | 45 | if (table_id && *table_id == table_and_flags.first->id()7 ) { |
10701 | 0 | return STATUS_FORMAT(InternalError, "Table collected twice $0", *table_id); |
10702 | 0 | } |
10703 | 45 | auto description = VERIFY_RESULT(DescribeTable( |
10704 | 45 | table_and_flags.first, |
10705 | 45 | table_and_flags.second.Test(CollectFlag::kSucceedIfCreateInProgress))); |
10706 | 45 | RETURN_NOT_OK(CollectTable( |
10707 | 45 | description, table_and_flags.second, &all_tables, &parent_colocated_table_ids)); |
10708 | 45 | table_id = &table_and_flags.first->id(); |
10709 | 45 | } |
10710 | | |
10711 | 51 | return all_tables; |
10712 | 51 | } |
10713 | | |
10714 | | Result<std::vector<TableDescription>> CatalogManager::CollectTables( |
10715 | | const google::protobuf::RepeatedPtrField<TableIdentifierPB>& table_identifiers, |
10716 | | bool add_indexes, |
10717 | 10 | bool include_parent_colocated_table) { |
10718 | 10 | CollectFlags flags; |
10719 | 10 | flags.SetIf(CollectFlag::kAddIndexes, add_indexes); |
10720 | 10 | flags.SetIf(CollectFlag::kIncludeParentColocatedTable, include_parent_colocated_table); |
10721 | 10 | return CollectTables(table_identifiers, flags); |
10722 | 10 | } |
10723 | | |
10724 | 3.00k | Status CatalogManager::GetYQLPartitionsVTable(std::shared_ptr<SystemTablet>* tablet) { |
10725 | 3.00k | scoped_refptr<TableInfo> table = FindPtrOrNull(table_names_map_, |
10726 | 3.00k | std::make_pair(kSystemNamespaceId, kSystemPartitionsTableName)); |
10727 | 3.00k | SCHECK(table != nullptr, NotFound, "YQL system.partitions table not found"); |
10728 | | |
10729 | 3.00k | auto tablets = table->GetTablets(); |
10730 | 3.00k | SCHECK(tablets.size() == 1, NotFound, "YQL system.partitions tablet not found"); |
10731 | 3.00k | *tablet = std::dynamic_pointer_cast<SystemTablet>( |
10732 | 3.00k | VERIFY_RESULT(GetSystemTablet(tablets[0]->tablet_id()))); |
10733 | 0 | return Status::OK(); |
10734 | 3.00k | } |
10735 | | |
10736 | 245k | void CatalogManager::RebuildYQLSystemPartitions() { |
10737 | 245k | if (YQLPartitionsVTable::GeneratePartitionsVTableWithBgTask() || |
10738 | 245k | YQLPartitionsVTable::GeneratePartitionsVTableOnChanges()245k ) { |
10739 | 245k | SCOPED_LEADER_SHARED_LOCK(l, this); |
10740 | 245k | if (l.catalog_status().ok() && l.leader_status().ok()243k ) { |
10741 | 153k | if (system_partitions_tablet_ != nullptr) { |
10742 | 153k | Status s; |
10743 | 153k | if (YQLPartitionsVTable::GeneratePartitionsVTableWithBgTask()) { |
10744 | | // If we are not generating the vtable on changes, then we need to do a full refresh. |
10745 | 3 | s = ResultToStatus(GetYqlPartitionsVtable().GenerateAndCacheData()); |
10746 | 153k | } else { |
10747 | | // Otherwise, we can simply update the cached vtable with the internal map. |
10748 | 153k | s = GetYqlPartitionsVtable().UpdateCache(); |
10749 | 153k | } |
10750 | 153k | if (!s.ok()) { |
10751 | 0 | LOG(ERROR) << "Error rebuilding system.partitions: " << s.ToString(); |
10752 | 0 | } |
10753 | 153k | } else { |
10754 | 0 | LOG(ERROR) << "Error finding system.partitions vtable."; |
10755 | 0 | } |
10756 | 153k | } |
10757 | 245k | } |
10758 | | |
10759 | 245k | auto wait_time = FLAGS_partitions_vtable_cache_refresh_secs * 1s; |
10760 | 245k | if (wait_time <= 0s) { |
10761 | 245k | wait_time = kDefaultYQLPartitionsRefreshBgTaskSleep; |
10762 | 245k | } |
10763 | 245k | refresh_yql_partitions_task_.Schedule([this](const Status& status) { |
10764 | 237k | WARN_NOT_OK( |
10765 | 237k | background_tasks_thread_pool_->SubmitFunc([this]() { RebuildYQLSystemPartitions(); }), |
10766 | 237k | "Failed to schedule: RebuildYQLSystemPartitions"); |
10767 | 237k | }, wait_time); |
10768 | 245k | } |
10769 | | |
10770 | 1.56M | Status CatalogManager::SysCatalogRespectLeaderAffinity() { |
10771 | 1.56M | auto l = ClusterConfig()->LockForRead(); |
10772 | | |
10773 | 1.56M | const auto& affinitized_leaders = l->pb.replication_info().affinitized_leaders(); |
10774 | 1.56M | if (affinitized_leaders.empty()) { |
10775 | 1.56M | return Status::OK(); |
10776 | 1.56M | } |
10777 | | |
10778 | 271 | for (const CloudInfoPB& cloud_info : affinitized_leaders)244 { |
10779 | | // Do nothing if already in an affinitized zone. |
10780 | 271 | if (CatalogManagerUtil::IsCloudInfoEqual(cloud_info, server_registration_.cloud_info())) { |
10781 | 152 | return Status::OK(); |
10782 | 152 | } |
10783 | 271 | } |
10784 | | |
10785 | | // Not in affinitized zone, try finding a master to send a step down request to. |
10786 | 92 | std::vector<ServerEntryPB> masters; |
10787 | 92 | RETURN_NOT_OK(master_->ListMasters(&masters)); |
10788 | | |
10789 | 213 | for (const ServerEntryPB& master : masters)92 { |
10790 | 213 | auto master_cloud_info = master.registration().cloud_info(); |
10791 | | |
10792 | 257 | for (const CloudInfoPB& config_cloud_info : affinitized_leaders) { |
10793 | 257 | if (CatalogManagerUtil::IsCloudInfoEqual(config_cloud_info, master_cloud_info)) { |
10794 | 0 | if (PREDICT_FALSE( |
10795 | 0 | GetAtomicFlag(&FLAGS_TEST_crash_server_on_sys_catalog_leader_affinity_move))) { |
10796 | 0 | LOG_WITH_PREFIX(FATAL) << "For test: Crashing the server instead of performing sys " |
10797 | 0 | "catalog leader affinity move."; |
10798 | 0 | } |
10799 | 0 | YB_LOG_WITH_PREFIX_EVERY_N_SECS(INFO, 10) |
10800 | 0 | << "Sys catalog tablet is not in an affinitized zone, " |
10801 | 0 | << "sending step down request to master uuid " |
10802 | 0 | << master.instance_id().permanent_uuid() |
10803 | 0 | << " in zone " |
10804 | 0 | << TSDescriptor::generate_placement_id(master_cloud_info); |
10805 | 0 | std::shared_ptr<TabletPeer> tablet_peer; |
10806 | 0 | RETURN_NOT_OK(GetTabletPeer(sys_catalog_->tablet_id(), &tablet_peer)); |
10807 | | |
10808 | 0 | consensus::LeaderStepDownRequestPB req; |
10809 | 0 | req.set_tablet_id(sys_catalog_->tablet_id()); |
10810 | 0 | req.set_dest_uuid(sys_catalog_->tablet_peer()->permanent_uuid()); |
10811 | 0 | req.set_new_leader_uuid(master.instance_id().permanent_uuid()); |
10812 | |
|
10813 | 0 | consensus::LeaderStepDownResponsePB resp; |
10814 | 0 | RETURN_NOT_OK(tablet_peer->consensus()->StepDown(&req, &resp)); |
10815 | 0 | if (resp.has_error()) { |
10816 | 0 | YB_LOG_WITH_PREFIX_EVERY_N_SECS(INFO, 10) << "Step down failed: " |
10817 | 0 | << resp.error().status().message(); |
10818 | 0 | break; |
10819 | 0 | } |
10820 | 0 | LOG_WITH_PREFIX(INFO) << "Successfully stepped down to new master"; |
10821 | 0 | return Status::OK(); |
10822 | 0 | } |
10823 | 257 | } |
10824 | 213 | } |
10825 | | |
10826 | 92 | return STATUS(NotFound, "Couldn't step down to a master in an affinitized zone"); |
10827 | 92 | } |
10828 | | |
10829 | 255k | Result<BlacklistSet> CatalogManager::BlacklistSetFromPB() const { |
10830 | 255k | auto cluster_config = ClusterConfig(); |
10831 | 255k | if (!cluster_config) { |
10832 | 2 | return STATUS(IllegalState, "Cluster config not found."); |
10833 | 2 | } |
10834 | 255k | auto l = cluster_config->LockForRead(); |
10835 | | |
10836 | 255k | const auto& blacklist_pb = l->pb.server_blacklist(); |
10837 | 255k | BlacklistSet blacklist_set; |
10838 | 255k | for (int i = 0; i < blacklist_pb.hosts_size(); i++110 ) { |
10839 | 110 | blacklist_set.insert(HostPortFromPB(blacklist_pb.hosts(i))); |
10840 | 110 | } |
10841 | | |
10842 | 255k | return blacklist_set; |
10843 | 255k | } |
10844 | | |
10845 | | void CatalogManager::ProcessTabletStorageMetadata( |
10846 | | const std::string& ts_uuid, |
10847 | 2.19M | const TabletDriveStorageMetadataPB& storage_metadata) { |
10848 | 2.19M | const string& tablet_id = storage_metadata.tablet_id(); |
10849 | 2.19M | scoped_refptr<TabletInfo> tablet; |
10850 | 2.19M | { |
10851 | 2.19M | SharedLock lock(mutex_); |
10852 | 2.19M | tablet = FindPtrOrNull(*tablet_map_, tablet_id); |
10853 | 2.19M | } |
10854 | 2.19M | if (!tablet) { |
10855 | 0 | VLOG(1) << Format("Tablet $0 not found on ts $1", tablet_id, ts_uuid); |
10856 | 0 | return; |
10857 | 0 | } |
10858 | 2.19M | TabletReplicaDriveInfo drive_info{ |
10859 | 2.19M | storage_metadata.sst_file_size(), |
10860 | 2.19M | storage_metadata.wal_file_size(), |
10861 | 2.19M | storage_metadata.uncompressed_sst_file_size(), |
10862 | 2.19M | storage_metadata.may_have_orphaned_post_split_data()}; |
10863 | 2.19M | tablet->UpdateReplicaDriveInfo(ts_uuid, drive_info); |
10864 | 2.19M | } |
10865 | | |
10866 | 173k | void CatalogManager::CheckTableDeleted(const TableInfoPtr& table) { |
10867 | 173k | if (!FLAGS_master_drop_table_after_task_response) { |
10868 | 0 | return; |
10869 | 0 | } |
10870 | | // Since this is called after every successful async DeleteTablet, it's possible if all tasks |
10871 | | // complete, for us to mark the table as DELETED/HIDDEN asap. This is desirable as clients will |
10872 | | // wait for this before returning success to the user. |
10873 | | // |
10874 | | // However, if tasks fail, timeout, or are aborted, we still have the background thread as a |
10875 | | // catch all. |
10876 | 173k | auto lock = MaybeTransitionTableToDeleted(table); |
10877 | 173k | if (!lock.locked()) { |
10878 | 167k | return; |
10879 | 167k | } |
10880 | 5.93k | Status s = sys_catalog_->Upsert(leader_ready_term(), table); |
10881 | 5.93k | if (!s.ok()) { |
10882 | 2 | LOG_WITH_PREFIX(WARNING) |
10883 | 2 | << "Error marking table as " |
10884 | 2 | << (table->LockForRead()->started_deleting() ? "DELETED" : "HIDDEN"0 ) << ": " << s; |
10885 | 2 | return; |
10886 | 2 | } |
10887 | 5.93k | lock.Commit(); |
10888 | 5.93k | } |
10889 | | |
10890 | 635k | const YQLPartitionsVTable& CatalogManager::GetYqlPartitionsVtable() const { |
10891 | 635k | return down_cast<const YQLPartitionsVTable&>(system_partitions_tablet_->QLStorage()); |
10892 | 635k | } |
10893 | | |
10894 | | void CatalogManager::InitializeTableLoadState( |
10895 | 21.4k | const TableId& table_id, TSDescriptorVector ts_descs, CMPerTableLoadState* state) { |
10896 | 63.0k | for (const auto& ts : ts_descs) { |
10897 | | // Touch every tserver with 0 load. |
10898 | 63.0k | state->per_ts_load_[ts->permanent_uuid()]; |
10899 | | // Insert into the sorted list. |
10900 | 63.0k | state->sorted_load_.emplace_back(ts->permanent_uuid()); |
10901 | 63.0k | } |
10902 | | |
10903 | 21.4k | auto table_info = GetTableInfo(table_id); |
10904 | | |
10905 | 21.4k | if (!table_info) { |
10906 | 0 | return; |
10907 | 0 | } |
10908 | 21.4k | CatalogManagerUtil::FillTableLoadState(table_info, state); |
10909 | 21.4k | } |
10910 | | |
10911 | | void CatalogManager::InitializeGlobalLoadState( |
10912 | 20.5k | TSDescriptorVector ts_descs, CMGlobalLoadState* state) { |
10913 | 60.2k | for (const auto& ts : ts_descs) { |
10914 | | // Touch every tserver with 0 load. |
10915 | 60.2k | state->per_ts_load_[ts->permanent_uuid()]; |
10916 | 60.2k | } |
10917 | | |
10918 | 20.5k | SharedLock l(mutex_); |
10919 | 7.36M | for (const auto& id_and_info : *table_ids_map_) { |
10920 | | // Ignore system, colocated and deleting/deleted tables. |
10921 | 7.36M | { |
10922 | 7.36M | auto l = id_and_info.second->LockForRead(); |
10923 | 7.36M | if (IsSystemTable(*(id_and_info.second)) || |
10924 | 7.36M | id_and_info.second->IsColocatedUserTable()298k || |
10925 | 7.36M | l->started_deleting()297k ) { |
10926 | 7.20M | continue; |
10927 | 7.20M | } |
10928 | 7.36M | } |
10929 | 158k | CatalogManagerUtil::FillTableLoadState(id_and_info.second, state); |
10930 | 158k | } |
10931 | 20.5k | } |
10932 | | |
10933 | 12.1M | std::shared_ptr<ClusterConfigInfo> CatalogManager::ClusterConfig() const { |
10934 | 12.1M | yb::SharedLock<decltype(config_mutex_)> lock(config_mutex_); |
10935 | 12.1M | return cluster_config_; |
10936 | 12.1M | } |
10937 | | |
10938 | | } // namespace master |
10939 | | } // namespace yb |