/Users/deen/code/yugabyte-db/src/yb/master/catalog_manager.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // |
18 | | // The following only applies to changes made to this file as part of YugaByte development. |
19 | | // |
20 | | // Portions Copyright (c) YugaByte, Inc. |
21 | | // |
22 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
23 | | // in compliance with the License. You may obtain a copy of the License at |
24 | | // |
25 | | // http://www.apache.org/licenses/LICENSE-2.0 |
26 | | // |
27 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
28 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
29 | | // or implied. See the License for the specific language governing permissions and limitations |
30 | | // under the License. |
31 | | // |
32 | | // ================================================================================================ |
33 | | // |
34 | | // The catalog manager handles the current list of tables |
35 | | // and tablets in the cluster, as well as their current locations. |
36 | | // Since most operations in the master go through these data |
37 | | // structures, locking is carefully managed here to prevent unnecessary |
38 | | // contention and deadlocks: |
39 | | // |
40 | | // - each structure has an internal spinlock used for operations that |
41 | | // are purely in-memory (eg the current status of replicas) |
42 | | // - data that is persisted on disk is stored in separate PersistentTable(t)Info |
43 | | // structs. These are managed using copy-on-write so that writers may block |
44 | | // writing them back to disk while not impacting concurrent readers. |
45 | | // |
46 | | // Usage rules: |
47 | | // - You may obtain READ locks in any order. READ locks should never block, |
48 | | // since they only conflict with COMMIT which is a purely in-memory operation. |
49 | | // Thus they are deadlock-free. |
50 | | // - If you need a WRITE lock on both a table and one or more of its tablets, |
51 | | // acquire the lock on the table first. This strict ordering prevents deadlocks. |
52 | | // |
53 | | // ================================================================================================ |
54 | | |
55 | | #include "yb/master/catalog_manager.h" |
56 | | |
57 | | #include <stdlib.h> |
58 | | |
59 | | #include <algorithm> |
60 | | #include <atomic> |
61 | | #include <bitset> |
62 | | #include <functional> |
63 | | #include <memory> |
64 | | #include <mutex> |
65 | | #include <set> |
66 | | #include <string> |
67 | | #include <unordered_map> |
68 | | #include <vector> |
69 | | |
70 | | #include <boost/optional.hpp> |
71 | | #include <glog/logging.h> |
72 | | |
73 | | #include "yb/client/client-internal.h" |
74 | | #include "yb/client/client.h" |
75 | | #include "yb/client/schema.h" |
76 | | #include "yb/client/universe_key_client.h" |
77 | | |
78 | | #include "yb/common/common.pb.h" |
79 | | #include "yb/common/common_flags.h" |
80 | | #include "yb/common/key_encoder.h" |
81 | | #include "yb/common/partial_row.h" |
82 | | #include "yb/common/partition.h" |
83 | | #include "yb/common/ql_type.h" |
84 | | #include "yb/common/roles_permissions.h" |
85 | | #include "yb/common/schema.h" |
86 | | #include "yb/common/wire_protocol.h" |
87 | | |
88 | | #include "yb/consensus/consensus.h" |
89 | | #include "yb/consensus/consensus.pb.h" |
90 | | #include "yb/consensus/consensus_util.h" |
91 | | #include "yb/consensus/metadata.pb.h" |
92 | | #include "yb/consensus/opid_util.h" |
93 | | #include "yb/consensus/quorum_util.h" |
94 | | |
95 | | #include "yb/docdb/doc_key.h" |
96 | | |
97 | | #include "yb/gutil/atomicops.h" |
98 | | #include "yb/gutil/bind.h" |
99 | | #include "yb/gutil/casts.h" |
100 | | #include "yb/gutil/map-util.h" |
101 | | #include "yb/gutil/mathlimits.h" |
102 | | #include "yb/gutil/stl_util.h" |
103 | | #include "yb/gutil/strings/escaping.h" |
104 | | #include "yb/gutil/strings/join.h" |
105 | | #include "yb/gutil/strings/substitute.h" |
106 | | #include "yb/gutil/sysinfo.h" |
107 | | #include "yb/gutil/walltime.h" |
108 | | |
109 | | #include "yb/master/master_fwd.h" |
110 | | #include "yb/master/async_rpc_tasks.h" |
111 | | #include "yb/master/backfill_index.h" |
112 | | #include "yb/master/catalog_entity_info.h" |
113 | | #include "yb/master/catalog_loaders.h" |
114 | | #include "yb/master/catalog_manager-internal.h" |
115 | | #include "yb/master/catalog_manager_bg_tasks.h" |
116 | | #include "yb/master/catalog_manager_util.h" |
117 | | #include "yb/master/cluster_balance.h" |
118 | | #include "yb/master/encryption_manager.h" |
119 | | #include "yb/master/master.h" |
120 | | #include "yb/master/master_admin.pb.h" |
121 | | #include "yb/master/master_client.pb.h" |
122 | | #include "yb/master/master_cluster.proxy.h" |
123 | | #include "yb/master/master_dcl.pb.h" |
124 | | #include "yb/master/master_ddl.pb.h" |
125 | | #include "yb/master/master_encryption.pb.h" |
126 | | #include "yb/master/master_error.h" |
127 | | #include "yb/master/master_heartbeat.pb.h" |
128 | | #include "yb/master/master_replication.pb.h" |
129 | | #include "yb/master/master_util.h" |
130 | | #include "yb/master/permissions_manager.h" |
131 | | #include "yb/master/scoped_leader_shared_lock-internal.h" |
132 | | #include "yb/master/sys_catalog.h" |
133 | | #include "yb/master/sys_catalog_constants.h" |
134 | | #include "yb/master/ts_descriptor.h" |
135 | | #include "yb/master/yql_aggregates_vtable.h" |
136 | | #include "yb/master/yql_auth_resource_role_permissions_index.h" |
137 | | #include "yb/master/yql_auth_role_permissions_vtable.h" |
138 | | #include "yb/master/yql_auth_roles_vtable.h" |
139 | | #include "yb/master/yql_columns_vtable.h" |
140 | | #include "yb/master/yql_empty_vtable.h" |
141 | | #include "yb/master/yql_functions_vtable.h" |
142 | | #include "yb/master/yql_indexes_vtable.h" |
143 | | #include "yb/master/yql_keyspaces_vtable.h" |
144 | | #include "yb/master/yql_local_vtable.h" |
145 | | #include "yb/master/yql_partitions_vtable.h" |
146 | | #include "yb/master/yql_peers_vtable.h" |
147 | | #include "yb/master/yql_size_estimates_vtable.h" |
148 | | #include "yb/master/yql_tables_vtable.h" |
149 | | #include "yb/master/yql_triggers_vtable.h" |
150 | | #include "yb/master/yql_types_vtable.h" |
151 | | #include "yb/master/yql_views_vtable.h" |
152 | | #include "yb/master/ysql_transaction_ddl.h" |
153 | | |
154 | | #include "yb/rpc/messenger.h" |
155 | | #include "yb/rpc/rpc_controller.h" |
156 | | |
157 | | #include "yb/tablet/operations/change_metadata_operation.h" |
158 | | #include "yb/tablet/tablet.h" |
159 | | #include "yb/tablet/tablet_metadata.h" |
160 | | #include "yb/tablet/tablet_peer.h" |
161 | | #include "yb/tablet/tablet_retention_policy.h" |
162 | | |
163 | | #include "yb/tserver/remote_bootstrap_client.h" |
164 | | #include "yb/tserver/ts_tablet_manager.h" |
165 | | |
166 | | #include "yb/util/atomic.h" |
167 | | #include "yb/util/countdown_latch.h" |
168 | | #include "yb/util/debug-util.h" |
169 | | #include "yb/util/debug/trace_event.h" |
170 | | #include "yb/util/flag_tags.h" |
171 | | #include "yb/util/format.h" |
172 | | #include "yb/util/hash_util.h" |
173 | | #include "yb/util/locks.h" |
174 | | #include "yb/util/math_util.h" |
175 | | #include "yb/util/metrics.h" |
176 | | #include "yb/util/monotime.h" |
177 | | #include "yb/util/net/net_util.h" |
178 | | #include "yb/util/oid_generator.h" |
179 | | #include "yb/util/random_util.h" |
180 | | #include "yb/util/rw_mutex.h" |
181 | | #include "yb/util/semaphore.h" |
182 | | #include "yb/util/shared_lock.h" |
183 | | #include "yb/util/size_literals.h" |
184 | | #include "yb/util/status.h" |
185 | | #include "yb/util/status_format.h" |
186 | | #include "yb/util/status_log.h" |
187 | | #include "yb/util/stopwatch.h" |
188 | | #include "yb/util/string_util.h" |
189 | | #include "yb/util/sync_point.h" |
190 | | #include "yb/util/thread.h" |
191 | | #include "yb/util/threadpool.h" |
192 | | #include "yb/util/trace.h" |
193 | | #include "yb/util/tsan_util.h" |
194 | | #include "yb/util/uuid.h" |
195 | | |
196 | | #include "yb/yql/pgwrapper/pg_wrapper.h" |
197 | | #include "yb/yql/redis/redisserver/redis_constants.h" |
198 | | |
199 | | using namespace std::literals; |
200 | | using namespace yb::size_literals; |
201 | | |
202 | | DEFINE_int32(master_ts_rpc_timeout_ms, 30 * 1000, // 30 sec |
203 | | "Timeout used for the Master->TS async rpc calls."); |
204 | | TAG_FLAG(master_ts_rpc_timeout_ms, advanced); |
205 | | |
206 | | DEFINE_int32(tablet_creation_timeout_ms, 30 * 1000, // 30 sec |
207 | | "Timeout used by the master when attempting to create tablet " |
208 | | "replicas during table creation."); |
209 | | TAG_FLAG(tablet_creation_timeout_ms, advanced); |
210 | | |
211 | | DEFINE_test_flag(bool, disable_tablet_deletion, false, |
212 | | "Whether catalog manager should disable tablet deletion."); |
213 | | |
214 | | DEFINE_bool(catalog_manager_wait_for_new_tablets_to_elect_leader, true, |
215 | | "Whether the catalog manager should wait for a newly created tablet to " |
216 | | "elect a leader before considering it successfully created. " |
217 | | "This is disabled in some tests where we explicitly manage leader " |
218 | | "election."); |
219 | | TAG_FLAG(catalog_manager_wait_for_new_tablets_to_elect_leader, hidden); |
220 | | |
221 | | DEFINE_int32(catalog_manager_inject_latency_in_delete_table_ms, 0, |
222 | | "Number of milliseconds that the master will sleep in DeleteTable."); |
223 | | TAG_FLAG(catalog_manager_inject_latency_in_delete_table_ms, hidden); |
224 | | |
225 | | DECLARE_int32(catalog_manager_bg_task_wait_ms); |
226 | | |
227 | | DEFINE_int32(replication_factor, 3, |
228 | | "Default number of replicas for tables that do not have the num_replicas set."); |
229 | | TAG_FLAG(replication_factor, advanced); |
230 | | |
231 | | DEFINE_int32(max_create_tablets_per_ts, 50, |
232 | | "The number of tablets per TS that can be requested for a new table."); |
233 | | TAG_FLAG(max_create_tablets_per_ts, advanced); |
234 | | |
235 | | DEFINE_int32(catalog_manager_report_batch_size, 1, |
236 | | "The max number of tablets evaluated in the heartbeat as a single SysCatalog update."); |
237 | | TAG_FLAG(catalog_manager_report_batch_size, advanced); |
238 | | |
239 | | DEFINE_int32(master_failover_catchup_timeout_ms, 30 * 1000 * yb::kTimeMultiplier, // 30 sec |
240 | | "Amount of time to give a newly-elected leader master to load" |
241 | | " the previous master's metadata and become active. If this time" |
242 | | " is exceeded, the node crashes."); |
243 | | TAG_FLAG(master_failover_catchup_timeout_ms, advanced); |
244 | | TAG_FLAG(master_failover_catchup_timeout_ms, experimental); |
245 | | |
246 | | DEFINE_bool(master_tombstone_evicted_tablet_replicas, true, |
247 | | "Whether the Master should tombstone (delete) tablet replicas that " |
248 | | "are no longer part of the latest reported raft config."); |
249 | | TAG_FLAG(master_tombstone_evicted_tablet_replicas, hidden); |
250 | | DECLARE_bool(master_ignore_deleted_on_load); |
251 | | |
252 | | // Temporary. Can be removed after long-run testing. |
253 | | DEFINE_bool(master_ignore_stale_cstate, true, |
254 | | "Whether Master processes the raft config when the version is lower."); |
255 | | TAG_FLAG(master_ignore_stale_cstate, hidden); |
256 | | |
257 | | DEFINE_bool(catalog_manager_check_ts_count_for_create_table, true, |
258 | | "Whether the master should ensure that there are enough live tablet " |
259 | | "servers to satisfy the provided replication count before allowing " |
260 | | "a table to be created."); |
261 | | TAG_FLAG(catalog_manager_check_ts_count_for_create_table, hidden); |
262 | | |
263 | | DEFINE_test_flag(bool, catalog_manager_check_yql_partitions_exist_for_is_create_table_done, true, |
264 | | "Whether the master should ensure that all of a table's tablets are " |
265 | | "in the YQL system.partitions vtable during the IsCreateTableDone check."); |
266 | | |
267 | | METRIC_DEFINE_gauge_uint32(cluster, num_tablet_servers_live, |
268 | | "Number of live tservers in the cluster", yb::MetricUnit::kUnits, |
269 | | "The number of tablet servers that have responded or done a heartbeat " |
270 | | "in the time interval defined by the gflag " |
271 | | "FLAGS_tserver_unresponsive_timeout_ms."); |
272 | | |
273 | | METRIC_DEFINE_gauge_uint32(cluster, num_tablet_servers_dead, |
274 | | "Number of dead tservers in the cluster", yb::MetricUnit::kUnits, |
275 | | "The number of tablet servers that have not responded or done a " |
276 | | "heartbeat in the time interval defined by the gflag " |
277 | | "FLAGS_tserver_unresponsive_timeout_ms."); |
278 | | |
279 | | DEFINE_test_flag(uint64, inject_latency_during_remote_bootstrap_secs, 0, |
280 | | "Number of seconds to sleep during a remote bootstrap."); |
281 | | |
282 | | DEFINE_test_flag(uint64, inject_latency_during_tablet_report_ms, 0, |
283 | | "Number of milliseconds to sleep during the processing of a tablet batch."); |
284 | | |
285 | | DEFINE_test_flag(bool, catalog_manager_simulate_system_table_create_failure, false, |
286 | | "This is only used in tests to simulate a failure where the table information is " |
287 | | "persisted in syscatalog, but the tablet information is not yet persisted and " |
288 | | "there is a failure."); |
289 | | |
290 | | DEFINE_string(cluster_uuid, "", "Cluster UUID to be used by this cluster"); |
291 | | TAG_FLAG(cluster_uuid, hidden); |
292 | | |
293 | | DECLARE_int32(yb_num_shards_per_tserver); |
294 | | |
295 | | DEFINE_int32(transaction_table_num_tablets, 0, |
296 | | "Number of tablets to use when creating the transaction status table." |
297 | | "0 to use transaction_table_num_tablets_per_tserver."); |
298 | | |
299 | | DEFINE_int32(transaction_table_num_tablets_per_tserver, kAutoDetectNumShardsPerTServer, |
300 | | "The default number of tablets per tablet server for transaction status table. If the value is " |
301 | | "-1, the system automatically determines an appropriate value based on number of CPU cores."); |
302 | | |
303 | | DEFINE_bool(auto_create_local_transaction_tables, true, |
304 | | "Whether or not to create local transaction status tables automatically on table " |
305 | | "creation with a tablespace with placement specified."); |
306 | | |
307 | | DEFINE_test_flag(bool, name_transaction_tables_with_tablespace_id, false, |
308 | | "This is only used in tests to make associating automatically created transaction " |
309 | | "tables with their tablespaces easier, and causes transaction tables created " |
310 | | "automatically for tablespaces to include the tablespace oid in their names."); |
311 | | |
312 | | DEFINE_bool(master_enable_metrics_snapshotter, false, "Should metrics snapshotter be enabled"); |
313 | | |
314 | | DEFINE_int32(metrics_snapshots_table_num_tablets, 0, |
315 | | "Number of tablets to use when creating the metrics snapshots table." |
316 | | "0 to use the same default num tablets as for regular tables."); |
317 | | |
318 | | DEFINE_bool(disable_index_backfill, false, |
319 | | "A kill switch to disable multi-stage backfill for YCQL indexes."); |
320 | | TAG_FLAG(disable_index_backfill, runtime); |
321 | | TAG_FLAG(disable_index_backfill, hidden); |
322 | | |
323 | | DEFINE_bool(disable_index_backfill_for_non_txn_tables, true, |
324 | | "A kill switch to disable multi-stage backfill for user enforced YCQL indexes. " |
325 | | "Note that enabling this feature may cause the create index flow to be slow. " |
326 | | "This is needed to ensure the safety of the index backfill process. See also " |
327 | | "index_backfill_upperbound_for_user_enforced_txn_duration_ms"); |
328 | | TAG_FLAG(disable_index_backfill_for_non_txn_tables, runtime); |
329 | | TAG_FLAG(disable_index_backfill_for_non_txn_tables, hidden); |
330 | | |
331 | | DEFINE_bool(enable_transactional_ddl_gc, true, |
332 | | "A kill switch for transactional DDL GC. Temporary safety measure."); |
333 | | TAG_FLAG(enable_transactional_ddl_gc, runtime); |
334 | | TAG_FLAG(enable_transactional_ddl_gc, hidden); |
335 | | |
336 | | DEFINE_bool( |
337 | | hide_pg_catalog_table_creation_logs, false, |
338 | | "Whether to hide detailed log messages for PostgreSQL catalog table creation. " |
339 | | "This cuts down test logs significantly."); |
340 | | TAG_FLAG(hide_pg_catalog_table_creation_logs, hidden); |
341 | | |
342 | | DEFINE_test_flag(int32, simulate_slow_table_create_secs, 0, |
343 | | "Simulates a slow table creation by sleeping after the table has been added to memory."); |
344 | | |
345 | | DEFINE_test_flag(int32, simulate_slow_system_tablet_bootstrap_secs, 0, |
346 | | "Simulates a slow tablet bootstrap by adding a sleep before system tablet init."); |
347 | | |
348 | | DEFINE_test_flag(bool, return_error_if_namespace_not_found, false, |
349 | | "Return an error from ListTables if a namespace id is not found in the map"); |
350 | | |
351 | | DEFINE_test_flag(bool, hang_on_namespace_transition, false, |
352 | | "Used in tests to simulate a lapse between issuing a namespace op and final processing."); |
353 | | |
354 | | DEFINE_test_flag(bool, simulate_crash_after_table_marked_deleting, false, |
355 | | "Crash yb-master after table's state is set to DELETING. This skips tablets deletion."); |
356 | | |
357 | | DEFINE_bool(master_drop_table_after_task_response, true, |
358 | | "Mark a table as DELETED as soon as we get all the responses from all the TS."); |
359 | | TAG_FLAG(master_drop_table_after_task_response, advanced); |
360 | | TAG_FLAG(master_drop_table_after_task_response, runtime); |
361 | | |
362 | | DECLARE_int32(yb_client_admin_operation_timeout_sec); |
363 | | |
364 | | DEFINE_test_flag(bool, tablegroup_master_only, false, |
365 | | "This is only for MasterTest to be able to test tablegroups without the" |
366 | | " transaction status table being created."); |
367 | | |
368 | | DEFINE_bool(enable_register_ts_from_raft, true, "Whether to register a tserver from the consensus " |
369 | | "information of a reported tablet."); |
370 | | |
371 | | DECLARE_int32(tserver_unresponsive_timeout_ms); |
372 | | |
373 | | DEFINE_bool(use_create_table_leader_hint, true, |
374 | | "Whether the Master should hint which replica for each tablet should " |
375 | | "be leader initially on tablet creation."); |
376 | | TAG_FLAG(use_create_table_leader_hint, runtime); |
377 | | |
378 | | DEFINE_test_flag(bool, create_table_leader_hint_min_lexicographic, false, |
379 | | "Whether the Master should hint replica with smallest lexicographic rank for each " |
380 | | "tablet as leader initially on tablet creation."); |
381 | | |
382 | | DEFINE_double(heartbeat_safe_deadline_ratio, .20, |
383 | | "When the heartbeat deadline has this percentage of time remaining, " |
384 | | "the master should halt tablet report processing so it can respond in time."); |
385 | | DECLARE_int32(heartbeat_rpc_timeout_ms); |
386 | | DECLARE_CAPABILITY(TabletReportLimit); |
387 | | |
388 | | DEFINE_int32(partitions_vtable_cache_refresh_secs, 0, |
389 | | "Amount of time to wait before refreshing the system.partitions cached vtable. " |
390 | | "If generate_partitions_vtable_on_changes is set, then this background task will " |
391 | | "update the cache using the internal map, but won't do any generating of the vtable."); |
392 | | |
393 | | DEFINE_int32(txn_table_wait_min_ts_count, 1, |
394 | | "Minimum Number of TS to wait for before creating the transaction status table." |
395 | | " Default value is 1. We wait for atleast --replication_factor if this value" |
396 | | " is smaller than that"); |
397 | | TAG_FLAG(txn_table_wait_min_ts_count, advanced); |
398 | | |
399 | | DEFINE_bool(enable_ysql_tablespaces_for_placement, true, |
400 | | "If set, tablespaces will be used for placement of YSQL tables."); |
401 | | TAG_FLAG(enable_ysql_tablespaces_for_placement, runtime); |
402 | | |
403 | | DEFINE_int32(ysql_tablespace_info_refresh_secs, 30, |
404 | | "Frequency at which the table to tablespace information will be updated in master " |
405 | | "from pg catalog tables. A value of -1 disables the refresh task."); |
406 | | TAG_FLAG(ysql_tablespace_info_refresh_secs, runtime); |
407 | | |
408 | | DEFINE_int64(tablet_split_size_threshold_bytes, 0, |
409 | | "DEPRECATED -- Threshold on tablet size after which tablet should be split. Automated " |
410 | | "splitting is disabled if this value is set to 0."); |
411 | | TAG_FLAG(tablet_split_size_threshold_bytes, hidden); |
412 | | |
413 | | DEFINE_int64(tablet_split_low_phase_shard_count_per_node, 8, |
414 | | "The per-node tablet count until which a table is splitting at the phase 1 threshold, " |
415 | | "as defined by tablet_split_low_phase_size_threshold_bytes."); |
416 | | DEFINE_int64(tablet_split_high_phase_shard_count_per_node, 24, |
417 | | "The per-node tablet count until which a table is splitting at the phase 2 threshold, " |
418 | | "as defined by tablet_split_high_phase_size_threshold_bytes."); |
419 | | |
420 | | DEFINE_int64(tablet_split_low_phase_size_threshold_bytes, 512_MB, |
421 | | "The tablet size threshold at which to split tablets in phase 1. " |
422 | | "See tablet_split_low_phase_shard_count_per_node."); |
423 | | DEFINE_int64(tablet_split_high_phase_size_threshold_bytes, 10_GB, |
424 | | "The tablet size threshold at which to split tablets in phase 2. " |
425 | | "See tablet_split_high_phase_shard_count_per_node."); |
426 | | DEFINE_int64(tablet_force_split_threshold_bytes, 100_GB, |
427 | | "The tablet size threshold at which to split tablets regardless of how many tablets " |
428 | | "exist in the table already. This should be configured to prevent runaway whale " |
429 | | "tablets from forming in your cluster even if both automatic splitting phases have " |
430 | | "been finished."); |
431 | | |
432 | | DEFINE_test_flag(bool, crash_server_on_sys_catalog_leader_affinity_move, false, |
433 | | "When set, crash the master process if it performs a sys catalog leader affinity " |
434 | | "move."); |
435 | | DEFINE_int32(blacklist_progress_initial_delay_secs, yb::master::kDelayAfterFailoverSecs, |
436 | | "When a master leader failsover, the time until which the progress of load movement " |
437 | | "off the blacklisted tservers is reported as 0. This initial delay " |
438 | | "gives sufficient time for heartbeats so that we don't report" |
439 | | " a premature incorrect completion."); |
440 | | TAG_FLAG(blacklist_progress_initial_delay_secs, runtime); |
441 | | |
442 | | DEFINE_test_flag(bool, validate_all_tablet_candidates, false, |
443 | | "When set to true, consider any tablet a valid candidate for splitting. " |
444 | | "Specifically this flag ensures that ValidateSplitCandidateTable and " |
445 | | "ValidateSplitCandidateTablet always return OK and all tablets are considered " |
446 | | "valid candidates for splitting."); |
447 | | |
448 | | DEFINE_test_flag(bool, skip_placement_validation_createtable_api, false, |
449 | | "When set, it skips checking that all the tablets of a table have enough tservers" |
450 | | " conforming to the table placement policy during CreateTable API call."); |
451 | | TAG_FLAG(TEST_skip_placement_validation_createtable_api, runtime); |
452 | | |
453 | | DEFINE_test_flag(int32, slowdown_alter_table_rpcs_ms, 0, |
454 | | "Slows down the alter table rpc's send and response handler so that the TServer " |
455 | | "has a heartbeat delay and triggers tablet leader change."); |
456 | | |
457 | | DEFINE_test_flag(bool, reject_delete_not_serving_tablet_rpc, false, |
458 | | "Whether to reject DeleteNotServingTablet RPC."); |
459 | | |
460 | | DEFINE_test_flag(double, crash_after_creating_single_split_tablet, 0.0, |
461 | | "Crash inside CatalogManager::RegisterNewTabletForSplit after calling Upsert"); |
462 | | |
463 | | DEFINE_bool(enable_delete_truncate_xcluster_replicated_table, false, |
464 | | "When set, enables deleting/truncating tables currently in xCluster replication"); |
465 | | TAG_FLAG(enable_delete_truncate_xcluster_replicated_table, runtime); |
466 | | |
467 | | namespace yb { |
468 | | namespace master { |
469 | | |
470 | | using std::atomic; |
471 | | using std::shared_ptr; |
472 | | using std::string; |
473 | | using std::unique_ptr; |
474 | | using std::vector; |
475 | | |
476 | | using namespace std::placeholders; |
477 | | |
478 | | using base::subtle::NoBarrier_Load; |
479 | | using base::subtle::NoBarrier_CompareAndSwap; |
480 | | using consensus::kMinimumTerm; |
481 | | using consensus::CONSENSUS_CONFIG_COMMITTED; |
482 | | using consensus::CONSENSUS_CONFIG_ACTIVE; |
483 | | using consensus::COMMITTED_OPID; |
484 | | using consensus::Consensus; |
485 | | using consensus::ConsensusMetadata; |
486 | | using consensus::ConsensusServiceProxy; |
487 | | using consensus::ConsensusStatePB; |
488 | | using consensus::GetConsensusRole; |
489 | | using consensus::PeerMemberType; |
490 | | using consensus::RaftPeerPB; |
491 | | using consensus::StartRemoteBootstrapRequestPB; |
492 | | using rpc::RpcContext; |
493 | | using server::MonitoredTask; |
494 | | using strings::Substitute; |
495 | | using tablet::TABLET_DATA_COPYING; |
496 | | using tablet::TABLET_DATA_DELETED; |
497 | | using tablet::TABLET_DATA_READY; |
498 | | using tablet::TABLET_DATA_TOMBSTONED; |
499 | | using tablet::TabletDataState; |
500 | | using tablet::RaftGroupMetadata; |
501 | | using tablet::RaftGroupMetadataPtr; |
502 | | using tablet::TabletPeer; |
503 | | using tablet::RaftGroupStatePB; |
504 | | using tablet::TabletStatusListener; |
505 | | using tablet::TabletStatusPB; |
506 | | using tserver::HandleReplacingStaleTablet; |
507 | | using tserver::TabletServerErrorPB; |
508 | | using yb::pgwrapper::PgWrapper; |
509 | | using yb::server::MasterAddressesToString; |
510 | | |
511 | | using yb::client::YBClient; |
512 | | using yb::client::YBClientBuilder; |
513 | | using yb::client::YBColumnSchema; |
514 | | using yb::client::YBSchema; |
515 | | using yb::client::YBSchemaBuilder; |
516 | | using yb::client::YBTable; |
517 | | using yb::client::YBTableName; |
518 | | |
519 | | namespace { |
520 | | |
521 | | // Macros to access index information in CATALOG. |
522 | | // |
523 | | // NOTES from file master.proto for SysTablesEntryPB. |
524 | | // - For index table: [to be deprecated and replaced by "index_info"] |
525 | | // optional bytes indexed_table_id = 13; // Indexed table id of this index. |
526 | | // optional bool is_local_index = 14 [ default = false ]; // Whether this is a local index. |
527 | | // optional bool is_unique_index = 15 [ default = false ]; // Whether this is a unique index. |
528 | | // - During transition period, we have to consider both fields and the following macros help |
529 | | // avoiding duplicate protobuf version check thru out our code. |
530 | | |
531 | 25.1k | const std::string& GetIndexedTableId(const SysTablesEntryPB& pb) { |
532 | 25.1k | return pb.has_index_info() ? pb.index_info().indexed_table_id() : pb.indexed_table_id(); |
533 | 25.1k | } |
534 | | |
// Evaluates to the "is local index" property of the table protobuf `tabpb`: taken from
// index_info() when present, otherwise from the deprecated is_local_index field.
// The argument is parenthesized at every use so that expressions such as `*ptr` or `a ? b : c`
// expand with the intended precedence. Note that `tabpb` is evaluated more than once.
#define PROTO_GET_IS_LOCAL(tabpb) \
  ((tabpb).has_index_info() ? (tabpb).index_info().is_local() \
                            : (tabpb).is_local_index())
538 | | |
// Evaluates to the "is unique index" property of the table protobuf `tabpb`: taken from
// index_info() when present, otherwise from the deprecated is_unique_index field.
// The argument is parenthesized at every use so that expressions such as `*ptr` or `a ? b : c`
// expand with the intended precedence. Note that `tabpb` is evaluated more than once.
#define PROTO_GET_IS_UNIQUE(tabpb) \
  ((tabpb).has_index_info() ? (tabpb).index_info().is_unique() \
                            : (tabpb).is_unique_index())
542 | | |
// Reports whether the protobuf `pb` describes an index. A message counts as an index when it
// carries an index_info sub-message or, failing that, a non-empty (deprecated) indexed_table_id.
template <class PB>
bool IsIndex(const PB& pb) {
  if (pb.has_index_info()) {
    return true;
  }
  return !pb.indexed_table_id().empty();
}
547 | | |
548 | 5.09k | bool IsTable(const SysTablesEntryPB& pb) { |
549 | 5.09k | return !IsIndex(pb); |
550 | 5.09k | } |
551 | | |
// Pointer flavour of the index check: true when the protobuf pointed to by `tabpb` has an
// index_info sub-message or a non-empty (deprecated) indexed_table_id.
// The argument is parenthesized at every use so that expressions such as `&obj` or `a ? b : c`
// expand with the intended precedence. Note that `tabpb` is evaluated more than once.
#define PROTO_PTR_IS_INDEX(tabpb) \
  ((tabpb)->has_index_info() || !(tabpb)->indexed_table_id().empty())
554 | | |
// Pointer flavour of the table check: true when the protobuf pointed to by `tabpb` has neither
// an index_info sub-message nor a non-empty (deprecated) indexed_table_id.
// The argument is parenthesized at every use so that expressions such as `&obj` or `a ? b : c`
// expand with the intended precedence. Note that `tabpb` is evaluated more than once.
#define PROTO_PTR_IS_TABLE(tabpb) \
  (!(tabpb)->has_index_info() && (tabpb)->indexed_table_id().empty())
557 | | |
558 | | #if (0) |
559 | | // Once the deprecated fields are obsolete, the above macros should be defined as the following. |
560 | | #define GetIndexedTableId(tabpb) (tabpb.index_info().indexed_table_id()) |
561 | | #define PROTO_GET_IS_LOCAL(tabpb) (tabpb.index_info().is_local()) |
562 | | #define PROTO_GET_IS_UNIQUE(tabpb) (tabpb.index_info().is_unique()) |
563 | | #define PROTO_IS_INDEX(tabpb) (tabpb.has_index_info()) |
564 | | #define PROTO_IS_TABLE(tabpb) (!tabpb.has_index_info()) |
565 | | #define PROTO_PTR_IS_INDEX(tabpb) (tabpb->has_index_info()) |
566 | | #define PROTO_PTR_IS_TABLE(tabpb) (!tabpb->has_index_info()) |
567 | | |
568 | | #endif |
569 | | |
570 | | class IndexInfoBuilder { |
571 | | public: |
572 | 18 | explicit IndexInfoBuilder(IndexInfoPB* index_info) : index_info_(*index_info) { |
573 | 0 | DVLOG(3) << " After " << __PRETTY_FUNCTION__ << " index_info_ is " << yb::ToString(index_info_); |
574 | 18 | } |
575 | | |
576 | 18 | void ApplyProperties(const TableId& indexed_table_id, bool is_local, bool is_unique) { |
577 | 18 | index_info_.set_indexed_table_id(indexed_table_id); |
578 | 18 | index_info_.set_version(0); |
579 | 18 | index_info_.set_is_local(is_local); |
580 | 18 | index_info_.set_is_unique(is_unique); |
581 | 0 | DVLOG(3) << " After " << __PRETTY_FUNCTION__ << " index_info_ is " << yb::ToString(index_info_); |
582 | 18 | } |
583 | | |
584 | 18 | CHECKED_STATUS ApplyColumnMapping(const Schema& indexed_schema, const Schema& index_schema) { |
585 | 72 | for (size_t i = 0; i < index_schema.num_columns(); i++) { |
586 | 54 | const auto& col_name = index_schema.column(i).name(); |
587 | 54 | const auto indexed_col_idx = indexed_schema.find_column(col_name); |
588 | 54 | if (PREDICT_FALSE(indexed_col_idx == Schema::kColumnNotFound)) { |
589 | 0 | return STATUS(NotFound, "The indexed table column does not exist", col_name); |
590 | 0 | } |
591 | 54 | auto* col = index_info_.add_columns(); |
592 | 54 | col->set_column_id(index_schema.column_id(i)); |
593 | 54 | col->set_indexed_column_id(indexed_schema.column_id(indexed_col_idx)); |
594 | 54 | } |
595 | 18 | index_info_.set_hash_column_count(narrow_cast<uint32_t>(index_schema.num_hash_key_columns())); |
596 | 18 | index_info_.set_range_column_count(narrow_cast<uint32_t>(index_schema.num_range_key_columns())); |
597 | | |
598 | 36 | for (size_t i = 0; i < indexed_schema.num_hash_key_columns(); i++) { |
599 | 18 | index_info_.add_indexed_hash_column_ids(indexed_schema.column_id(i)); |
600 | 18 | } |
601 | 18 | for (size_t i = indexed_schema.num_hash_key_columns(); i < indexed_schema.num_key_columns(); |
602 | 0 | i++) { |
603 | 0 | index_info_.add_indexed_range_column_ids(indexed_schema.column_id(i)); |
604 | 0 | } |
605 | 0 | DVLOG(3) << " After " << __PRETTY_FUNCTION__ << " index_info_ is " << yb::ToString(index_info_); |
606 | 18 | return Status::OK(); |
607 | 18 | } |
608 | | |
609 | | private: |
610 | | IndexInfoPB& index_info_; |
611 | | }; |
612 | | |
613 | | template<class Lock, class RespClass> |
614 | 325k | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock, RespClass* resp) { |
615 | | // This covers both in progress and fully deleted objects. |
616 | 325k | if (lock->started_deleting()) { |
617 | 62 | Status s = STATUS_SUBSTITUTE(NotFound, |
618 | 62 | "The object '$0.$1' does not exist", lock->namespace_id(), lock->name()); |
619 | 62 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); |
620 | 62 | } |
621 | 325k | if (!lock->visible_to_client()) { |
622 | 1 | Status s = STATUS_SUBSTITUTE(ServiceUnavailable, |
623 | 1 | "The object '$0.$1' is not running", lock->namespace_id(), lock->name()); |
624 | 1 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); |
625 | 1 | } |
626 | 325k | return Status::OK(); |
627 | 325k | } catalog_manager.cc:_ZN2yb6master12_GLOBAL__N_139CheckIfTableDeletedOrNotVisibleToClientINS_12CowWriteLockINS0_19PersistentTableInfoEEENS0_21CreateTableResponsePBEEENS_6StatusERKT_PT0_ Line | Count | Source | 614 | 603 | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock, RespClass* resp) { | 615 | | // This covers both in progress and fully deleted objects. | 616 | 603 | if (lock->started_deleting()) { | 617 | 0 | Status s = STATUS_SUBSTITUTE(NotFound, | 618 | 0 | "The object '$0.$1' does not exist", lock->namespace_id(), lock->name()); | 619 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); | 620 | 0 | } | 621 | 603 | if (!lock->visible_to_client()) { | 622 | 0 | Status s = STATUS_SUBSTITUTE(ServiceUnavailable, | 623 | 0 | "The object '$0.$1' is not running", lock->namespace_id(), lock->name()); | 624 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); | 625 | 0 | } | 626 | 603 | return Status::OK(); | 627 | 603 | } |
catalog_manager.cc:_ZN2yb6master12_GLOBAL__N_139CheckIfTableDeletedOrNotVisibleToClientINS_11CowReadLockINS0_19PersistentTableInfoEEENS0_21CreateTableResponsePBEEENS_6StatusERKT_PT0_ Line | Count | Source | 614 | 609 | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock, RespClass* resp) { | 615 | | // This covers both in progress and fully deleted objects. | 616 | 609 | if (lock->started_deleting()) { | 617 | 1 | Status s = STATUS_SUBSTITUTE(NotFound, | 618 | 1 | "The object '$0.$1' does not exist", lock->namespace_id(), lock->name()); | 619 | 1 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); | 620 | 1 | } | 621 | 608 | if (!lock->visible_to_client()) { | 622 | 0 | Status s = STATUS_SUBSTITUTE(ServiceUnavailable, | 623 | 0 | "The object '$0.$1' is not running", lock->namespace_id(), lock->name()); | 624 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); | 625 | 0 | } | 626 | 608 | return Status::OK(); | 627 | 608 | } |
catalog_manager.cc:_ZN2yb6master12_GLOBAL__N_139CheckIfTableDeletedOrNotVisibleToClientINS_11CowReadLockINS0_19PersistentTableInfoEEENS0_37GetTransactionStatusTabletsResponsePBEEENS_6StatusERKT_PT0_ Line | Count | Source | 614 | 2.01k | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock, RespClass* resp) { | 615 | | // This covers both in progress and fully deleted objects. | 616 | 2.01k | if (lock->started_deleting()) { | 617 | 0 | Status s = STATUS_SUBSTITUTE(NotFound, | 618 | 0 | "The object '$0.$1' does not exist", lock->namespace_id(), lock->name()); | 619 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); | 620 | 0 | } | 621 | 2.01k | if (!lock->visible_to_client()) { | 622 | 0 | Status s = STATUS_SUBSTITUTE(ServiceUnavailable, | 623 | 0 | "The object '$0.$1' is not running", lock->namespace_id(), lock->name()); | 624 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); | 625 | 0 | } | 626 | 2.01k | return Status::OK(); | 627 | 2.01k | } |
catalog_manager.cc:_ZN2yb6master12_GLOBAL__N_139CheckIfTableDeletedOrNotVisibleToClientINS_11CowReadLockINS0_19PersistentTableInfoEEENS0_27IsCreateTableDoneResponsePBEEENS_6StatusERKT_PT0_ Line | Count | Source | 614 | 20.3k | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock, RespClass* resp) { | 615 | | // This covers both in progress and fully deleted objects. | 616 | 20.3k | if (lock->started_deleting()) { | 617 | 15 | Status s = STATUS_SUBSTITUTE(NotFound, | 618 | 15 | "The object '$0.$1' does not exist", lock->namespace_id(), lock->name()); | 619 | 15 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); | 620 | 15 | } | 621 | 20.3k | if (!lock->visible_to_client()) { | 622 | 0 | Status s = STATUS_SUBSTITUTE(ServiceUnavailable, | 623 | 0 | "The object '$0.$1' is not running", lock->namespace_id(), lock->name()); | 624 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); | 625 | 0 | } | 626 | 20.3k | return Status::OK(); | 627 | 20.3k | } |
catalog_manager.cc:_ZN2yb6master12_GLOBAL__N_139CheckIfTableDeletedOrNotVisibleToClientINS_11CowReadLockINS0_19PersistentTableInfoEEENS0_23TruncateTableResponsePBEEENS_6StatusERKT_PT0_ Line | Count | Source | 614 | 6.43k | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock, RespClass* resp) { | 615 | | // This covers both in progress and fully deleted objects. | 616 | 6.43k | if (lock->started_deleting()) { | 617 | 0 | Status s = STATUS_SUBSTITUTE(NotFound, | 618 | 0 | "The object '$0.$1' does not exist", lock->namespace_id(), lock->name()); | 619 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); | 620 | 0 | } | 621 | 6.43k | if (!lock->visible_to_client()) { | 622 | 0 | Status s = STATUS_SUBSTITUTE(ServiceUnavailable, | 623 | 0 | "The object '$0.$1' is not running", lock->namespace_id(), lock->name()); | 624 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); | 625 | 0 | } | 626 | 6.43k | return Status::OK(); | 627 | 6.43k | } |
catalog_manager.cc:_ZN2yb6master12_GLOBAL__N_139CheckIfTableDeletedOrNotVisibleToClientINS_11CowReadLockINS0_19PersistentTableInfoEEENS0_29IsTruncateTableDoneResponsePBEEENS_6StatusERKT_PT0_ Line | Count | Source | 614 | 8.89k | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock, RespClass* resp) { | 615 | | // This covers both in progress and fully deleted objects. | 616 | 8.89k | if (lock->started_deleting()) { | 617 | 0 | Status s = STATUS_SUBSTITUTE(NotFound, | 618 | 0 | "The object '$0.$1' does not exist", lock->namespace_id(), lock->name()); | 619 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); | 620 | 0 | } | 621 | 8.89k | if (!lock->visible_to_client()) { | 622 | 0 | Status s = STATUS_SUBSTITUTE(ServiceUnavailable, | 623 | 0 | "The object '$0.$1' is not running", lock->namespace_id(), lock->name()); | 624 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); | 625 | 0 | } | 626 | 8.89k | return Status::OK(); | 627 | 8.89k | } |
catalog_manager.cc:_ZN2yb6master12_GLOBAL__N_139CheckIfTableDeletedOrNotVisibleToClientINS_12CowWriteLockINS0_19PersistentTableInfoEEENS0_20AlterTableResponsePBEEENS_6StatusERKT_PT0_ Line | Count | Source | 614 | 2.86k | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock, RespClass* resp) { | 615 | | // This covers both in progress and fully deleted objects. | 616 | 2.86k | if (lock->started_deleting()) { | 617 | 0 | Status s = STATUS_SUBSTITUTE(NotFound, | 618 | 0 | "The object '$0.$1' does not exist", lock->namespace_id(), lock->name()); | 619 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); | 620 | 0 | } | 621 | 2.86k | if (!lock->visible_to_client()) { | 622 | 0 | Status s = STATUS_SUBSTITUTE(ServiceUnavailable, | 623 | 0 | "The object '$0.$1' is not running", lock->namespace_id(), lock->name()); | 624 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); | 625 | 0 | } | 626 | 2.86k | return Status::OK(); | 627 | 2.86k | } |
catalog_manager.cc:_ZN2yb6master12_GLOBAL__N_139CheckIfTableDeletedOrNotVisibleToClientINS_11CowReadLockINS0_19PersistentTableInfoEEENS0_26IsAlterTableDoneResponsePBEEENS_6StatusERKT_PT0_ Line | Count | Source | 614 | 649 | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock, RespClass* resp) { | 615 | | // This covers both in progress and fully deleted objects. | 616 | 649 | if (lock->started_deleting()) { | 617 | 0 | Status s = STATUS_SUBSTITUTE(NotFound, | 618 | 0 | "The object '$0.$1' does not exist", lock->namespace_id(), lock->name()); | 619 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); | 620 | 0 | } | 621 | 649 | if (!lock->visible_to_client()) { | 622 | 0 | Status s = STATUS_SUBSTITUTE(ServiceUnavailable, | 623 | 0 | "The object '$0.$1' is not running", lock->namespace_id(), lock->name()); | 624 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); | 625 | 0 | } | 626 | 649 | return Status::OK(); | 627 | 649 | } |
catalog_manager.cc:_ZN2yb6master12_GLOBAL__N_139CheckIfTableDeletedOrNotVisibleToClientINS_11CowReadLockINS0_19PersistentTableInfoEEENS0_24GetTableSchemaResponsePBEEENS_6StatusERKT_PT0_ Line | Count | Source | 614 | 115k | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock, RespClass* resp) { | 615 | | // This covers both in progress and fully deleted objects. | 616 | 115k | if (lock->started_deleting()) { | 617 | 14 | Status s = STATUS_SUBSTITUTE(NotFound, | 618 | 14 | "The object '$0.$1' does not exist", lock->namespace_id(), lock->name()); | 619 | 14 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); | 620 | 14 | } | 621 | 115k | if (!lock->visible_to_client()) { | 622 | 1 | Status s = STATUS_SUBSTITUTE(ServiceUnavailable, | 623 | 1 | "The object '$0.$1' is not running", lock->namespace_id(), lock->name()); | 624 | 1 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); | 625 | 1 | } | 626 | 115k | return Status::OK(); | 627 | 115k | } |
Unexecuted instantiation: catalog_manager.cc:_ZN2yb6master12_GLOBAL__N_139CheckIfTableDeletedOrNotVisibleToClientINS_11CowReadLockINS0_19PersistentTableInfoEEENS0_34GetColocatedTabletSchemaResponsePBEEENS_6StatusERKT_PT0_ catalog_manager.cc:_ZN2yb6master12_GLOBAL__N_139CheckIfTableDeletedOrNotVisibleToClientINS_11CowReadLockINS0_19PersistentTableInfoEEENS0_27GetTableLocationsResponsePBEEENS_6StatusERKT_PT0_ Line | Count | Source | 614 | 167k | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock, RespClass* resp) { | 615 | | // This covers both in progress and fully deleted objects. | 616 | 167k | if (lock->started_deleting()) { | 617 | 32 | Status s = STATUS_SUBSTITUTE(NotFound, | 618 | 32 | "The object '$0.$1' does not exist", lock->namespace_id(), lock->name()); | 619 | 32 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); | 620 | 32 | } | 621 | 167k | if (!lock->visible_to_client()) { | 622 | 0 | Status s = STATUS_SUBSTITUTE(ServiceUnavailable, | 623 | 0 | "The object '$0.$1' is not running", lock->namespace_id(), lock->name()); | 624 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); | 625 | 0 | } | 626 | 167k | return Status::OK(); | 627 | 167k | } |
|
628 | | |
// Evaluates `expr` through RESULT_CHECKER_HELPER. When the result is not OK, the enclosing
// function returns SetupError((resp)->mutable_error(), <status of the result>).
// `__result` is not declared here; presumably RESULT_CHECKER_HELPER introduces it — confirm
// against that macro's definition.
// NOTE(review): the expansion ends with its own ';', so a call site that adds another ';'
// produces an empty statement.
629 | 4.51k | #define VERIFY_NAMESPACE_FOUND(expr, resp) RESULT_CHECKER_HELPER( \ |
630 | 4.14k | expr, \ |
631 | 4.14k | if (!__result.ok()) { \ |
632 | 4.14k | return SetupError((resp)->mutable_error(), __result.status()); \ |
633 | 4.14k | }); |
634 | | |
635 | 2 | MasterErrorPB_Code NamespaceMasterError(SysNamespaceEntryPB_State state) { |
636 | 2 | switch (state) { |
637 | 2 | case SysNamespaceEntryPB::PREPARING: FALLTHROUGH_INTENDED; |
638 | 2 | case SysNamespaceEntryPB::DELETING: |
639 | 2 | return MasterErrorPB::IN_TRANSITION_CAN_RETRY; |
640 | 0 | case SysNamespaceEntryPB::DELETED: FALLTHROUGH_INTENDED; |
641 | 0 | case SysNamespaceEntryPB::FAILED: FALLTHROUGH_INTENDED; |
642 | 0 | case SysNamespaceEntryPB::RUNNING: |
643 | 0 | return MasterErrorPB::INTERNAL_ERROR; |
644 | 0 | default: |
645 | 0 | FATAL_INVALID_ENUM_VALUE(SysNamespaceEntryPB_State, state); |
646 | 2 | } |
647 | 2 | } |
648 | | |
649 | 255k | size_t GetNameMapperIndex(YQLDatabase db_type) { |
650 | 255k | switch (db_type) { |
651 | 0 | case YQL_DATABASE_UNKNOWN: break; |
652 | 250k | case YQL_DATABASE_CQL: return 1; |
653 | 2.47k | case YQL_DATABASE_PGSQL: return 2; |
654 | 2.45k | case YQL_DATABASE_REDIS: return 3; |
655 | 0 | } |
656 | 0 | CHECK(false) << "Unexpected db type " << db_type; |
657 | 0 | return 0; |
658 | 0 | } |
659 | | |
660 | 4.27k | bool IsIndexBackfillEnabled(TableType table_type, bool is_transactional) { |
661 | | // Fetch the runtime flag to prevent any issues from the updates to flag while processing. |
662 | 4.27k | const bool disabled = |
663 | 4.27k | (table_type == PGSQL_TABLE_TYPE |
664 | 1.59k | ? GetAtomicFlag(&FLAGS_ysql_disable_index_backfill) |
665 | 2.68k | : GetAtomicFlag(&FLAGS_disable_index_backfill) || |
666 | 2.68k | (!is_transactional && GetAtomicFlag(&FLAGS_disable_index_backfill_for_non_txn_tables))); |
667 | 4.27k | return !disabled; |
668 | 4.27k | } |
669 | | |
// Ten seconds. Presumably the default pause between runs of the background task that refreshes
// the YQL partitions table — the use site is outside this part of the file, confirm there.
constexpr std::chrono::seconds kDefaultYQLPartitionsRefreshBgTaskSleep{10};
671 | | |
672 | | void FillRetainedBySnapshotSchedules( |
673 | | const SnapshotSchedulesToObjectIdsMap& schedules_to_tables_map, |
674 | | const TableId& table_id, |
675 | 2.80k | RepeatedBytes* retained_by_snapshot_schedules) { |
676 | 0 | for (const auto& entry : schedules_to_tables_map) { |
677 | 0 | if (std::binary_search(entry.second.begin(), entry.second.end(), table_id)) { |
678 | 0 | retained_by_snapshot_schedules->Add()->assign( |
679 | 0 | entry.first.AsSlice().cdata(), entry.first.size()); |
680 | 0 | } |
681 | 0 | } |
682 | 2.80k | } |
683 | | |
684 | 5.38k | int GetTransactionTableNumShardsPerTServer() { |
685 | 5.38k | int value = 8; |
686 | 5.38k | if (IsTsan()) { |
687 | 0 | value = 2; |
688 | 5.38k | } else if (base::NumCPUs() <= 2) { |
689 | 0 | value = 4; |
690 | 0 | } |
691 | 5.38k | return value; |
692 | 5.38k | } |
693 | | |
694 | 5.45k | void InitMasterFlags() { |
695 | 5.45k | yb::InitCommonFlags(); |
696 | 5.45k | if (GetAtomicFlag(&FLAGS_transaction_table_num_tablets_per_tserver) == |
697 | 5.38k | kAutoDetectNumShardsPerTServer) { |
698 | 5.38k | const auto value = GetTransactionTableNumShardsPerTServer(); |
699 | 0 | VLOG(1) << "Auto setting FLAGS_transaction_table_num_tablets_per_tserver to " << value; |
700 | 5.38k | SetAtomicFlag(value, &FLAGS_transaction_table_num_tablets_per_tserver); |
701 | 5.38k | } |
702 | 5.45k | } |
703 | | |
704 | 5.31k | Result<bool> DoesTableExist(const Result<TableInfoPtr>& result) { |
705 | 5.31k | if (result.ok()) { |
706 | 4.65k | return true; |
707 | 4.65k | } |
708 | 653 | if (result.status().IsNotFound() |
709 | 653 | && MasterError(result.status()) == MasterErrorPB::OBJECT_NOT_FOUND) { |
710 | 653 | return false; |
711 | 653 | } |
712 | 0 | return result.status(); |
713 | 0 | } |
714 | | |
715 | | } // anonymous namespace |
716 | | |
717 | | //////////////////////////////////////////////////////////// |
718 | | // CatalogManager |
719 | | //////////////////////////////////////////////////////////// |
720 | | |
721 | | CatalogManager::NamespaceInfoMap& CatalogManager::NamespaceNameMapper::operator[]( |
722 | 13.5k | YQLDatabase db_type) { |
723 | 13.5k | return typed_maps_[GetNameMapperIndex(db_type)]; |
724 | 13.5k | } |
725 | | |
726 | | const CatalogManager::NamespaceInfoMap& CatalogManager::NamespaceNameMapper::operator[]( |
727 | 241k | YQLDatabase db_type) const { |
728 | 241k | return typed_maps_[GetNameMapperIndex(db_type)]; |
729 | 241k | } |
730 | | |
731 | 2.37k | void CatalogManager::NamespaceNameMapper::clear() { |
732 | 9.48k | for (auto& m : typed_maps_) { |
733 | 9.48k | m.clear(); |
734 | 9.48k | } |
735 | 2.37k | } |
736 | | |
// Constructs the catalog manager for `master` in the kConstructed state with
// leader_ready_term_ = -1. The body runs InitMasterFlags(), builds the three thread pools
// ("leader-initialization", "CatalogManagerBGTasks", "async-tasks") and, when `master` is
// non-null, creates the SysCatalogTable with ElectedAsLeaderCb bound to this instance.
// Pool construction goes through CHECK_OK, so a failure there is presumably fatal.
737 | | CatalogManager::CatalogManager(Master* master) |
738 | | : master_(master), |
739 | | tablet_exists_(false), |
740 | | state_(kConstructed), |
741 | | leader_ready_term_(-1), |
742 | | leader_lock_(RWMutex::Priority::PREFER_WRITING), |
743 | | load_balance_policy_(std::make_unique<ClusterLoadBalancer>(this)), |
744 | | permissions_manager_(std::make_unique<PermissionsManager>(this)), |
745 | | tasks_tracker_(new TasksTracker(IsUserInitiated::kFalse)), |
746 | | jobs_tracker_(new TasksTracker(IsUserInitiated::kTrue)), |
747 | | encryption_manager_(new EncryptionManager()), |
748 | | tablespace_manager_(std::make_shared<YsqlTablespaceManager>(nullptr, nullptr)), |
749 | | tablespace_bg_task_running_(false), |
750 | 5.45k | tablet_split_manager_(this, this, this) { |
751 | 5.45k | InitMasterFlags(); |
// Limited to a single thread; ElectedAsLeaderCb submits the sys catalog load task to this pool.
752 | 5.45k | CHECK_OK(ThreadPoolBuilder("leader-initialization") |
753 | 5.45k | .set_max_threads(1) |
754 | 5.45k | .Build(&leader_initialization_pool_)); |
755 | 5.45k | CHECK_OK(ThreadPoolBuilder("CatalogManagerBGTasks").Build(&background_tasks_thread_pool_)); |
756 | 5.45k | CHECK_OK(ThreadPoolBuilder("async-tasks").Build(&async_task_pool_)); |
757 | | |
// NOTE(review): Unretained(this) hands the sys catalog a raw pointer to this object — confirm
// the callback cannot fire after this CatalogManager has been destroyed.
758 | 5.45k | if (master_) { |
759 | 5.45k | sys_catalog_.reset(new SysCatalogTable( |
760 | 5.45k | master_, master_->metric_registry(), |
761 | 5.45k | Bind(&CatalogManager::ElectedAsLeaderCb, Unretained(this)))); |
762 | 5.45k | } |
763 | 5.45k | } |
764 | | |
765 | 92 | CatalogManager::~CatalogManager() { |
766 | 92 | if (StartShutdown()) { |
767 | 0 | CompleteShutdown(); |
768 | 0 | } |
769 | 92 | } |
770 | | |
// Brings the catalog manager from kConstructed to kRunning.
//
// Steps, in order: switch state_ to kStarting; create the YSQL DDL transaction helper (when
// master_ is set); instantiate the live/dead tablet server metrics; start InitSysCatalogAsync();
// unless the master is in shell mode, wait for the sys catalog tablet, build the universe key
// client from the other Raft masters and enable background tasks; cache the server registration;
// switch state_ to kRunning and call Started().
//
// Returns the first non-OK status produced by any of these steps, OK otherwise. The CHECK_EQ
// calls assert the expected state_ before each transition.
771 | 5.42k | Status CatalogManager::Init() { |
772 | 5.42k | { |
773 | 5.42k | std::lock_guard<simple_spinlock> l(state_lock_); |
774 | 5.42k | CHECK_EQ(kConstructed, state_); |
775 | 5.42k | state_ = kStarting; |
776 | 5.42k | } |
777 | | |
778 | 5.42k | if (master_) { |
779 | 5.42k | ysql_transaction_ = std::make_unique<YsqlTransactionDdl>( |
780 | 5.42k | sys_catalog_.get(), master_->async_client_initializer().get_client_future(), |
781 | 5.42k | background_tasks_thread_pool_.get()); |
782 | 5.42k | } |
783 | | |
// NOTE(review): from here on master_ is dereferenced without a null check, although the block
// above guards on it — confirm whether master_ can actually be null when Init() runs.
784 | | // Initialize the metrics emitted by the catalog manager. |
785 | 5.42k | metric_num_tablet_servers_live_ = |
786 | 5.42k | METRIC_num_tablet_servers_live.Instantiate(master_->metric_entity_cluster(), 0); |
787 | | |
788 | 5.42k | metric_num_tablet_servers_dead_ = |
789 | 5.42k | METRIC_num_tablet_servers_dead.Instantiate(master_->metric_entity_cluster(), 0); |
790 | | |
791 | 5.42k | RETURN_NOT_OK_PREPEND(InitSysCatalogAsync(), |
792 | 5.41k | "Failed to initialize sys tables async"); |
793 | | |
// Test-only hook: sleeps for the configured number of seconds when the flag is positive.
794 | 5.41k | if (PREDICT_FALSE(FLAGS_TEST_simulate_slow_system_tablet_bootstrap_secs > 0)) { |
795 | 9 | LOG_WITH_PREFIX(INFO) << "Simulating slow system tablet bootstrap"; |
796 | 9 | SleepFor(MonoDelta::FromSeconds(FLAGS_TEST_simulate_slow_system_tablet_bootstrap_secs)); |
797 | 9 | } |
798 | | |
799 | | // WaitUntilRunning() must run outside of the lock as to prevent |
800 | | // deadlock. This is safe as WaitUntilRunning waits for another |
801 | | // thread to finish its work and doesn't itself depend on any state |
802 | | // within CatalogManager. Need not start sys catalog or background tasks |
803 | | // when we are started in shell mode. |
804 | 5.41k | if (!master_->opts().IsShellMode()) { |
805 | 5.31k | RETURN_NOT_OK_PREPEND(sys_catalog_->WaitUntilRunning(), |
806 | 5.31k | "Failed waiting for the catalog tablet to run"); |
807 | 5.31k | std::vector<consensus::RaftPeerPB> masters_raft; |
808 | 5.31k | RETURN_NOT_OK(master_->ListRaftConfigMasters(&masters_raft)); |
// Collect the host/port of every Raft master except this node.
809 | 5.31k | std::vector<HostPort> hps; |
810 | 14.4k | for (const auto& peer : masters_raft) { |
811 | 14.4k | if (NodeInstance().permanent_uuid() == peer.permanent_uuid()) { |
812 | 5.31k | continue; |
813 | 5.31k | } |
814 | 9.17k | HostPort hp = HostPortFromPB(DesiredHostPort(peer, master_->MakeCloudInfoPB())); |
815 | 9.17k | hps.push_back(hp); |
816 | 9.17k | } |
// The callback forwards received universe keys to the encryption manager.
// NOTE(review): it captures by reference ([&]) and is stored in the client; it only touches
// members (i.e. `this`), but confirm the client cannot invoke it after this object is destroyed.
817 | 5.31k | universe_key_client_ = std::make_unique<client::UniverseKeyClient>( |
818 | 9.16k | hps, &master_->proxy_cache(), [&] (const encryption::UniverseKeysPB& universe_keys) { |
819 | 9.16k | encryption_manager_->PopulateUniverseKeys(universe_keys); |
820 | 9.16k | }); |
821 | 5.31k | universe_key_client_->GetUniverseKeyRegistryAsync(); |
822 | 5.31k | RETURN_NOT_OK(EnableBgTasks()); |
823 | 5.31k | } |
824 | | |
825 | | // Cache the server registration even for shell mode masters. See |
826 | | // https://github.com/yugabyte/yugabyte-db/issues/8065. |
827 | 5.41k | RETURN_NOT_OK(GetRegistration(&server_registration_)); |
828 | | |
829 | 5.41k | { |
830 | 5.41k | std::lock_guard<simple_spinlock> l(state_lock_); |
831 | 5.41k | CHECK_EQ(kStarting, state_); |
832 | 5.41k | state_ = kRunning; |
833 | 5.41k | } |
834 | | |
835 | 5.41k | Started(); |
836 | | |
837 | 5.41k | return Status::OK(); |
838 | 5.41k | } |
839 | | |
840 | | Status CatalogManager::ChangeEncryptionInfo(const ChangeEncryptionInfoRequestPB* req, |
841 | 0 | ChangeEncryptionInfoResponsePB* resp) { |
842 | 0 | return STATUS(InvalidCommand, "Command only supported in enterprise build."); |
843 | 0 | } |
844 | | |
845 | 2.01k | Status CatalogManager::ElectedAsLeaderCb() { |
846 | 2.01k | time_elected_leader_ = MonoTime::Now(); |
847 | 2.01k | return leader_initialization_pool_->SubmitClosure( |
848 | 2.01k | Bind(&CatalogManager::LoadSysCatalogDataTask, Unretained(this))); |
849 | 2.01k | } |
850 | | |
851 | 2.01k | Status CatalogManager::WaitUntilCaughtUpAsLeader(const MonoDelta& timeout) { |
852 | 2.01k | string uuid = master_->fs_manager()->uuid(); |
853 | 2.01k | Consensus* consensus = tablet_peer()->consensus(); |
854 | 2.01k | ConsensusStatePB cstate = consensus->ConsensusState(CONSENSUS_CONFIG_ACTIVE); |
855 | 2.01k | if (!cstate.has_leader_uuid() || cstate.leader_uuid() != uuid) { |
856 | 0 | return STATUS_SUBSTITUTE(IllegalState, |
857 | 0 | "Node $0 not leader. Consensus state: $1", uuid, cstate.ShortDebugString()); |
858 | 0 | } |
859 | | |
860 | | // Wait for all transactions to be committed. |
861 | 2.01k | const CoarseTimePoint deadline = CoarseMonoClock::now() + timeout; |
862 | 2.01k | { |
863 | 2.01k | tablet::HistoryCutoffPropagationDisabler disabler(tablet_peer()->tablet()->RetentionPolicy()); |
864 | 2.01k | RETURN_NOT_OK(tablet_peer()->operation_tracker()->WaitForAllToFinish(timeout)); |
865 | 2.01k | } |
866 | | |
867 | 2.01k | RETURN_NOT_OK(tablet_peer()->consensus()->WaitForLeaderLeaseImprecise(deadline)); |
868 | 2.01k | return Status::OK(); |
869 | 2.01k | } |
870 | | |
// Task queued by ElectedAsLeaderCb. Records the current consensus term, waits until this node
// has caught up as leader, and then loads the sys catalog for that term via VisitSysCatalog().
// On success it publishes the term in leader_ready_term_, calls SysCatalogLoaded(term) and
// regenerates the YQL partitions vtable cache.
//
// Returns early without loading when the term changed during the wait or when the wait failed
// (fatal if the wait timed out). A load failure is fatal unless shutdown is in progress or the
// term changed in the meantime.
871 | 2.01k | void CatalogManager::LoadSysCatalogDataTask() { |
872 | 2.01k | auto consensus = tablet_peer()->shared_consensus(); |
873 | 2.01k | const int64_t term = consensus->ConsensusState(CONSENSUS_CONFIG_ACTIVE).current_term(); |
874 | 2.01k | Status s = WaitUntilCaughtUpAsLeader( |
875 | 2.01k | MonoDelta::FromMilliseconds(FLAGS_master_failover_catchup_timeout_ms)); |
876 | | |
// The term is compared before looking at `s`, so a term change wins over a wait failure.
877 | 2.01k | int64_t term_after_wait = consensus->ConsensusState(CONSENSUS_CONFIG_ACTIVE).current_term(); |
878 | 2.01k | if (term_after_wait != term) { |
879 | | // If we got elected leader again while waiting to catch up then we will get another callback to |
880 | | // update state from sys_catalog, so bail now. |
881 | | // |
882 | | // If we failed when waiting, i.e. could not acquire a leader lease, this could be due to us |
883 | | // becoming a follower. If we're not partitioned away, we'll know about a new term soon. |
884 | 1 | LOG_WITH_PREFIX(INFO) |
885 | 1 | << "Term change from " << term << " to " << term_after_wait |
886 | 1 | << " while waiting for master leader catchup. Not loading sys catalog metadata. " |
887 | 1 | << "Status of waiting: " << s; |
888 | 1 | return; |
889 | 1 | } |
890 | | |
891 | 2.01k | if (!s.ok()) { |
892 | | // This could happen e.g. if we are a partitioned-away leader that failed to acquire a leader |
893 | | // lease. |
894 | | // |
895 | | // TODO: handle this cleanly by transitioning to a follower without crashing. |
896 | 0 | LOG_WITH_PREFIX(WARNING) << "Failed waiting for node to catch up after master election: " << s; |
897 |

898 | 0 | if (s.IsTimedOut()) { |
899 | 0 | LOG_WITH_PREFIX(FATAL) << "Shutting down due to unavailability of other masters after" |
900 | 0 | << " election. TODO: Abdicate instead."; |
901 | 0 | } |
902 | 0 | return; |
903 | 0 | } |
904 | | |
905 | 2.01k | LOG_WITH_PREFIX(INFO) << "Loading table and tablet metadata into memory for term " << term; |
906 | 2.01k | LOG_SLOW_EXECUTION(WARNING, 1000, LogPrefix() + "Loading metadata into memory") { |
907 | 2.00k | Status status = VisitSysCatalog(term); |
908 | 2.00k | if (!status.ok()) { |
// state_lock_ is held only while reading state_; it is released before the term re-check.
909 | 1 | { |
910 | 1 | std::lock_guard<simple_spinlock> l(state_lock_); |
911 | 1 | if (state_ == kClosing) { |
912 | 0 | LOG_WITH_PREFIX(INFO) |
913 | 0 | << "Error loading sys catalog; because shutdown is in progress. term " << term |
914 | 0 | << " status : " << status; |
915 | 0 | return; |
916 | 0 | } |
917 | 1 | } |
918 | 1 | auto new_term = consensus->ConsensusState(CONSENSUS_CONFIG_ACTIVE).current_term(); |
919 | 1 | if (new_term != term) { |
920 | 0 | LOG_WITH_PREFIX(INFO) |
921 | 0 | << "Error loading sys catalog; but that's OK as term was changed from " << term |
922 | 0 | << " to " << new_term << ": " << status; |
923 | 0 | return; |
924 | 0 | } |
925 | 1 | LOG_WITH_PREFIX(FATAL) << "Failed to load sys catalog: " << status; |
926 | 1 | } |
927 | 2.00k | } |
928 | | |
// Publish the term for which the catalog is loaded, under state_lock_.
929 | 2.01k | { |
930 | 2.01k | std::lock_guard<simple_spinlock> l(state_lock_); |
931 | 2.01k | leader_ready_term_ = term; |
932 | 2.01k | LOG_WITH_PREFIX(INFO) << "Completed load of sys catalog in term " << term; |
933 | 2.01k | } |
934 | 2.01k | SysCatalogLoaded(term); |
935 | | // Once we have loaded the SysCatalog, reset and regenerate the yql partitions table in order to |
936 | | // regenerate entries for previous tables. |
937 | 2.01k | GetYqlPartitionsVtable().ResetAndRegenerateCache(); |
938 | 2.01k | } |
939 | | |
940 | 429 | CHECKED_STATUS CatalogManager::WaitForWorkerPoolTests(const MonoDelta& timeout) const { |
941 | 429 | if (!async_task_pool_->WaitFor(timeout)) { |
942 | 0 | return STATUS(TimedOut, "Worker Pool hasn't finished processing tasks"); |
943 | 0 | } |
944 | 429 | return Status::OK(); |
945 | 429 | } |
946 | | |
947 | 2.00k | Status CatalogManager::VisitSysCatalog(int64_t term) { |
948 | | // Block new catalog operations, and wait for existing operations to finish. |
949 | 2.00k | LOG_WITH_PREFIX_AND_FUNC(INFO) |
950 | 2.00k | << "Wait on leader_lock_ for any existing operations to finish. Term: " << term; |
951 | 2.00k | auto start = std::chrono::steady_clock::now(); |
952 | 2.00k | std::lock_guard<RWMutex> leader_lock_guard(leader_lock_); |
953 | 2.00k | auto finish = std::chrono::steady_clock::now(); |
954 | | |
955 | 2.00k | static const auto kLongLockAcquisitionLimit = RegularBuildVsSanitizers(100ms, 750ms); |
956 | 2.00k | if (finish > start + kLongLockAcquisitionLimit) { |
957 | 0 | LOG_WITH_PREFIX(WARNING) << "Long wait on leader_lock_: " << yb::ToString(finish - start); |
958 | 0 | } |
959 | | |
960 | 2.00k | LOG_WITH_PREFIX(INFO) |
961 | 2.00k | << __func__ << ": Acquire catalog manager lock_ before loading sys catalog."; |
962 | 2.00k | LockGuard lock(mutex_); |
963 | 0 | VLOG_WITH_FUNC(3) << "Acquired the catalog manager lock"; |
964 | | |
965 | | // Abort any outstanding tasks. All TableInfos are orphaned below, so |
966 | | // it's important to end their tasks now; otherwise Shutdown() will |
967 | | // destroy master state used by these tasks. |
968 | 2.00k | std::vector<scoped_refptr<TableInfo>> tables; |
969 | 2.00k | AppendValuesFromMap(*table_ids_map_, &tables); |
970 | 2.00k | AbortAndWaitForAllTasks(tables); |
971 | | |
972 | | // Clear internal maps and run data loaders. |
973 | 2.00k | RETURN_NOT_OK(RunLoaders(term)); |
974 | | |
975 | | // Prepare various default system configurations. |
976 | 2.00k | RETURN_NOT_OK(PrepareDefaultSysConfig(term)); |
977 | | |
978 | 2.00k | if ((FLAGS_use_initial_sys_catalog_snapshot || FLAGS_enable_ysql) && |
979 | 451 | !FLAGS_initial_sys_catalog_snapshot_path.empty() && |
980 | 365 | !FLAGS_create_initial_sys_catalog_snapshot) { |
981 | 365 | if (!namespace_ids_map_.empty() || !system_tablets_.empty()) { |
982 | 4 | LOG_WITH_PREFIX(INFO) |
983 | 4 | << "This is an existing cluster, not initializing from a sys catalog snapshot."; |
984 | 361 | } else { |
985 | 361 | Result<bool> dir_exists = |
986 | 361 | Env::Default()->DoesDirectoryExist(FLAGS_initial_sys_catalog_snapshot_path); |
987 | 361 | if (dir_exists.ok() && *dir_exists) { |
988 | 361 | bool initdb_was_already_done = false; |
989 | 361 | { |
990 | 361 | auto l = ysql_catalog_config_->LockForRead(); |
991 | 361 | initdb_was_already_done = l->pb.ysql_catalog_config().initdb_done(); |
992 | 361 | } |
993 | 361 | if (initdb_was_already_done) { |
994 | 0 | LOG_WITH_PREFIX(INFO) |
995 | 0 | << "initdb has been run before, no need to restore sys catalog from " |
996 | 0 | << "the initial snapshot"; |
997 | 361 | } else { |
998 | 361 | LOG_WITH_PREFIX(INFO) << "Restoring snapshot in sys catalog"; |
999 | 361 | Status restore_status = RestoreInitialSysCatalogSnapshot( |
1000 | 361 | FLAGS_initial_sys_catalog_snapshot_path, |
1001 | 361 | sys_catalog_->tablet_peer().get(), |
1002 | 361 | term); |
1003 | 361 | if (!restore_status.ok()) { |
1004 | 0 | LOG_WITH_PREFIX(ERROR) << "Failed restoring snapshot in sys catalog"; |
1005 | 0 | return restore_status; |
1006 | 0 | } |
1007 | | |
1008 | 361 | LOG_WITH_PREFIX(INFO) << "Re-initializing cluster config"; |
1009 | 361 | cluster_config_.reset(); |
1010 | 361 | RETURN_NOT_OK(PrepareDefaultClusterConfig(term)); |
1011 | | |
1012 | 361 | LOG_WITH_PREFIX(INFO) << "Restoring snapshot completed, considering initdb finished"; |
1013 | 361 | RETURN_NOT_OK(InitDbFinished(Status::OK(), term)); |
1014 | 361 | RETURN_NOT_OK(RunLoaders(term)); |
1015 | 361 | } |
1016 | 0 | } else { |
1017 | 0 | LOG_WITH_PREFIX(WARNING) |
1018 | 0 | << "Initial sys catalog snapshot directory does not exist: " |
1019 | 0 | << FLAGS_initial_sys_catalog_snapshot_path |
1020 | 0 | << (dir_exists.ok() ? "" : ", status: " + dir_exists.status().ToString()); |
1021 | 0 | } |
1022 | 361 | } |
1023 | 365 | } |
1024 | | |
1025 | | // Create the system namespaces (created only if they don't already exist). |
1026 | 2.00k | RETURN_NOT_OK(PrepareDefaultNamespaces(term)); |
1027 | | |
1028 | | // Create the system tables (created only if they don't already exist). |
1029 | 2.00k | RETURN_NOT_OK(PrepareSystemTables(term)); |
1030 | | |
1031 | | // Create the default cassandra (created only if they don't already exist). |
1032 | 2.00k | RETURN_NOT_OK(permissions_manager_->PrepareDefaultRoles(term)); |
1033 | | |
1034 | | // If this is the first time we start up, we have no config information as default. We write an |
1035 | | // empty version 0. |
1036 | 2.00k | RETURN_NOT_OK(PrepareDefaultClusterConfig(term)); |
1037 | | |
1038 | 2.00k | permissions_manager_->BuildRecursiveRoles(); |
1039 | | |
1040 | 2.00k | if (FLAGS_enable_ysql) { |
1041 | | // Number of TS to wait for before creating the txn table. |
1042 | 450 | auto wait_ts_count = std::max(FLAGS_txn_table_wait_min_ts_count, FLAGS_replication_factor); |
1043 | | |
1044 | 450 | LOG_WITH_PREFIX(INFO) |
1045 | 450 | << "YSQL is enabled, will create the transaction status table when " |
1046 | 450 | << wait_ts_count << " tablet servers are online"; |
1047 | 391 | master_->ts_manager()->SetTSCountCallback(wait_ts_count, [this, wait_ts_count] { |
1048 | 391 | LOG_WITH_PREFIX(INFO) |
1049 | 391 | << wait_ts_count |
1050 | 391 | << " tablet servers registered, creating the transaction status table"; |
1051 | | // Retry table creation until it succeeds. It might fail initially because placement UUID |
1052 | | // of live replicas is set through an RPC from YugaWare, and we won't be able to calculate |
1053 | | // the number of primary (non-read-replica) tablet servers until that happens. |
1054 | 398 | while (true) { |
1055 | 393 | const auto s = CreateGlobalTransactionStatusTableIfNeeded(/* rpc */ nullptr); |
1056 | 393 | if (s.ok()) { |
1057 | 386 | break; |
1058 | 386 | } |
1059 | 7 | LOG_WITH_PREFIX(WARNING) << "Failed creating transaction status table, waiting: " << s; |
1060 | 7 | if (s.IsShutdownInProgress()) { |
1061 | 0 | return; |
1062 | 0 | } |
1063 | 7 | auto role = Role(); |
1064 | 7 | if (role != PeerRole::LEADER) { |
1065 | 0 | LOG_WITH_PREFIX(WARNING) |
1066 | 0 | << "Cancel creating transaction because of role: " << PeerRole_Name(role); |
1067 | 0 | return; |
1068 | 0 | } |
1069 | 7 | SleepFor(MonoDelta::FromSeconds(1)); |
1070 | 7 | } |
1071 | 391 | LOG_WITH_PREFIX(INFO) << "Finished creating transaction status table asynchronously"; |
1072 | 391 | }); |
1073 | 450 | } |
1074 | | |
1075 | 2.00k | if (!StartRunningInitDbIfNeeded(term)) { |
1076 | | // If we are not running initdb, this is an existing cluster, and we need to check whether we |
1077 | | // need to do a one-time migration to make YSQL system catalog tables transactional. |
1078 | 2.00k | RETURN_NOT_OK(MakeYsqlSysCatalogTablesTransactional( |
1079 | 2.00k | table_ids_map_.CheckOut().get_ptr(), sys_catalog_.get(), ysql_catalog_config_.get(), term)); |
1080 | 2.00k | } |
1081 | | |
1082 | 2.00k | return Status::OK(); |
1083 | 2.00k | } |
1084 | | |
1085 | | template <class Loader> |
1086 | 18.9k | Status CatalogManager::Load(const std::string& title, const int64_t term) { |
1087 | 18.9k | LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory."; |
1088 | 18.9k | std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term); |
1089 | 18.9k | RETURN_NOT_OK_PREPEND( |
1090 | 18.9k | sys_catalog_->Visit(loader.get()), |
1091 | 18.9k | "Failed while visiting " + title + " in sys catalog"); |
1092 | 18.9k | return Status::OK(); |
1093 | 18.9k | } _ZN2yb6master14CatalogManager4LoadINS0_10RoleLoaderEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEEx Line | Count | Source | 1086 | 2.37k | Status CatalogManager::Load(const std::string& title, const int64_t term) { | 1087 | 2.37k | LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory."; | 1088 | 2.37k | std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term); | 1089 | 2.37k | RETURN_NOT_OK_PREPEND( | 1090 | 2.37k | sys_catalog_->Visit(loader.get()), | 1091 | 2.37k | "Failed while visiting " + title + " in sys catalog"); | 1092 | 2.37k | return Status::OK(); | 1093 | 2.37k | } |
_ZN2yb6master14CatalogManager4LoadINS0_15SysConfigLoaderEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEEx Line | Count | Source | 1086 | 2.37k | Status CatalogManager::Load(const std::string& title, const int64_t term) { | 1087 | 2.37k | LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory."; | 1088 | 2.37k | std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term); | 1089 | 2.37k | RETURN_NOT_OK_PREPEND( | 1090 | 2.37k | sys_catalog_->Visit(loader.get()), | 1091 | 2.37k | "Failed while visiting " + title + " in sys catalog"); | 1092 | 2.37k | return Status::OK(); | 1093 | 2.37k | } |
_ZN2yb6master14CatalogManager4LoadINS0_11TableLoaderEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEEx Line | Count | Source | 1086 | 2.37k | Status CatalogManager::Load(const std::string& title, const int64_t term) { | 1087 | 2.37k | LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory."; | 1088 | 2.37k | std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term); | 1089 | 2.37k | RETURN_NOT_OK_PREPEND( | 1090 | 2.37k | sys_catalog_->Visit(loader.get()), | 1091 | 2.37k | "Failed while visiting " + title + " in sys catalog"); | 1092 | 2.37k | return Status::OK(); | 1093 | 2.37k | } |
_ZN2yb6master14CatalogManager4LoadINS0_12TabletLoaderEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEEx Line | Count | Source | 1086 | 2.37k | Status CatalogManager::Load(const std::string& title, const int64_t term) { | 1087 | 2.37k | LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory."; | 1088 | 2.37k | std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term); | 1089 | 2.37k | RETURN_NOT_OK_PREPEND( | 1090 | 2.37k | sys_catalog_->Visit(loader.get()), | 1091 | 2.37k | "Failed while visiting " + title + " in sys catalog"); | 1092 | 2.37k | return Status::OK(); | 1093 | 2.37k | } |
_ZN2yb6master14CatalogManager4LoadINS0_15NamespaceLoaderEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEEx Line | Count | Source | 1086 | 2.37k | Status CatalogManager::Load(const std::string& title, const int64_t term) { | 1087 | 2.37k | LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory."; | 1088 | 2.37k | std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term); | 1089 | 2.37k | RETURN_NOT_OK_PREPEND( | 1090 | 2.37k | sys_catalog_->Visit(loader.get()), | 1091 | 2.37k | "Failed while visiting " + title + " in sys catalog"); | 1092 | 2.37k | return Status::OK(); | 1093 | 2.37k | } |
_ZN2yb6master14CatalogManager4LoadINS0_12UDTypeLoaderEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEEx Line | Count | Source | 1086 | 2.37k | Status CatalogManager::Load(const std::string& title, const int64_t term) { | 1087 | 2.37k | LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory."; | 1088 | 2.37k | std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term); | 1089 | 2.37k | RETURN_NOT_OK_PREPEND( | 1090 | 2.37k | sys_catalog_->Visit(loader.get()), | 1091 | 2.37k | "Failed while visiting " + title + " in sys catalog"); | 1092 | 2.37k | return Status::OK(); | 1093 | 2.37k | } |
_ZN2yb6master14CatalogManager4LoadINS0_19ClusterConfigLoaderEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEEx Line | Count | Source | 1086 | 2.37k | Status CatalogManager::Load(const std::string& title, const int64_t term) { | 1087 | 2.37k | LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory."; | 1088 | 2.37k | std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term); | 1089 | 2.37k | RETURN_NOT_OK_PREPEND( | 1090 | 2.37k | sys_catalog_->Visit(loader.get()), | 1091 | 2.37k | "Failed while visiting " + title + " in sys catalog"); | 1092 | 2.37k | return Status::OK(); | 1093 | 2.37k | } |
_ZN2yb6master14CatalogManager4LoadINS0_17RedisConfigLoaderEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEEx Line | Count | Source | 1086 | 2.37k | Status CatalogManager::Load(const std::string& title, const int64_t term) { | 1087 | 2.37k | LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory."; | 1088 | 2.37k | std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term); | 1089 | 2.37k | RETURN_NOT_OK_PREPEND( | 1090 | 2.37k | sys_catalog_->Visit(loader.get()), | 1091 | 2.37k | "Failed while visiting " + title + " in sys catalog"); | 1092 | 2.37k | return Status::OK(); | 1093 | 2.37k | } |
|
1094 | | |
1095 | 2.37k | Status CatalogManager::RunLoaders(int64_t term) { |
1096 | | // Clear the table and tablet state. |
1097 | 2.37k | table_names_map_.clear(); |
1098 | 2.37k | transaction_table_ids_set_.clear(); |
1099 | 2.37k | auto table_ids_map_checkout = table_ids_map_.CheckOut(); |
1100 | 2.37k | table_ids_map_checkout->clear(); |
1101 | | |
1102 | 2.37k | auto tablet_map_checkout = tablet_map_.CheckOut(); |
1103 | 2.37k | tablet_map_checkout->clear(); |
1104 | | |
1105 | | // Clear the namespace mappings. |
1106 | 2.37k | namespace_ids_map_.clear(); |
1107 | 2.37k | namespace_names_mapper_.clear(); |
1108 | | |
1109 | | // Clear the type mappings. |
1110 | 2.37k | udtype_ids_map_.clear(); |
1111 | 2.37k | udtype_names_map_.clear(); |
1112 | | |
1113 | | // Clear the current cluster config. |
1114 | 2.37k | cluster_config_.reset(); |
1115 | | |
1116 | | // Clear redis config mapping. |
1117 | 2.37k | redis_config_map_.clear(); |
1118 | | |
1119 | | // Clear ysql catalog config. |
1120 | 2.37k | ysql_catalog_config_.reset(); |
1121 | | |
1122 | | // Clear transaction tables config. |
1123 | 2.37k | transaction_tables_config_.reset(); |
1124 | | |
1125 | | // Clear recent tasks. |
1126 | 2.37k | tasks_tracker_->Reset(); |
1127 | | |
1128 | | // Clear recent jobs. |
1129 | 2.37k | jobs_tracker_->Reset(); |
1130 | | |
1131 | 2.37k | std::vector<std::shared_ptr<TSDescriptor>> descs; |
1132 | 2.37k | master_->ts_manager()->GetAllDescriptors(&descs); |
1133 | 3 | for (const auto& ts_desc : descs) { |
1134 | 3 | ts_desc->set_has_tablet_report(false); |
1135 | 3 | } |
1136 | | |
1137 | 2.37k | { |
1138 | 2.37k | LockGuard lock(permissions_manager()->mutex()); |
1139 | | |
1140 | | // Clear the roles mapping. |
1141 | 2.37k | permissions_manager()->ClearRolesUnlocked(); |
1142 | 2.37k | RETURN_NOT_OK(Load<RoleLoader>("roles", term)); |
1143 | 2.37k | RETURN_NOT_OK(Load<SysConfigLoader>("sys config", term)); |
1144 | 2.37k | } |
1145 | | // Clear the hidden tablets vector. |
1146 | 2.37k | hidden_tablets_.clear(); |
1147 | | |
1148 | 2.37k | RETURN_NOT_OK(Load<TableLoader>("tables", term)); |
1149 | 2.37k | RETURN_NOT_OK(Load<TabletLoader>("tablets", term)); |
1150 | 2.37k | RETURN_NOT_OK(Load<NamespaceLoader>("namespaces", term)); |
1151 | 2.37k | RETURN_NOT_OK(Load<UDTypeLoader>("user-defined types", term)); |
1152 | 2.37k | RETURN_NOT_OK(Load<ClusterConfigLoader>("cluster configuration", term)); |
1153 | 2.37k | RETURN_NOT_OK(Load<RedisConfigLoader>("Redis config", term)); |
1154 | | |
1155 | 2.37k | if (!transaction_tables_config_) { |
1156 | 1.94k | RETURN_NOT_OK(InitializeTransactionTablesConfig(term)); |
1157 | 1.94k | } |
1158 | | |
1159 | 2.37k | return Status::OK(); |
1160 | 2.37k | } |
1161 | | |
1162 | | Status CatalogManager::CheckResource( |
1163 | | const GrantRevokePermissionRequestPB* req, |
1164 | 721 | GrantRevokePermissionResponsePB* resp) { |
1165 | 721 | scoped_refptr<TableInfo> table; |
1166 | | |
1167 | | // Checking if resources exist. |
1168 | 721 | if (req->resource_type() == ResourceType::TABLE || |
1169 | 522 | req->resource_type() == ResourceType::KEYSPACE) { |
1170 | | // We can't match Apache Cassandra's error because when a namespace is not provided, the error |
1171 | | // is detected by the semantic analysis in PTQualifiedName::AnalyzeName. |
1172 | 435 | DCHECK(req->has_namespace_()); |
1173 | 435 | const auto& namespace_info = req->namespace_(); |
1174 | 435 | auto ns = FindNamespace(namespace_info); |
1175 | | |
1176 | 435 | if (req->resource_type() == ResourceType::KEYSPACE) { |
1177 | 236 | if (!ns.ok()) { |
1178 | | // Matches Apache Cassandra's error. |
1179 | 0 | Status s = STATUS_SUBSTITUTE( |
1180 | 0 | NotFound, "Resource <keyspace $0> doesn't exist", namespace_info.name()); |
1181 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s); |
1182 | 0 | } |
1183 | 199 | } else { |
1184 | 199 | if (ns.ok()) { |
1185 | 199 | CatalogManager::SharedLock l(mutex_); |
1186 | 199 | table = FindPtrOrNull(table_names_map_, {(**ns).id(), req->resource_name()}); |
1187 | 199 | } |
1188 | 199 | if (table == nullptr) { |
1189 | | // Matches Apache Cassandra's error. |
1190 | 0 | Status s = STATUS_SUBSTITUTE( |
1191 | 0 | NotFound, "Resource <object '$0.$1'> doesn't exist", |
1192 | 0 | namespace_info.name(), req->resource_name()); |
1193 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); |
1194 | 0 | } |
1195 | 721 | } |
1196 | 435 | } |
1197 | 721 | return Status::OK(); |
1198 | 721 | } |
1199 | | |
1200 | 2.36k | Status CatalogManager::PrepareDefaultClusterConfig(int64_t term) { |
1201 | 2.36k | if (cluster_config_) { |
1202 | 421 | LOG_WITH_PREFIX(INFO) |
1203 | 421 | << "Cluster configuration has already been set up, skipping re-initialization."; |
1204 | 421 | return Status::OK(); |
1205 | 421 | } |
1206 | | |
1207 | | // Create default. |
1208 | 1.94k | SysClusterConfigEntryPB config; |
1209 | 1.94k | config.set_version(0); |
1210 | | |
1211 | 1.94k | std::string cluster_uuid_source; |
1212 | 1.94k | if (!FLAGS_cluster_uuid.empty()) { |
1213 | 1 | RETURN_NOT_OK(Uuid::FromString(FLAGS_cluster_uuid)); |
1214 | 0 | config.set_cluster_uuid(FLAGS_cluster_uuid); |
1215 | 0 | cluster_uuid_source = "from the --cluster_uuid flag"; |
1216 | 1.94k | } else { |
1217 | 1.94k | auto uuid = Uuid::Generate(); |
1218 | 1.94k | config.set_cluster_uuid(uuid.ToString()); |
1219 | 1.94k | cluster_uuid_source = "(randomly generated)"; |
1220 | 1.94k | } |
1221 | 1.94k | LOG_WITH_PREFIX(INFO) |
1222 | 1.94k | << "Setting cluster UUID to " << config.cluster_uuid() << " " << cluster_uuid_source; |
1223 | | |
1224 | | // Create in memory object. |
1225 | 1.94k | cluster_config_ = new ClusterConfigInfo(); |
1226 | | |
1227 | | // Prepare write. |
1228 | 1.94k | auto l = cluster_config_->LockForWrite(); |
1229 | 1.94k | l.mutable_data()->pb = std::move(config); |
1230 | | |
1231 | | // Write to sys_catalog and in memory. |
1232 | 1.94k | RETURN_NOT_OK(sys_catalog_->Upsert(term, cluster_config_)); |
1233 | 1.94k | l.Commit(); |
1234 | | |
1235 | 1.94k | return Status::OK(); |
1236 | 1.94k | } |
1237 | | |
1238 | 18.9k | std::vector<std::string> CatalogManager::GetMasterAddresses() { |
1239 | 18.9k | std::vector<std::string> result; |
1240 | 18.9k | consensus::ConsensusStatePB state; |
1241 | 18.9k | auto status = GetCurrentConfig(&state); |
1242 | 18.9k | if (!status.ok()) { |
1243 | 11.6k | LOG(WARNING) << "Failed to get current config: " << status; |
1244 | 11.6k | return result; |
1245 | 11.6k | } |
1246 | 19.8k | for (const auto& peer : state.config().peers()) { |
1247 | 19.8k | std::vector<std::string> peer_addresses; |
1248 | 39.7k | for (const auto& list : {peer.last_known_private_addr(), peer.last_known_broadcast_addr()}) { |
1249 | 20.0k | for (const auto& entry : list) { |
1250 | 20.0k | peer_addresses.push_back(HostPort::FromPB(entry).ToString()); |
1251 | 20.0k | } |
1252 | 39.7k | } |
1253 | 19.8k | if (!peer_addresses.empty()) { |
1254 | 19.8k | result.push_back(JoinStrings(peer_addresses, ",")); |
1255 | 19.8k | } |
1256 | 19.8k | } |
1257 | 7.28k | return result; |
1258 | 7.28k | } |
1259 | | |
1260 | 2.00k | Status CatalogManager::PrepareDefaultSysConfig(int64_t term) { |
1261 | 2.00k | { |
1262 | 2.00k | LockGuard lock(permissions_manager()->mutex()); |
1263 | 2.00k | RETURN_NOT_OK(permissions_manager()->PrepareDefaultSecurityConfigUnlocked(term)); |
1264 | 2.00k | } |
1265 | | |
1266 | 2.00k | if (!ysql_catalog_config_) { |
1267 | 1.94k | SysYSQLCatalogConfigEntryPB ysql_catalog_config; |
1268 | 1.94k | ysql_catalog_config.set_version(0); |
1269 | | |
1270 | | // Create in memory objects. |
1271 | 1.94k | ysql_catalog_config_ = new SysConfigInfo(kYsqlCatalogConfigType); |
1272 | | |
1273 | | // Prepare write. |
1274 | 1.94k | auto l = ysql_catalog_config_->LockForWrite(); |
1275 | 1.94k | *l.mutable_data()->pb.mutable_ysql_catalog_config() = std::move(ysql_catalog_config); |
1276 | | |
1277 | | // Write to sys_catalog and in memory. |
1278 | 1.94k | RETURN_NOT_OK(sys_catalog_->Upsert(term, ysql_catalog_config_)); |
1279 | 1.94k | l.Commit(); |
1280 | 1.94k | } |
1281 | | |
1282 | 2.00k | if (!transaction_tables_config_) { |
1283 | 0 | RETURN_NOT_OK(InitializeTransactionTablesConfig(term)); |
1284 | 0 | } |
1285 | | |
1286 | 2.00k | return Status::OK(); |
1287 | 2.00k | } |
1288 | | |
1289 | 2.00k | bool CatalogManager::StartRunningInitDbIfNeeded(int64_t term) { |
1290 | 2.00k | if (!ShouldAutoRunInitDb(ysql_catalog_config_.get(), pg_proc_exists_)) { |
1291 | 2.00k | return false; |
1292 | 2.00k | } |
1293 | | |
1294 | 0 | string master_addresses_str = MasterAddressesToString( |
1295 | 0 | *master_->opts().GetMasterAddresses()); |
1296 | |
|
1297 | 0 | initdb_future_ = std::async(std::launch::async, [this, master_addresses_str, term] { |
1298 | 0 | if (FLAGS_create_initial_sys_catalog_snapshot) { |
1299 | 0 | initial_snapshot_writer_.emplace(); |
1300 | 0 | } |
1301 | |
|
1302 | 0 | Status status = PgWrapper::InitDbForYSQL( |
1303 | 0 | master_addresses_str, "/tmp", master_->GetSharedMemoryFd()); |
1304 | |
|
1305 | 0 | if (FLAGS_create_initial_sys_catalog_snapshot && status.ok()) { |
1306 | 0 | Status write_snapshot_status = initial_snapshot_writer_->WriteSnapshot( |
1307 | 0 | sys_catalog_->tablet_peer()->tablet(), |
1308 | 0 | FLAGS_initial_sys_catalog_snapshot_path); |
1309 | 0 | if (!write_snapshot_status.ok()) { |
1310 | 0 | status = write_snapshot_status; |
1311 | 0 | } |
1312 | 0 | } |
1313 | 0 | Status finish_status = InitDbFinished(status, term); |
1314 | 0 | if (!finish_status.ok()) { |
1315 | 0 | if (status.ok()) { |
1316 | 0 | status = finish_status; |
1317 | 0 | } |
1318 | 0 | LOG_WITH_PREFIX(WARNING) |
1319 | 0 | << "Failed to set initdb as finished in sys catalog: " << finish_status; |
1320 | 0 | } |
1321 | 0 | return status; |
1322 | 0 | }); |
1323 | 0 | return true; |
1324 | 0 | } |
1325 | | |
1326 | 2.00k | Status CatalogManager::PrepareDefaultNamespaces(int64_t term) { |
1327 | 2.00k | RETURN_NOT_OK(PrepareNamespace( |
1328 | 2.00k | YQL_DATABASE_CQL, kSystemNamespaceName, kSystemNamespaceId, term)); |
1329 | 2.00k | RETURN_NOT_OK(PrepareNamespace( |
1330 | 2.00k | YQL_DATABASE_CQL, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term)); |
1331 | 2.00k | RETURN_NOT_OK(PrepareNamespace( |
1332 | 2.00k | YQL_DATABASE_CQL, kSystemAuthNamespaceName, kSystemAuthNamespaceId, term)); |
1333 | 2.00k | return Status::OK(); |
1334 | 2.00k | } |
1335 | | |
1336 | 2.00k | Status CatalogManager::PrepareSystemTables(int64_t term) { |
1337 | | // Prepare sys catalog table. |
1338 | 2.00k | RETURN_NOT_OK(PrepareSysCatalogTable(term)); |
1339 | | |
1340 | | // Create the required system tables here. |
1341 | 2.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<PeersVTable>( |
1342 | 2.00k | kSystemPeersTableName, kSystemNamespaceName, kSystemNamespaceId, term))); |
1343 | 2.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<LocalVTable>( |
1344 | 2.00k | kSystemLocalTableName, kSystemNamespaceName, kSystemNamespaceId, term))); |
1345 | 2.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLKeyspacesVTable>( |
1346 | 2.00k | kSystemSchemaKeyspacesTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, |
1347 | 2.00k | term))); |
1348 | 2.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLTablesVTable>( |
1349 | 2.00k | kSystemSchemaTablesTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term))); |
1350 | 2.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLColumnsVTable>( |
1351 | 2.00k | kSystemSchemaColumnsTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term))); |
1352 | 2.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLSizeEstimatesVTable>( |
1353 | 2.00k | kSystemSizeEstimatesTableName, kSystemNamespaceName, kSystemNamespaceId, term))); |
1354 | | |
1355 | | // Empty tables. |
1356 | 2.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLAggregatesVTable>( |
1357 | 2.00k | kSystemSchemaAggregatesTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, |
1358 | 2.00k | term))); |
1359 | 2.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLFunctionsVTable>( |
1360 | 2.00k | kSystemSchemaFunctionsTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, |
1361 | 2.00k | term))); |
1362 | 2.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLIndexesVTable>( |
1363 | 2.00k | kSystemSchemaIndexesTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term))); |
1364 | 2.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLTriggersVTable>( |
1365 | 2.00k | kSystemSchemaTriggersTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term))); |
1366 | 2.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLViewsVTable>( |
1367 | 2.00k | kSystemSchemaViewsTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term))); |
1368 | 2.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<QLTypesVTable>( |
1369 | 2.00k | kSystemSchemaTypesTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term))); |
1370 | 2.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLPartitionsVTable>( |
1371 | 2.00k | kSystemPartitionsTableName, kSystemNamespaceName, kSystemNamespaceId, term))); |
1372 | | |
1373 | | // System auth tables. |
1374 | 2.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLAuthRolesVTable>( |
1375 | 2.00k | kSystemAuthRolesTableName, kSystemAuthNamespaceName, kSystemAuthNamespaceId, term))); |
1376 | 2.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLAuthRolePermissionsVTable>( |
1377 | 2.00k | kSystemAuthRolePermissionsTableName, kSystemAuthNamespaceName, kSystemAuthNamespaceId, |
1378 | 2.00k | term))); |
1379 | 2.00k | RETURN_NOT_OK((PrepareSystemTableTemplate<YQLAuthResourceRolePermissionsIndexVTable>( |
1380 | 2.00k | kSystemAuthResourceRolePermissionsIndexTableName, kSystemAuthNamespaceName, |
1381 | 2.00k | kSystemAuthNamespaceId, term))); |
1382 | | |
1383 | | // Ensure kNumSystemTables is in-sync with the system tables created. |
1384 | 1 | LOG_IF(DFATAL, system_tablets_.size() != kNumSystemTables) |
1385 | 1 | << "kNumSystemTables is " << kNumSystemTables << " but " << system_tablets_.size() |
1386 | 1 | << " tables were created"; |
1387 | | |
1388 | | // Cache the system.partitions tablet so we can access it in RebuildYQLSystemPartitions. |
1389 | 2.00k | RETURN_NOT_OK(GetYQLPartitionsVTable(&system_partitions_tablet_)); |
1390 | | |
1391 | 2.00k | return Status::OK(); |
1392 | 2.00k | } |
1393 | | |
1394 | 2.00k | Status CatalogManager::PrepareSysCatalogTable(int64_t term) { |
1395 | | // Prepare sys catalog table info. |
1396 | 2.00k | auto sys_catalog_table_iter = table_ids_map_->find(kSysCatalogTableId); |
1397 | 2.00k | if (sys_catalog_table_iter == table_ids_map_->end()) { |
1398 | 1.58k | scoped_refptr<TableInfo> table = NewTableInfo(kSysCatalogTableId); |
1399 | 1.58k | table->mutable_metadata()->StartMutation(); |
1400 | 1.58k | SysTablesEntryPB& metadata = table->mutable_metadata()->mutable_dirty()->pb; |
1401 | 1.58k | metadata.set_state(SysTablesEntryPB::RUNNING); |
1402 | 1.58k | metadata.set_namespace_id(kSystemSchemaNamespaceId); |
1403 | 1.58k | metadata.set_name(kSysCatalogTableName); |
1404 | 1.58k | metadata.set_table_type(TableType::YQL_TABLE_TYPE); |
1405 | 1.58k | SchemaToPB(*sys_catalog_->schema_, metadata.mutable_schema()); |
1406 | 1.58k | metadata.set_version(0); |
1407 | | |
1408 | 1.58k | auto table_ids_map_checkout = table_ids_map_.CheckOut(); |
1409 | 1.58k | sys_catalog_table_iter = table_ids_map_checkout->emplace(table->id(), table).first; |
1410 | 1.58k | table_names_map_[{kSystemSchemaNamespaceId, kSysCatalogTableName}] = table; |
1411 | 1.58k | table->set_is_system(); |
1412 | | |
1413 | 1.58k | RETURN_NOT_OK(sys_catalog_->Upsert(term, table)); |
1414 | 1.58k | table->mutable_metadata()->CommitMutation(); |
1415 | 1.58k | } |
1416 | | |
1417 | | // Prepare sys catalog tablet info. |
1418 | 2.00k | if (tablet_map_->count(kSysCatalogTabletId) == 0) { |
1419 | 1.58k | scoped_refptr<TableInfo> table = sys_catalog_table_iter->second; |
1420 | 1.58k | scoped_refptr<TabletInfo> tablet(new TabletInfo(table, kSysCatalogTabletId)); |
1421 | 1.58k | tablet->mutable_metadata()->StartMutation(); |
1422 | 1.58k | SysTabletsEntryPB& metadata = tablet->mutable_metadata()->mutable_dirty()->pb; |
1423 | 1.58k | metadata.set_state(SysTabletsEntryPB::RUNNING); |
1424 | | |
1425 | 1.58k | auto l = table->LockForRead(); |
1426 | 1.58k | PartitionSchema partition_schema; |
1427 | 1.58k | RETURN_NOT_OK(PartitionSchema::FromPB(l->pb.partition_schema(), |
1428 | 1.58k | *sys_catalog_->schema_, |
1429 | 1.58k | &partition_schema)); |
1430 | 1.58k | vector<Partition> partitions; |
1431 | 1.58k | RETURN_NOT_OK(partition_schema.CreatePartitions(1, &partitions)); |
1432 | 1.58k | partitions[0].ToPB(metadata.mutable_partition()); |
1433 | 1.58k | metadata.set_table_id(table->id()); |
1434 | 1.58k | metadata.add_table_ids(table->id()); |
1435 | | |
1436 | 1.58k | table->set_is_system(); |
1437 | 1.58k | table->AddTablet(tablet.get()); |
1438 | | |
1439 | 1.58k | auto tablet_map_checkout = tablet_map_.CheckOut(); |
1440 | 1.58k | (*tablet_map_checkout)[tablet->tablet_id()] = tablet; |
1441 | | |
1442 | 1.58k | RETURN_NOT_OK(sys_catalog_->Upsert(term, tablet)); |
1443 | 1.58k | tablet->mutable_metadata()->CommitMutation(); |
1444 | 1.58k | } |
1445 | | |
1446 | 2.00k | system_tablets_[kSysCatalogTabletId] = sys_catalog_->tablet_peer_->shared_tablet(); |
1447 | | |
1448 | 2.00k | return Status::OK(); |
1449 | 2.00k | } |
1450 | | |
1451 | | template <class T> |
1452 | | Status CatalogManager::PrepareSystemTableTemplate(const TableName& table_name, |
1453 | | const NamespaceName& namespace_name, |
1454 | | const NamespaceId& namespace_id, |
1455 | 32.1k | int64_t term) { |
1456 | 32.1k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); |
1457 | 32.1k | return PrepareSystemTable( |
1458 | 32.1k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); |
1459 | 32.1k | } _ZN2yb6master14CatalogManager26PrepareSystemTableTemplateINS0_11PeersVTableEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEESD_SD_x Line | Count | Source | 1455 | 2.00k | int64_t term) { | 1456 | 2.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1457 | 2.00k | return PrepareSystemTable( | 1458 | 2.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1459 | 2.00k | } |
_ZN2yb6master14CatalogManager26PrepareSystemTableTemplateINS0_11LocalVTableEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEESD_SD_x Line | Count | Source | 1455 | 2.00k | int64_t term) { | 1456 | 2.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1457 | 2.00k | return PrepareSystemTable( | 1458 | 2.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1459 | 2.00k | } |
_ZN2yb6master14CatalogManager26PrepareSystemTableTemplateINS0_18YQLKeyspacesVTableEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEESD_SD_x Line | Count | Source | 1455 | 2.00k | int64_t term) { | 1456 | 2.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1457 | 2.00k | return PrepareSystemTable( | 1458 | 2.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1459 | 2.00k | } |
_ZN2yb6master14CatalogManager26PrepareSystemTableTemplateINS0_15YQLTablesVTableEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEESD_SD_x Line | Count | Source | 1455 | 2.00k | int64_t term) { | 1456 | 2.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1457 | 2.00k | return PrepareSystemTable( | 1458 | 2.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1459 | 2.00k | } |
_ZN2yb6master14CatalogManager26PrepareSystemTableTemplateINS0_16YQLColumnsVTableEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEESD_SD_x Line | Count | Source | 1455 | 2.00k | int64_t term) { | 1456 | 2.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1457 | 2.00k | return PrepareSystemTable( | 1458 | 2.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1459 | 2.00k | } |
_ZN2yb6master14CatalogManager26PrepareSystemTableTemplateINS0_22YQLSizeEstimatesVTableEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEESD_SD_x Line | Count | Source | 1455 | 2.00k | int64_t term) { | 1456 | 2.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1457 | 2.00k | return PrepareSystemTable( | 1458 | 2.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1459 | 2.00k | } |
_ZN2yb6master14CatalogManager26PrepareSystemTableTemplateINS0_19YQLAggregatesVTableEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEESD_SD_x Line | Count | Source | 1455 | 2.00k | int64_t term) { | 1456 | 2.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1457 | 2.00k | return PrepareSystemTable( | 1458 | 2.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1459 | 2.00k | } |
_ZN2yb6master14CatalogManager26PrepareSystemTableTemplateINS0_18YQLFunctionsVTableEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEESD_SD_x Line | Count | Source | 1455 | 2.00k | int64_t term) { | 1456 | 2.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1457 | 2.00k | return PrepareSystemTable( | 1458 | 2.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1459 | 2.00k | } |
_ZN2yb6master14CatalogManager26PrepareSystemTableTemplateINS0_16YQLIndexesVTableEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEESD_SD_x Line | Count | Source | 1455 | 2.00k | int64_t term) { | 1456 | 2.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1457 | 2.00k | return PrepareSystemTable( | 1458 | 2.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1459 | 2.00k | } |
_ZN2yb6master14CatalogManager26PrepareSystemTableTemplateINS0_17YQLTriggersVTableEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEESD_SD_x Line | Count | Source | 1455 | 2.00k | int64_t term) { | 1456 | 2.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1457 | 2.00k | return PrepareSystemTable( | 1458 | 2.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1459 | 2.00k | } |
_ZN2yb6master14CatalogManager26PrepareSystemTableTemplateINS0_14YQLViewsVTableEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEESD_SD_x Line | Count | Source | 1455 | 2.00k | int64_t term) { | 1456 | 2.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1457 | 2.00k | return PrepareSystemTable( | 1458 | 2.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1459 | 2.00k | } |
_ZN2yb6master14CatalogManager26PrepareSystemTableTemplateINS0_13QLTypesVTableEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEESD_SD_x Line | Count | Source | 1455 | 2.00k | int64_t term) { | 1456 | 2.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1457 | 2.00k | return PrepareSystemTable( | 1458 | 2.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1459 | 2.00k | } |
_ZN2yb6master14CatalogManager26PrepareSystemTableTemplateINS0_19YQLPartitionsVTableEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEESD_SD_x Line | Count | Source | 1455 | 2.00k | int64_t term) { | 1456 | 2.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1457 | 2.00k | return PrepareSystemTable( | 1458 | 2.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1459 | 2.00k | } |
_ZN2yb6master14CatalogManager26PrepareSystemTableTemplateINS0_18YQLAuthRolesVTableEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEESD_SD_x Line | Count | Source | 1455 | 2.00k | int64_t term) { | 1456 | 2.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1457 | 2.00k | return PrepareSystemTable( | 1458 | 2.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1459 | 2.00k | } |
_ZN2yb6master14CatalogManager26PrepareSystemTableTemplateINS0_28YQLAuthRolePermissionsVTableEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEESD_SD_x Line | Count | Source | 1455 | 2.00k | int64_t term) { | 1456 | 2.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1457 | 2.00k | return PrepareSystemTable( | 1458 | 2.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1459 | 2.00k | } |
_ZN2yb6master14CatalogManager26PrepareSystemTableTemplateINS0_41YQLAuthResourceRolePermissionsIndexVTableEEENS_6StatusERKNSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEESD_SD_x Line | Count | Source | 1455 | 2.00k | int64_t term) { | 1456 | 2.00k | YQLVirtualTable* vtable = new T(table_name, namespace_name, master_); | 1457 | 2.00k | return PrepareSystemTable( | 1458 | 2.00k | table_name, namespace_name, namespace_id, vtable->schema(), term, vtable); | 1459 | 2.00k | } |
|
1460 | | |
1461 | | Status CatalogManager::PrepareSystemTable(const TableName& table_name, |
1462 | | const NamespaceName& namespace_name, |
1463 | | const NamespaceId& namespace_id, |
1464 | | const Schema& schema, |
1465 | | int64_t term, |
1466 | 32.1k | YQLVirtualTable* vtable) { |
1467 | 32.1k | std::unique_ptr<YQLVirtualTable> yql_storage(vtable); |
1468 | | |
1469 | 32.1k | scoped_refptr<TableInfo> table = FindPtrOrNull(table_names_map_, |
1470 | 32.1k | std::make_pair(namespace_id, table_name)); |
1471 | 32.1k | bool create_table = true; |
1472 | 32.1k | if (table != nullptr) { |
1473 | 6.73k | LOG_WITH_PREFIX(INFO) << "Table " << namespace_name << "." << table_name << " already created"; |
1474 | | |
1475 | | // Mark the table as a system table. |
1476 | 6.73k | table->set_is_system(); |
1477 | | |
1478 | 6.73k | Schema persisted_schema; |
1479 | 6.73k | RETURN_NOT_OK(table->GetSchema(&persisted_schema)); |
1480 | 6.73k | if (!persisted_schema.Equals(schema)) { |
1481 | 0 | LOG_WITH_PREFIX(INFO) |
1482 | 0 | << "Updating schema of " << namespace_name << "." << table_name << " ..."; |
1483 | 0 | auto l = table->LockForWrite(); |
1484 | 0 | SchemaToPB(schema, l.mutable_data()->pb.mutable_schema()); |
1485 | 0 | l.mutable_data()->pb.set_version(l->pb.version() + 1); |
1486 | 0 | l.mutable_data()->pb.set_updates_only_index_permissions(false); |
1487 | | |
1488 | | // Update sys-catalog with the new table schema. |
1489 | 0 | RETURN_NOT_OK(sys_catalog_->Upsert(term, table)); |
1490 | 0 | l.Commit(); |
1491 | 0 | } |
1492 | | |
1493 | | // There might have been a failure after writing the table but before writing the tablets. As |
1494 | | // a result, if we don't find any tablets, we try to create the tablets only again. |
1495 | 6.73k | auto tablets = table->GetTablets(); |
1496 | 6.73k | if (!tablets.empty()) { |
1497 | | // Initialize the appropriate system tablet. |
1498 | 6.73k | DCHECK_EQ(1, tablets.size()); |
1499 | 6.73k | auto tablet = tablets[0]; |
1500 | 6.73k | system_tablets_[tablet->tablet_id()] = |
1501 | 6.73k | std::make_shared<SystemTablet>(schema, std::move(yql_storage), tablet->tablet_id()); |
1502 | 6.73k | return Status::OK(); |
1503 | 1 | } else { |
1504 | | // Table is already created, only need to create tablets now. |
1505 | 1 | LOG_WITH_PREFIX(INFO) |
1506 | 1 | << "Creating tablets for " << namespace_name << "." << table_name << " ..."; |
1507 | 1 | create_table = false; |
1508 | 1 | } |
1509 | 6.73k | } |
1510 | | |
1511 | | // Create partitions. |
1512 | 25.3k | vector<Partition> partitions; |
1513 | 25.3k | PartitionSchemaPB partition_schema_pb; |
1514 | 25.3k | partition_schema_pb.set_hash_schema(PartitionSchemaPB::MULTI_COLUMN_HASH_SCHEMA); |
1515 | 25.3k | PartitionSchema partition_schema; |
1516 | 25.3k | RETURN_NOT_OK(PartitionSchema::FromPB(partition_schema_pb, schema, &partition_schema)); |
1517 | 25.3k | RETURN_NOT_OK(partition_schema.CreatePartitions(1, &partitions)); |
1518 | | |
1519 | 25.3k | TabletInfos tablets; |
1520 | | |
1521 | 25.3k | if (create_table) { |
1522 | | // Fill in details for the system table. |
1523 | 25.3k | CreateTableRequestPB req; |
1524 | 25.3k | req.set_name(table_name); |
1525 | 25.3k | req.set_table_type(TableType::YQL_TABLE_TYPE); |
1526 | | |
1527 | 25.3k | RETURN_NOT_OK(CreateTableInMemory( |
1528 | 25.3k | req, schema, partition_schema, namespace_id, namespace_name, |
1529 | 25.3k | partitions, nullptr, &tablets, nullptr, &table)); |
1530 | | // Mark the table as a system table. |
1531 | 25.3k | LOG_WITH_PREFIX(INFO) << "Inserted new " << namespace_name << "." << table_name |
1532 | 25.3k | << " table info into CatalogManager maps"; |
1533 | | // Update the on-disk table state to "running". |
1534 | 25.3k | table->mutable_metadata()->mutable_dirty()->pb.set_state(SysTablesEntryPB::RUNNING); |
1535 | 25.3k | RETURN_NOT_OK(sys_catalog_->Upsert(term, table)); |
1536 | 25.3k | LOG_WITH_PREFIX(INFO) << "Wrote table to system catalog: " << ToString(table) << ", tablets: " |
1537 | 25.3k | << ToString(tablets); |
1538 | 1 | } else { |
1539 | | // Still need to create the tablets. |
1540 | 1 | tablets = VERIFY_RESULT(CreateTabletsFromTable(partitions, table)); |
1541 | 1 | } |
1542 | | |
1543 | 25.3k | DCHECK_EQ(1, tablets.size()); |
1544 | | // We use LOG_ASSERT here since this is expected to crash in some unit tests. |
1545 | 25.3k | LOG_ASSERT(!FLAGS_TEST_catalog_manager_simulate_system_table_create_failure); |
1546 | | |
1547 | | // Write Tablets to sys-tablets (in "running" state since we don't want the loadbalancer to |
1548 | | // assign these tablets since this table is virtual). |
1549 | 25.3k | for (const auto& tablet : tablets) { |
1550 | 25.3k | tablet->mutable_metadata()->mutable_dirty()->pb.set_state(SysTabletsEntryPB::RUNNING); |
1551 | 25.3k | } |
1552 | 25.3k | RETURN_NOT_OK(sys_catalog_->Upsert(term, tablets)); |
1553 | 25.3k | LOG_WITH_PREFIX(INFO) << "Wrote tablets to system catalog: " << ToString(tablets); |
1554 | | |
1555 | | // Commit the in-memory state. |
1556 | 25.3k | if (create_table) { |
1557 | 25.3k | table->mutable_metadata()->CommitMutation(); |
1558 | 25.3k | } |
1559 | | |
1560 | 25.3k | for (const auto& tablet : tablets) { |
1561 | 25.3k | tablet->mutable_metadata()->CommitMutation(); |
1562 | 25.3k | } |
1563 | | // Mark the table as a system table. |
1564 | 25.3k | table->set_is_system(); |
1565 | | |
1566 | | // Finally create the appropriate tablet object. |
1567 | 25.3k | auto tablet = tablets[0]; |
1568 | 25.3k | system_tablets_[tablet->tablet_id()] = |
1569 | 25.3k | std::make_shared<SystemTablet>(schema, std::move(yql_storage), tablet->tablet_id()); |
1570 | 25.3k | return Status::OK(); |
1571 | 25.3k | } |
1572 | | |
1573 | 56.0k | bool IsYcqlNamespace(const NamespaceInfo& ns) { |
1574 | 56.0k | return ns.database_type() == YQLDatabase::YQL_DATABASE_CQL; |
1575 | 56.0k | } |
1576 | | |
1577 | 935k | bool IsYcqlTable(const TableInfo& table) { |
1578 | 935k | return table.GetTableType() == TableType::YQL_TABLE_TYPE && table.id() != kSysCatalogTableId; |
1579 | 935k | } |
1580 | | |
1581 | | Status CatalogManager::PrepareNamespace( |
1582 | 6.02k | YQLDatabase db_type, const NamespaceName& name, const NamespaceId& id, int64_t term) { |
1583 | | |
1584 | 6.02k | scoped_refptr<NamespaceInfo> ns = FindPtrOrNull(namespace_ids_map_, id); |
1585 | 6.02k | if (ns != nullptr) { |
1586 | 1.26k | LOG_WITH_PREFIX(INFO) |
1587 | 1.26k | << "Keyspace " << ns->ToString() << " already created, skipping initialization"; |
1588 | 1.26k | return Status::OK(); |
1589 | 1.26k | } |
1590 | | |
1591 | | // Create entry. |
1592 | 4.76k | SysNamespaceEntryPB ns_entry; |
1593 | 4.76k | ns_entry.set_name(name); |
1594 | 4.76k | ns_entry.set_database_type(db_type); |
1595 | 4.76k | ns_entry.set_state(SysNamespaceEntryPB::RUNNING); |
1596 | | |
1597 | | // Create in memory object. |
1598 | 4.76k | ns = new NamespaceInfo(id); |
1599 | | |
1600 | | // Prepare write. |
1601 | 4.76k | auto l = ns->LockForWrite(); |
1602 | 4.76k | l.mutable_data()->pb = std::move(ns_entry); |
1603 | | |
1604 | 4.76k | namespace_ids_map_[id] = ns; |
1605 | 4.76k | namespace_names_mapper_[db_type][l.mutable_data()->pb.name()] = ns; |
1606 | | |
1607 | | // Write to sys_catalog and in memory. |
1608 | 4.76k | RETURN_NOT_OK(sys_catalog_->Upsert(term, ns)); |
1609 | 4.76k | l.Commit(); |
1610 | | |
1611 | 4.76k | LOG_WITH_PREFIX(INFO) << "Created default keyspace: " << ns->ToString(); |
1612 | 4.76k | return Status::OK(); |
1613 | 4.76k | } |
1614 | | |
1615 | 5.35k | Status CatalogManager::CheckLocalHostInMasterAddresses() { |
1616 | 5.35k | auto local_hostport = master_->first_rpc_address(); |
1617 | 5.35k | std::vector<IpAddress> local_addrs; |
1618 | | |
1619 | 5.35k | if (local_hostport.address().is_unspecified()) { |
1620 | 0 | auto status = GetLocalAddresses(&local_addrs, AddressFilter::ANY); |
1621 | 0 | if (!status.ok() || local_addrs.empty()) { |
1622 | 0 | LOG(WARNING) << "Could not enumerate network interfaces due to " << status << ", found " |
1623 | 0 | << local_addrs.size() << " local addresses."; |
1624 | 0 | return Status::OK(); |
1625 | 0 | } |
1626 | 5.35k | } else { |
1627 | 5.35k | for (auto const &addr : master_->rpc_addresses()) { |
1628 | 5.35k | local_addrs.push_back(addr.address()); |
1629 | 5.35k | } |
1630 | 5.35k | } |
1631 | | |
1632 | 5.35k | auto resolved_addresses = VERIFY_RESULT(server::ResolveMasterAddresses( |
1633 | 5.35k | *master_->opts().GetMasterAddresses())); |
1634 | | |
1635 | 9.88k | for (auto const &addr : resolved_addresses) { |
1636 | 9.88k | if (addr.address().is_unspecified() || |
1637 | 9.88k | std::find(local_addrs.begin(), local_addrs.end(), addr.address()) != |
1638 | 5.35k | local_addrs.end()) { |
1639 | 5.35k | return Status::OK(); |
1640 | 5.35k | } |
1641 | 9.88k | } |
1642 | 0 | return STATUS_SUBSTITUTE(IllegalState, |
1643 | 5.35k | "None of the local addresses are present in master_addresses $0.", |
1644 | 5.35k | master_->opts().master_addresses_flag); |
1645 | 5.35k | } |
1646 | | |
1647 | 5.42k | Status CatalogManager::InitSysCatalogAsync() { |
1648 | 5.42k | LockGuard lock(mutex_); |
1649 | | |
1650 | | // Optimistically try to load data from disk. |
1651 | 5.42k | Status s = sys_catalog_->Load(master_->fs_manager()); |
1652 | | |
1653 | 5.42k | if (!s.ok() && s.IsNotFound()) { |
1654 | | // We have yet to intialize the syscatalog metadata, need to create the metadata file. |
1655 | 5.40k | LOG(INFO) << "Did not find previous SysCatalogTable data on disk. " << s; |
1656 | | |
1657 | 5.40k | if (!master_->opts().AreMasterAddressesProvided()) { |
1658 | 41 | master_->SetShellMode(true); |
1659 | 41 | LOG(INFO) << "Starting master in shell mode."; |
1660 | 41 | return Status::OK(); |
1661 | 41 | } |
1662 | | |
1663 | 5.35k | RETURN_NOT_OK(CheckLocalHostInMasterAddresses()); |
1664 | 5.35k | RETURN_NOT_OK_PREPEND(sys_catalog_->CreateNew(master_->fs_manager()), |
1665 | 5.35k | Substitute("Encountered errors during system catalog initialization:" |
1666 | 5.34k | "\n\tError on Load: $0\n\tError on CreateNew: ", s.ToString())); |
1667 | | |
1668 | 5.34k | return Status::OK(); |
1669 | 23 | } |
1670 | | |
1671 | 23 | return s; |
1672 | 23 | } |
1673 | | |
1674 | 5.06M | bool CatalogManager::IsInitialized() const { |
1675 | 5.06M | std::lock_guard<simple_spinlock> l(state_lock_); |
1676 | 5.06M | return state_ == kRunning; |
1677 | 5.06M | } |
1678 | | |
1679 | | // TODO - delete this API after HandleReportedTablet() usage is removed. |
1680 | 260k | Status CatalogManager::CheckIsLeaderAndReady() const { |
1681 | 260k | std::lock_guard<simple_spinlock> l(state_lock_); |
1682 | 260k | if (PREDICT_FALSE(state_ != kRunning)) { |
1683 | 27 | return STATUS_SUBSTITUTE(ServiceUnavailable, |
1684 | 27 | "Catalog manager is shutting down. State: $0", state_); |
1685 | 27 | } |
1686 | 260k | string uuid = master_->fs_manager()->uuid(); |
1687 | 260k | if (master_->opts().IsShellMode()) { |
1688 | | // Consensus and other internal fields should not be checked when is shell mode. |
1689 | 0 | return STATUS_SUBSTITUTE(IllegalState, |
1690 | 0 | "Catalog manager of $0 is in shell mode, not the leader", uuid); |
1691 | 0 | } |
1692 | 260k | Consensus* consensus = tablet_peer()->consensus(); |
1693 | 260k | if (consensus == nullptr) { |
1694 | 0 | return STATUS(IllegalState, "Consensus has not been initialized yet"); |
1695 | 0 | } |
1696 | 260k | ConsensusStatePB cstate = consensus->ConsensusState(CONSENSUS_CONFIG_COMMITTED); |
1697 | 260k | if (PREDICT_FALSE(!cstate.has_leader_uuid() || cstate.leader_uuid() != uuid)) { |
1698 | 0 | return STATUS_SUBSTITUTE(IllegalState, |
1699 | 0 | "Not the leader. Local UUID: $0, Consensus state: $1", uuid, cstate.ShortDebugString()); |
1700 | 0 | } |
1701 | 260k | if (PREDICT_FALSE(leader_ready_term_ != cstate.current_term())) { |
1702 | 0 | return STATUS_SUBSTITUTE(ServiceUnavailable, |
1703 | 0 | "Leader not yet ready to serve requests: ready term $0 vs cstate term $1", |
1704 | 0 | leader_ready_term_, cstate.current_term()); |
1705 | 0 | } |
1706 | 260k | return Status::OK(); |
1707 | 260k | } |
1708 | | |
1709 | 6.31M | std::shared_ptr<tablet::TabletPeer> CatalogManager::tablet_peer() const { |
1710 | 6.31M | return sys_catalog_->tablet_peer(); |
1711 | 6.31M | } |
1712 | | |
1713 | 3.90M | PeerRole CatalogManager::Role() const { |
1714 | 3.90M | if (!IsInitialized() || master_->opts().IsShellMode()) { |
1715 | 196 | return PeerRole::NON_PARTICIPANT; |
1716 | 196 | } |
1717 | | |
1718 | 3.90M | return tablet_peer()->consensus()->role(); |
1719 | 3.90M | } |
1720 | | |
1721 | 271 | bool CatalogManager::StartShutdown() { |
1722 | 271 | { |
1723 | 271 | std::lock_guard<simple_spinlock> l(state_lock_); |
1724 | 271 | if (state_ == kClosing) { |
1725 | 0 | VLOG(2) << "CatalogManager already shut down"; |
1726 | 177 | return false; |
1727 | 177 | } |
1728 | 94 | state_ = kClosing; |
1729 | 94 | } |
1730 | | |
1731 | 94 | refresh_yql_partitions_task_.StartShutdown(); |
1732 | | |
1733 | 94 | refresh_ysql_tablespace_info_task_.StartShutdown(); |
1734 | | |
1735 | 94 | if (sys_catalog_) { |
1736 | 94 | sys_catalog_->StartShutdown(); |
1737 | 94 | } |
1738 | | |
1739 | 94 | return true; |
1740 | 94 | } |
1741 | | |
1742 | 92 | void CatalogManager::CompleteShutdown() { |
1743 | | // Shutdown the Catalog Manager background thread (load balancing). |
1744 | 92 | refresh_yql_partitions_task_.CompleteShutdown(); |
1745 | 92 | refresh_ysql_tablespace_info_task_.CompleteShutdown(); |
1746 | | |
1747 | 92 | if (background_tasks_) { |
1748 | 75 | background_tasks_->Shutdown(); |
1749 | 75 | } |
1750 | 92 | if (background_tasks_thread_pool_) { |
1751 | 92 | background_tasks_thread_pool_->Shutdown(); |
1752 | 92 | } |
1753 | 92 | if (leader_initialization_pool_) { |
1754 | 92 | leader_initialization_pool_->Shutdown(); |
1755 | 92 | } |
1756 | 92 | if (async_task_pool_) { |
1757 | 92 | async_task_pool_->Shutdown(); |
1758 | 92 | } |
1759 | | |
1760 | | // Mark all outstanding table tasks as aborted and wait for them to fail. |
1761 | | // |
1762 | | // There may be an outstanding table visitor thread modifying the table map, |
1763 | | // so we must make a copy of it before we iterate. It's OK if the visitor |
1764 | | // adds more entries to the map even after we finish; it won't start any new |
1765 | | // tasks for those entries. |
1766 | 92 | vector<scoped_refptr<TableInfo>> copy; |
1767 | 92 | { |
1768 | 92 | SharedLock lock(mutex_); |
1769 | 92 | AppendValuesFromMap(*table_ids_map_, ©); |
1770 | 92 | } |
1771 | 92 | AbortAndWaitForAllTasks(copy); |
1772 | | |
1773 | | // Shut down the underlying storage for tables and tablets. |
1774 | 92 | if (sys_catalog_) { |
1775 | 92 | sys_catalog_->CompleteShutdown(); |
1776 | 92 | } |
1777 | | |
1778 | | // Reset the jobs/tasks tracker. |
1779 | 92 | tasks_tracker_->Reset(); |
1780 | 92 | jobs_tracker_->Reset(); |
1781 | | |
1782 | 92 | if (initdb_future_ && initdb_future_->wait_for(0s) != std::future_status::ready) { |
1783 | 0 | LOG(WARNING) << "initdb is still running, waiting for it to complete."; |
1784 | 0 | initdb_future_->wait(); |
1785 | 0 | LOG(INFO) << "Finished running initdb, proceeding with catalog manager shutdown."; |
1786 | 0 | } |
1787 | 92 | } |
1788 | | |
1789 | | Status CatalogManager::AbortTableCreation(TableInfo* table, |
1790 | | const TabletInfos& tablets, |
1791 | | const Status& s, |
1792 | 4 | CreateTableResponsePB* resp) { |
1793 | 4 | LOG(WARNING) << s; |
1794 | | |
1795 | 4 | const TableId table_id = table->id(); |
1796 | 4 | const TableName table_name = table->mutable_metadata()->mutable_dirty()->pb.name(); |
1797 | 4 | const NamespaceId table_namespace_id = |
1798 | 4 | table->mutable_metadata()->mutable_dirty()->pb.namespace_id(); |
1799 | 4 | vector<string> tablet_ids_to_erase; |
1800 | 7 | for (const auto& tablet : tablets) { |
1801 | 7 | tablet_ids_to_erase.push_back(tablet->tablet_id()); |
1802 | 7 | } |
1803 | | |
1804 | 4 | LOG(INFO) << "Aborting creation of table '" << table_name << "', erasing table and tablets (" << |
1805 | 4 | JoinStrings(tablet_ids_to_erase, ",") << ") from in-memory state."; |
1806 | | |
1807 | | // Since this is a failed creation attempt, it's safe to just abort |
1808 | | // all tasks, as (by definition) no tasks may be pending against a |
1809 | | // table that has failed to successfully create. |
1810 | 4 | table->AbortTasksAndClose(); |
1811 | 4 | table->WaitTasksCompletion(); |
1812 | | |
1813 | 4 | LockGuard lock(mutex_); |
1814 | | |
1815 | | // Call AbortMutation() manually, as otherwise the lock won't be released. |
1816 | 7 | for (const auto& tablet : tablets) { |
1817 | 7 | tablet->mutable_metadata()->AbortMutation(); |
1818 | 7 | } |
1819 | 4 | table->mutable_metadata()->AbortMutation(); |
1820 | 4 | auto tablet_map_checkout = tablet_map_.CheckOut(); |
1821 | 7 | for (const TabletId& tablet_id_to_erase : tablet_ids_to_erase) { |
1822 | 0 | CHECK_EQ(tablet_map_checkout->erase(tablet_id_to_erase), 1) |
1823 | 0 | << "Unable to erase tablet " << tablet_id_to_erase << " from tablet map."; |
1824 | 7 | } |
1825 | | |
1826 | 4 | auto table_ids_map_checkout = table_ids_map_.CheckOut(); |
1827 | 4 | table_names_map_.erase({table_namespace_id, table_name}); // Not present if PGSQL table. |
1828 | 0 | CHECK_EQ(table_ids_map_checkout->erase(table_id), 1) |
1829 | 0 | << "Unable to erase table with id " << table_id << " from table ids map."; |
1830 | | |
1831 | 4 | if (IsYcqlTable(*table)) { |
1832 | 4 | GetYqlPartitionsVtable().RemoveFromCache(table->id()); |
1833 | 4 | } |
1834 | 4 | return CheckIfNoLongerLeaderAndSetupError(s, resp); |
1835 | 4 | } |
1836 | | |
1837 | | Result<ReplicationInfoPB> CatalogManager::GetTableReplicationInfo( |
1838 | | const ReplicationInfoPB& table_replication_info, |
1839 | 32.3k | const TablespaceId& tablespace_id) { |
1840 | | |
1841 | 32.3k | if (IsReplicationInfoSet(table_replication_info)) { |
1842 | | // The table has custom replication info set for it, return it if valid. |
1843 | 5 | RETURN_NOT_OK(ValidateTableReplicationInfo(table_replication_info)); |
1844 | 5 | return table_replication_info; |
1845 | 32.3k | } |
1846 | | // Table level replication info not set. Check whether the table is |
1847 | | // associated with a tablespace and if so, return the tablespace |
1848 | | // replication info. |
1849 | 32.3k | if (GetAtomicFlag(&FLAGS_enable_ysql_tablespaces_for_placement)) { |
1850 | 32.3k | boost::optional<ReplicationInfoPB> tablespace_pb = |
1851 | 32.3k | VERIFY_RESULT(GetTablespaceReplicationInfoWithRetry(tablespace_id)); |
1852 | 32.3k | if (tablespace_pb) { |
1853 | | // Return the tablespace placement. |
1854 | 0 | return tablespace_pb.value(); |
1855 | 0 | } |
1856 | 32.3k | } |
1857 | | |
1858 | | // Neither table nor tablespace info set. Return cluster level replication info. |
1859 | 32.3k | auto l = cluster_config_->LockForRead(); |
1860 | 32.3k | return l->pb.replication_info(); |
1861 | 32.3k | } |
1862 | | |
1863 | 100k | std::shared_ptr<YsqlTablespaceManager> CatalogManager::GetTablespaceManager() const { |
1864 | 100k | SharedLock lock(tablespace_mutex_); |
1865 | 100k | return tablespace_manager_; |
1866 | 100k | } |
1867 | | |
1868 | | Result<boost::optional<TablespaceId>> CatalogManager::GetTablespaceForTable( |
1869 | 1 | const scoped_refptr<TableInfo>& table) { |
1870 | | |
1871 | 1 | auto tablespace_manager = GetTablespaceManager(); |
1872 | 1 | return tablespace_manager->GetTablespaceForTable(table); |
1873 | 1 | } |
1874 | | |
1875 | | Result<boost::optional<ReplicationInfoPB>> CatalogManager::GetTablespaceReplicationInfoWithRetry( |
1876 | 32.3k | const TablespaceId& tablespace_id) { |
1877 | | |
1878 | 32.3k | auto tablespace_manager = GetTablespaceManager(); |
1879 | 32.3k | auto replication_info_result = tablespace_manager->GetTablespaceReplicationInfo(tablespace_id); |
1880 | | |
1881 | 32.3k | if (replication_info_result) { |
1882 | 32.3k | return replication_info_result; |
1883 | 32.3k | } |
1884 | | |
1885 | | // We failed to find the tablespace placement policy. Refresh the tablespace info and try again. |
1886 | 0 | auto tablespace_map = VERIFY_RESULT(GetYsqlTablespaceInfo()); |
1887 | | |
1888 | | // We clone the tablespace_manager and update the clone with the new tablespace_map that we |
1889 | | // fetched above. We do this instead of updating the tablespace_manager object in-place because |
1890 | | // other clients may have a shared_ptr to it through 'GetTablespaceManager()'. |
1891 | 0 | tablespace_manager = tablespace_manager->CreateCloneWithTablespaceMap(tablespace_map); |
1892 | 0 | { |
1893 | 0 | LockGuard lock(tablespace_mutex_); |
1894 | 0 | tablespace_manager_ = tablespace_manager; |
1895 | 0 | } |
1896 | |
|
1897 | 0 | return tablespace_manager->GetTablespaceReplicationInfo(tablespace_id); |
1898 | 0 | } |
1899 | | |
1900 | 153k | bool CatalogManager::IsReplicationInfoSet(const ReplicationInfoPB& replication_info) { |
1901 | 153k | const auto& live_placement_info = replication_info.live_replicas(); |
1902 | 153k | if (!(live_placement_info.placement_blocks().empty() && |
1903 | 153k | live_placement_info.num_replicas() <= 0 && |
1904 | 153k | live_placement_info.placement_uuid().empty()) || |
1905 | 153k | !replication_info.read_replicas().empty() || |
1906 | 153k | !replication_info.affinitized_leaders().empty()) { |
1907 | | |
1908 | 17 | return true; |
1909 | 17 | } |
1910 | 153k | return false; |
1911 | 153k | } |
1912 | | |
1913 | 9 | Status CatalogManager::ValidateTableReplicationInfo(const ReplicationInfoPB& replication_info) { |
1914 | 9 | if (!IsReplicationInfoSet(replication_info)) { |
1915 | 0 | return STATUS(InvalidArgument, "No replication info set."); |
1916 | 0 | } |
1917 | | // We don't support setting any other fields other than live replica placements for now. |
1918 | 9 | if (!replication_info.read_replicas().empty() || |
1919 | 9 | !replication_info.affinitized_leaders().empty()) { |
1920 | |
|
1921 | 0 | return STATUS(InvalidArgument, "Only live placement info can be set for table " |
1922 | 0 | "level replication info."); |
1923 | 0 | } |
1924 | | // Today we support setting table level replication info only in clusters where read replica |
1925 | | // placements is not set. Return error if the cluster has read replica placements set. |
1926 | 9 | auto l = cluster_config_->LockForRead(); |
1927 | 9 | const ReplicationInfoPB& cluster_replication_info = l->pb.replication_info(); |
1928 | | // TODO(bogdan): figure this out when we expand on geopartition support. |
1929 | | // if (!cluster_replication_info.read_replicas().empty() || |
1930 | | // !cluster_replication_info.affinitized_leaders().empty()) { |
1931 | | |
1932 | | // return STATUS(InvalidArgument, "Setting table level replication info is not supported " |
1933 | | // "for clusters with read replica placements"); |
1934 | | // } |
1935 | | // If the replication info has placement_uuid set, verify that it matches the cluster |
1936 | | // placement_uuid. |
1937 | 9 | if (replication_info.live_replicas().placement_uuid().empty()) { |
1938 | 7 | return Status::OK(); |
1939 | 7 | } |
1940 | 2 | if (replication_info.live_replicas().placement_uuid() != |
1941 | 0 | cluster_replication_info.live_replicas().placement_uuid()) { |
1942 | |
|
1943 | 0 | return STATUS(InvalidArgument, "Placement uuid for table level replication info " |
1944 | 0 | "must match that of the cluster's live placement info."); |
1945 | 0 | } |
1946 | 2 | return Status::OK(); |
1947 | 2 | } |
1948 | | |
1949 | 533 | Result<shared_ptr<TablespaceIdToReplicationInfoMap>> CatalogManager::GetYsqlTablespaceInfo() { |
1950 | 533 | auto table_info = GetTableInfo(kPgTablespaceTableId); |
1951 | 533 | if (table_info == nullptr) { |
1952 | 50 | return STATUS(InternalError, "pg_tablespace table info not found"); |
1953 | 50 | } |
1954 | | |
1955 | 483 | auto tablespace_map = VERIFY_RESULT(sys_catalog_->ReadPgTablespaceInfo()); |
1956 | | |
1957 | | // The tablespace options do not usually contain the placement uuid. |
1958 | | // Populate the current cluster placement uuid into the placement information for |
1959 | | // each tablespace. |
1960 | 483 | string placement_uuid; |
1961 | 483 | { |
1962 | 483 | auto l = cluster_config_->LockForRead(); |
1963 | | // TODO(deepthi.srinivasan): Read-replica placements are not supported as |
1964 | | // of now. |
1965 | 483 | placement_uuid = l->pb.replication_info().live_replicas().placement_uuid(); |
1966 | 483 | } |
1967 | 483 | if (!placement_uuid.empty()) { |
1968 | 2 | for (auto& iter : *tablespace_map) { |
1969 | 2 | if (iter.second) { |
1970 | 0 | iter.second.value().mutable_live_replicas()->set_placement_uuid(placement_uuid); |
1971 | 0 | } |
1972 | 2 | } |
1973 | 1 | } |
1974 | | |
1975 | | // Before updating the tablespace placement map, validate the |
1976 | | // placement policies. |
1977 | 966 | for (auto& iter : *tablespace_map) { |
1978 | 966 | if (iter.second) { |
1979 | 0 | RETURN_NOT_OK(ValidateTableReplicationInfo(iter.second.value())); |
1980 | 0 | } |
1981 | 966 | } |
1982 | | |
1983 | 483 | return tablespace_map; |
1984 | 483 | } |
1985 | | |
1986 | | boost::optional<TablespaceId> CatalogManager::GetTransactionStatusTableTablespace( |
1987 | 2.13k | const scoped_refptr<TableInfo>& table) { |
1988 | 2.13k | auto lock = table->LockForRead(); |
1989 | 2.13k | if (lock->pb.table_type() != TRANSACTION_STATUS_TABLE_TYPE) { |
1990 | 0 | return boost::none; |
1991 | 0 | } |
1992 | | |
1993 | 2.13k | if (!lock->pb.has_transaction_table_tablespace_id()) { |
1994 | 2.13k | return boost::none; |
1995 | 2.13k | } |
1996 | | |
1997 | 0 | return lock->pb.transaction_table_tablespace_id(); |
1998 | 0 | } |
1999 | | |
2000 | 0 | void CatalogManager::ClearTransactionStatusTableTablespace(const scoped_refptr<TableInfo>& table) { |
2001 | 0 | auto lock = table->LockForWrite(); |
2002 | 0 | if (lock->pb.table_type() != TRANSACTION_STATUS_TABLE_TYPE) { |
2003 | 0 | return; |
2004 | 0 | } |
2005 | | |
2006 | 0 | lock.mutable_data()->pb.clear_transaction_table_tablespace_id(); |
2007 | 0 | lock.mutable_data()->pb.set_version(lock.mutable_data()->pb.version() + 1); |
2008 | 0 | lock.Commit(); |
2009 | 0 | } |
2010 | | |
2011 | | bool CatalogManager::CheckTransactionStatusTablesWithMissingTablespaces( |
2012 | 483 | const TablespaceIdToReplicationInfoMap& tablespace_info) { |
2013 | 483 | SharedLock lock(mutex_); |
2014 | 122 | for (const auto& table_id : transaction_table_ids_set_) { |
2015 | 122 | auto table = table_ids_map_->find(table_id); |
2016 | 122 | if (table == table_ids_map_->end()) { |
2017 | 0 | LOG(DFATAL) << "Table uuid " << table_id |
2018 | 0 | << " in transaction_table_ids_set_ but not in table_ids_map_"; |
2019 | 0 | continue; |
2020 | 0 | } |
2021 | 122 | auto tablespace_id = GetTransactionStatusTableTablespace(table->second); |
2022 | 122 | if (tablespace_id) { |
2023 | 0 | if (!tablespace_info.count(*tablespace_id)) { |
2024 | 0 | return true; |
2025 | 0 | } |
2026 | 0 | } |
2027 | 122 | } |
2028 | 483 | return false; |
2029 | 483 | } |
2030 | | |
2031 | | Status CatalogManager::UpdateTransactionStatusTableTablespaces( |
2032 | 483 | const TablespaceIdToReplicationInfoMap& tablespace_info) { |
2033 | 483 | if (CheckTransactionStatusTablesWithMissingTablespaces(tablespace_info)) { |
2034 | 0 | { |
2035 | 0 | LockGuard lock(mutex_); |
2036 | 0 | for (const auto& table_id : transaction_table_ids_set_) { |
2037 | 0 | auto table = table_ids_map_->find(table_id); |
2038 | 0 | if (table == table_ids_map_->end()) { |
2039 | 0 | LOG(DFATAL) << "Table uuid " << table_id |
2040 | 0 | << " in transaction_table_ids_set_ but not in table_ids_map_"; |
2041 | 0 | continue; |
2042 | 0 | } |
2043 | 0 | auto tablespace_id = GetTransactionStatusTableTablespace(table->second); |
2044 | 0 | if (tablespace_id) { |
2045 | 0 | if (!tablespace_info.count(*tablespace_id)) { |
2046 | | // TODO: We should also delete the transaction table, see #11123. |
2047 | 0 | LOG(INFO) << "Found transaction status table for tablespace id " << *tablespace_id |
2048 | 0 | << " which doesn't exist, clearing tablespace id"; |
2049 | 0 | ClearTransactionStatusTableTablespace(table->second); |
2050 | 0 | } |
2051 | 0 | } |
2052 | 0 | } |
2053 | 0 | } |
2054 | | |
2055 | | // A tablespace id has been cleared, meaning a transaction table's placement has changed, |
2056 | | // and thus the transaction tables version needs to be incremented. |
2057 | 0 | RETURN_NOT_OK(IncrementTransactionTablesVersion()); |
2058 | 0 | } |
2059 | | |
2060 | 483 | return Status::OK(); |
2061 | 483 | } |
2062 | | |
2063 | | Result<shared_ptr<TableToTablespaceIdMap>> CatalogManager::GetYsqlTableToTablespaceMap( |
2064 | 0 | const TablespaceIdToReplicationInfoMap& tablespace_info) { |
2065 | 0 | auto table_to_tablespace_map = std::make_shared<TableToTablespaceIdMap>(); |
2066 | | |
2067 | | // First fetch all namespaces. This is because the table_to_tablespace information is only |
2068 | | // found in the pg_class catalog table. There exists a separate pg_class table in each |
2069 | | // namespace. To build in-memory state for all tables, process pg_class table for each |
2070 | | // namespace. |
2071 | 0 | vector<NamespaceId> namespace_id_vec; |
2072 | 0 | set<NamespaceId> colocated_namespaces; |
2073 | 0 | { |
2074 | 0 | SharedLock lock(mutex_); |
2075 | 0 | for (const auto& ns : namespace_ids_map_) { |
2076 | 0 | if (ns.second->database_type() != YQL_DATABASE_PGSQL) { |
2077 | 0 | continue; |
2078 | 0 | } |
2079 | | |
2080 | 0 | if (ns.first == kPgSequencesDataNamespaceId) { |
2081 | | // Skip the database created for sequences system table. |
2082 | 0 | continue; |
2083 | 0 | } |
2084 | | |
2085 | 0 | if (ns.second->colocated()) { |
2086 | 0 | colocated_namespaces.insert(ns.first); |
2087 | 0 | } |
2088 | | |
2089 | | // TODO (Deepthi): Investigate if safe to skip template0 and template1 as well. |
2090 | 0 | namespace_id_vec.emplace_back(ns.first); |
2091 | 0 | } |
2092 | | |
2093 | | // Add local transaction tables corresponding to tablespaces. |
2094 | 0 | for (const auto& table_id : transaction_table_ids_set_) { |
2095 | 0 | auto table = table_ids_map_->find(table_id); |
2096 | 0 | if (table == table_ids_map_->end()) { |
2097 | 0 | LOG(DFATAL) << "Table uuid " << table_id |
2098 | 0 | << " in transaction_table_ids_set_ but not in table_ids_map_"; |
2099 | 0 | continue; |
2100 | 0 | } |
2101 | 0 | auto tablespace_id = GetTransactionStatusTableTablespace(table->second); |
2102 | 0 | if (tablespace_id) { |
2103 | 0 | if (tablespace_info.count(*tablespace_id)) { |
2104 | 0 | (*table_to_tablespace_map)[table_id] = *tablespace_id; |
2105 | 0 | } else { |
2106 | | // It's possible that a new tablespace had its transaction table created then deleted |
2107 | | // between when we checked tablespace ids and now; we ignore it here, and it will be |
2108 | | // caught and cleared in the next tablespace update. |
2109 | 0 | LOG(INFO) << "Found transaction status table for tablespace id " << *tablespace_id |
2110 | 0 | << " which doesn't exist, ignoring"; |
2111 | 0 | } |
2112 | 0 | } |
2113 | 0 | } |
2114 | 0 | } |
2115 | | |
2116 | | // For each namespace, fetch the table->tablespace information by reading pg_class |
2117 | | // table for each namespace. |
2118 | 0 | for (const NamespaceId& nsid : namespace_id_vec) { |
2119 | 0 | VLOG(1) << "Refreshing placement information for namespace " << nsid; |
2120 | 0 | const uint32_t database_oid = CHECK_RESULT(GetPgsqlDatabaseOid(nsid)); |
2121 | 0 | const bool is_colocated_database = colocated_namespaces.count(nsid) > 0; |
2122 | 0 | Status table_tablespace_status = sys_catalog_->ReadPgClassInfo(database_oid, |
2123 | 0 | is_colocated_database, |
2124 | 0 | table_to_tablespace_map.get()); |
2125 | 0 | if (!table_tablespace_status.ok()) { |
2126 | 0 | LOG(WARNING) << "Refreshing table->tablespace info failed for namespace " |
2127 | 0 | << nsid << " with error: " << table_tablespace_status.ToString(); |
2128 | 0 | } |
2129 | |
|
2130 | 0 | const bool pg_yb_tablegroup_exists = VERIFY_RESULT(DoesTableExist(FindTableById( |
2131 | 0 | GetPgsqlTableId(database_oid, kPgYbTablegroupTableOid)))); |
2132 | | |
2133 | | // no pg_yb_tablegroup means we only need to check pg_class |
2134 | 0 | if (table_tablespace_status.ok() && !pg_yb_tablegroup_exists) { |
2135 | 0 | VLOG(5) << "Successfully refreshed placement information for namespace " << nsid |
2136 | 0 | << " from pg_class"; |
2137 | 0 | continue; |
2138 | 0 | } |
2139 | | |
2140 | 0 | Status tablegroup_tablespace_status = sys_catalog_->ReadTablespaceInfoFromPgYbTablegroup( |
2141 | 0 | database_oid, |
2142 | 0 | table_to_tablespace_map.get()); |
2143 | 0 | if (!tablegroup_tablespace_status.ok()) { |
2144 | 0 | LOG(WARNING) << "Refreshing tablegroup->tablespace info failed for namespace " |
2145 | 0 | << nsid << " with error: " << tablegroup_tablespace_status.ToString(); |
2146 | 0 | } |
2147 | 0 | if (table_tablespace_status.ok() && tablegroup_tablespace_status.ok()) { |
2148 | 0 | VLOG(5) << "Successfully refreshed placement information for namespace " << nsid |
2149 | 0 | << " from pg_class and pg_yb_tablegroup"; |
2150 | 0 | } |
2151 | 0 | } |
2152 | |
|
2153 | 0 | return table_to_tablespace_map; |
2154 | 0 | } |
2155 | | |
2156 | | Status CatalogManager::CreateTransactionStatusTablesForTablespaces( |
2157 | | const TablespaceIdToReplicationInfoMap& tablespace_info, |
2158 | 0 | const TableToTablespaceIdMap& table_to_tablespace_map) { |
2159 | 0 | if (!GetAtomicFlag(&FLAGS_enable_ysql_tablespaces_for_placement) || |
2160 | 0 | !GetAtomicFlag(&FLAGS_auto_create_local_transaction_tables)) { |
2161 | 0 | return Status::OK(); |
2162 | 0 | } |
2163 | | |
2164 | 0 | std::unordered_set<TablespaceId> valid_tablespaces; |
2165 | 0 | for (const auto& entry : table_to_tablespace_map) { |
2166 | 0 | if (entry.second) { |
2167 | 0 | valid_tablespaces.insert(*entry.second); |
2168 | 0 | } |
2169 | 0 | } |
2170 | 0 | for (const auto& entry : tablespace_info) { |
2171 | 0 | if (!entry.second) { |
2172 | 0 | valid_tablespaces.erase(entry.first); |
2173 | 0 | } |
2174 | 0 | } |
2175 | |
|
2176 | 0 | for (const auto& tablespace_id : valid_tablespaces) { |
2177 | 0 | RETURN_NOT_OK(CreateLocalTransactionStatusTableIfNeeded(nullptr /* rpc */, tablespace_id)); |
2178 | 0 | } |
2179 | |
|
2180 | 0 | return Status::OK(); |
2181 | 0 | } |
2182 | | |
2183 | 17.2k | void CatalogManager::StartTablespaceBgTaskIfStopped() { |
2184 | 17.2k | if (GetAtomicFlag(&FLAGS_ysql_tablespace_info_refresh_secs) <= 0 || |
2185 | 17.2k | !GetAtomicFlag(&FLAGS_enable_ysql_tablespaces_for_placement)) { |
2186 | | // The tablespace bg task is disabled. Nothing to do. |
2187 | 0 | return; |
2188 | 0 | } |
2189 | | |
2190 | 17.2k | const bool is_task_running = tablespace_bg_task_running_.exchange(true); |
2191 | 17.2k | if (is_task_running) { |
2192 | | // Task already running, nothing to do. |
2193 | 16.8k | return; |
2194 | 16.8k | } |
2195 | | |
2196 | 414 | ScheduleRefreshTablespaceInfoTask(true /* schedule_now */); |
2197 | 414 | } |
2198 | | |
2199 | 947 | void CatalogManager::ScheduleRefreshTablespaceInfoTask(const bool schedule_now) { |
2200 | 947 | int wait_time = 0; |
2201 | | |
2202 | 947 | if (!schedule_now) { |
2203 | 533 | wait_time = GetAtomicFlag(&FLAGS_ysql_tablespace_info_refresh_secs); |
2204 | 533 | if (wait_time <= 0) { |
2205 | | // The tablespace refresh task has been disabled. |
2206 | 0 | tablespace_bg_task_running_ = false; |
2207 | 0 | return; |
2208 | 0 | } |
2209 | 947 | } |
2210 | | |
2211 | 947 | refresh_ysql_tablespace_info_task_.Schedule([this](const Status& status) { |
2212 | 561 | Status s = background_tasks_thread_pool_->SubmitFunc( |
2213 | 561 | std::bind(&CatalogManager::RefreshTablespaceInfoPeriodically, this)); |
2214 | 561 | if (!s.IsOk()) { |
2215 | | // Failed to submit task to the thread pool. Mark that the task is now |
2216 | | // no longer running. |
2217 | 0 | LOG(WARNING) << "Failed to schedule: RefreshTablespaceInfoPeriodically"; |
2218 | 0 | tablespace_bg_task_running_ = false; |
2219 | 0 | } |
2220 | 561 | }, wait_time * 1s); |
2221 | 947 | } |
2222 | | |
2223 | 560 | void CatalogManager::RefreshTablespaceInfoPeriodically() { |
2224 | 560 | if (!GetAtomicFlag(&FLAGS_enable_ysql_tablespaces_for_placement)) { |
2225 | 0 | tablespace_bg_task_running_ = false; |
2226 | 0 | return; |
2227 | 0 | } |
2228 | | |
2229 | 560 | if (!CheckIsLeaderAndReady().IsOk()) { |
2230 | 27 | LOG(INFO) << "No longer the leader, so cancelling tablespace info task"; |
2231 | 27 | tablespace_bg_task_running_ = false; |
2232 | 27 | return; |
2233 | 27 | } |
2234 | | |
2235 | | // Refresh the tablespace info in memory. |
2236 | 533 | Status s = DoRefreshTablespaceInfo(); |
2237 | 533 | if (!s.IsOk()) { |
2238 | 50 | LOG(WARNING) << "Tablespace refresh task failed with error " << s.ToString(); |
2239 | 50 | } |
2240 | | |
2241 | | // Schedule the next iteration of the task. |
2242 | 533 | ScheduleRefreshTablespaceInfoTask(); |
2243 | 533 | } |
2244 | | |
2245 | 468 | Status CatalogManager::DoRefreshTablespaceInfo() { |
2246 | 0 | VLOG(2) << "Running RefreshTablespaceInfoPeriodically task"; |
2247 | | |
2248 | | // First refresh the tablespace info in memory. |
2249 | 468 | auto tablespace_info = VERIFY_RESULT(GetYsqlTablespaceInfo()); |
2250 | | |
2251 | | // Clear tablespace ids for transaction tables mapped to missing tablespaces. |
2252 | 468 | RETURN_NOT_OK(UpdateTransactionStatusTableTablespaces(*tablespace_info)); |
2253 | | |
2254 | 468 | shared_ptr<TableToTablespaceIdMap> table_to_tablespace_map = nullptr; |
2255 | | |
2256 | 468 | if (tablespace_info->size() > kYsqlNumDefaultTablespaces) { |
2257 | | // There exist custom tablespaces in the system. Fetch the table->tablespace |
2258 | | // map from PG catalog tables. |
2259 | 0 | table_to_tablespace_map = VERIFY_RESULT(GetYsqlTableToTablespaceMap(*tablespace_info)); |
2260 | 0 | } |
2261 | | |
2262 | | // Update tablespace_manager_. |
2263 | 468 | { |
2264 | 468 | LockGuard lock(tablespace_mutex_); |
2265 | 468 | tablespace_manager_ = std::make_shared<YsqlTablespaceManager>(tablespace_info, |
2266 | 468 | table_to_tablespace_map); |
2267 | 468 | } |
2268 | | |
2269 | 468 | if (table_to_tablespace_map) { |
2270 | | // Trigger transaction table creates for tablespaces with tables and no transaction tables. |
2271 | 0 | RETURN_NOT_OK(CreateTransactionStatusTablesForTablespaces( |
2272 | 0 | *tablespace_info, *table_to_tablespace_map)); |
2273 | 0 | } |
2274 | | |
2275 | 0 | VLOG(3) << "Refreshed tablespace information in memory"; |
2276 | 468 | return Status::OK(); |
2277 | 468 | } |
2278 | | |
2279 | | Status CatalogManager::AddIndexInfoToTable(const scoped_refptr<TableInfo>& indexed_table, |
2280 | | const IndexInfoPB& index_info, |
2281 | 603 | CreateTableResponsePB* resp) { |
2282 | 603 | LOG(INFO) << "AddIndexInfoToTable to " << indexed_table->ToString() << " IndexInfo " |
2283 | 603 | << yb::ToString(index_info); |
2284 | 603 | TRACE("Locking indexed table"); |
2285 | 603 | auto l = DCHECK_NOTNULL(indexed_table)->LockForWrite(); |
2286 | 603 | RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp)); |
2287 | | |
2288 | | // Make sure that the index appears to not have been added to the table until the tservers apply |
2289 | | // the alter and respond back. |
2290 | | // Heed issue #6233. |
2291 | 603 | if (!l->pb.has_fully_applied_schema()) { |
2292 | 591 | MultiStageAlterTable::CopySchemaDetailsToFullyApplied(&l.mutable_data()->pb); |
2293 | 591 | } |
2294 | | |
2295 | | // Add index info to indexed table and increment schema version. |
2296 | 603 | auto& pb = l.mutable_data()->pb; |
2297 | 603 | pb.add_indexes()->CopyFrom(index_info); |
2298 | 603 | pb.set_version(l.mutable_data()->pb.version() + 1); |
2299 | 603 | pb.set_updates_only_index_permissions(false); |
2300 | 603 | l.mutable_data()->set_state( |
2301 | 603 | SysTablesEntryPB::ALTERING, |
2302 | 603 | Format("Add index info version=$0 ts=$1", pb.version(), LocalTimeAsString())); |
2303 | | |
2304 | | // Update sys-catalog with the new indexed table info. |
2305 | 603 | TRACE("Updating indexed table metadata on disk"); |
2306 | 603 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), indexed_table)); |
2307 | | |
2308 | | // Update the in-memory state. |
2309 | 603 | TRACE("Committing in-memory state"); |
2310 | 603 | l.Commit(); |
2311 | | |
2312 | 603 | RETURN_NOT_OK(SendAlterTableRequest(indexed_table)); |
2313 | | |
2314 | 603 | return Status::OK(); |
2315 | 603 | } |
2316 | | |
2317 | | Status CatalogManager::CreateCopartitionedTable(const CreateTableRequestPB& req, |
2318 | | CreateTableResponsePB* resp, |
2319 | | rpc::RpcContext* rpc, |
2320 | | Schema schema, |
2321 | 0 | scoped_refptr<NamespaceInfo> ns) { |
2322 | 0 | scoped_refptr<TableInfo> parent_table_info; |
2323 | 0 | Status s; |
2324 | 0 | PartitionSchema partition_schema; |
2325 | 0 | std::vector<Partition> partitions; |
2326 | |
|
2327 | 0 | const NamespaceId& namespace_id = ns->id(); |
2328 | 0 | const NamespaceName& namespace_name = ns->name(); |
2329 | |
|
2330 | 0 | LockGuard lock(mutex_); |
2331 | 0 | TRACE("Acquired catalog manager lock"); |
2332 | 0 | parent_table_info = FindPtrOrNull(*table_ids_map_, |
2333 | 0 | schema.table_properties().CopartitionTableId()); |
2334 | 0 | if (parent_table_info == nullptr) { |
2335 | 0 | s = STATUS(NotFound, "The object does not exist: copartitioned table with id", |
2336 | 0 | schema.table_properties().CopartitionTableId()); |
2337 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); |
2338 | 0 | } |
2339 | | |
2340 | 0 | TableInfoPtr this_table_info; |
2341 | | // Verify that the table does not exist. |
2342 | 0 | this_table_info = FindPtrOrNull(table_names_map_, {namespace_id, req.name()}); |
2343 | |
|
2344 | 0 | if (this_table_info != nullptr) { |
2345 | 0 | s = STATUS_SUBSTITUTE(AlreadyPresent, |
2346 | 0 | "Object '$0.$1' already exists", |
2347 | 0 | GetNamespaceNameUnlocked(this_table_info), this_table_info->name()); |
2348 | 0 | LOG(WARNING) << "Found table: " << this_table_info->ToStringWithState() |
2349 | 0 | << ". Failed creating copartitioned table with error: " |
2350 | 0 | << s.ToString() << " Request:\n" << req.DebugString(); |
2351 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s); |
2352 | 0 | } |
2353 | | // Don't add copartitioned tables to Namespaces that aren't running. |
2354 | 0 | if (ns->state() != SysNamespaceEntryPB::RUNNING) { |
2355 | 0 | Status s = STATUS_SUBSTITUTE(TryAgain, |
2356 | 0 | "Namespace not running (State=$0). Cannot create $1.$2", |
2357 | 0 | ns->state(), ns->name(), req.name() ); |
2358 | 0 | return SetupError(resp->mutable_error(), NamespaceMasterError(ns->state()), s); |
2359 | 0 | } |
2360 | | |
2361 | | // TODO: pass index_info for copartitioned index. |
2362 | 0 | RETURN_NOT_OK(CreateTableInMemory( |
2363 | 0 | req, schema, partition_schema, namespace_id, namespace_name, |
2364 | 0 | partitions, nullptr, nullptr, resp, &this_table_info)); |
2365 | |
|
2366 | 0 | TRACE("Inserted new table info into CatalogManager maps"); |
2367 | | |
2368 | | // NOTE: the table is already locked for write at this point, |
2369 | | // since the CreateTableInfo function leave it in that state. |
2370 | | // It will get committed at the end of this function. |
2371 | | // Sanity check: the table should be in "preparing" state. |
2372 | 0 | CHECK_EQ(SysTablesEntryPB::PREPARING, this_table_info->metadata().dirty().pb.state()); |
2373 | 0 | TabletInfos tablets = parent_table_info->GetTablets(); |
2374 | 0 | for (auto tablet : tablets) { |
2375 | 0 | tablet->mutable_metadata()->StartMutation(); |
2376 | 0 | tablet->mutable_metadata()->mutable_dirty()->pb.add_table_ids(this_table_info->id()); |
2377 | 0 | } |
2378 | | |
2379 | | // Update Tablets about new table id to sys-tablets. |
2380 | 0 | s = sys_catalog_->Upsert(leader_ready_term(), tablets); |
2381 | 0 | if (PREDICT_FALSE(!s.ok())) { |
2382 | 0 | return AbortTableCreation(this_table_info.get(), tablets, s.CloneAndPrepend( |
2383 | 0 | Substitute("An error occurred while inserting to sys-tablets: $0", s.ToString())), resp); |
2384 | 0 | } |
2385 | 0 | TRACE("Wrote tablets to system table"); |
2386 | | |
2387 | | // Update the on-disk table state to "running". |
2388 | 0 | this_table_info->AddTablets(tablets); |
2389 | 0 | this_table_info->mutable_metadata()->mutable_dirty()->pb.set_state(SysTablesEntryPB::RUNNING); |
2390 | 0 | s = sys_catalog_->Upsert(leader_ready_term(), this_table_info); |
2391 | 0 | if (PREDICT_FALSE(!s.ok())) { |
2392 | 0 | return AbortTableCreation(this_table_info.get(), tablets, s.CloneAndPrepend( |
2393 | 0 | Substitute("An error occurred while inserting to sys-tablets: $0", |
2394 | 0 | s.ToString())), resp); |
2395 | 0 | } |
2396 | 0 | TRACE("Wrote table to system table"); |
2397 | | |
2398 | | // Commit the in-memory state. |
2399 | 0 | this_table_info->mutable_metadata()->CommitMutation(); |
2400 | |
|
2401 | 0 | for (const auto& tablet : tablets) { |
2402 | 0 | tablet->mutable_metadata()->CommitMutation(); |
2403 | 0 | } |
2404 | |
|
2405 | 0 | for (const auto& tablet : tablets) { |
2406 | 0 | SendCopartitionTabletRequest(tablet, this_table_info); |
2407 | 0 | } |
2408 | |
|
2409 | 0 | LOG(INFO) << "Successfully created table " << this_table_info->ToString() |
2410 | 0 | << " per request from " << RequestorString(rpc); |
2411 | 0 | return Status::OK(); |
2412 | 0 | } |
2413 | | |
2414 | | |
2415 | | template <class Req, class Resp, class Action> |
2416 | 0 | Status CatalogManager::PerformOnSysCatalogTablet(const Req& req, Resp* resp, const Action& action) { |
2417 | 0 | auto tablet_peer = sys_catalog_->tablet_peer(); |
2418 | 0 | auto shared_tablet = tablet_peer ? tablet_peer->shared_tablet() : nullptr; |
2419 | 0 | if (!shared_tablet) { |
2420 | 0 | return SetupError( |
2421 | 0 | resp->mutable_error(), |
2422 | 0 | MasterErrorPB::TABLET_NOT_RUNNING, |
2423 | 0 | STATUS(NotFound, "The sys catalog tablet was not found.")); |
2424 | 0 | } |
2425 | | |
2426 | 0 | auto s = action(shared_tablet); |
2427 | 0 | if (!s.ok()) { |
2428 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INTERNAL_ERROR, s); |
2429 | 0 | } |
2430 | | |
2431 | 0 | return Status::OK(); |
2432 | 0 | } Unexecuted instantiation: catalog_manager.cc:_ZN2yb6master14CatalogManager25PerformOnSysCatalogTabletIPKNS0_24FlushSysCatalogRequestPBENS0_25FlushSysCatalogResponsePBEZNS1_15FlushSysCatalogES5_PS6_PNS_3rpc10RpcContextEE3$_4EENS_6StatusERKT_PT0_RKT1_ Unexecuted instantiation: catalog_manager.cc:_ZN2yb6master14CatalogManager25PerformOnSysCatalogTabletIPKNS0_26CompactSysCatalogRequestPBENS0_27CompactSysCatalogResponsePBEZNS1_17CompactSysCatalogES5_PS6_PNS_3rpc10RpcContextEE3$_5EENS_6StatusERKT_PT0_RKT1_ |
2433 | | |
2434 | | Status CatalogManager::FlushSysCatalog( |
2435 | | const FlushSysCatalogRequestPB* req, |
2436 | | FlushSysCatalogResponsePB* resp, |
2437 | 0 | rpc::RpcContext* context) { |
2438 | 0 | return PerformOnSysCatalogTablet(req, resp, [](auto shared_tablet) { |
2439 | 0 | return shared_tablet->Flush(tablet::FlushMode::kSync); |
2440 | 0 | }); |
2441 | 0 | } |
2442 | | |
2443 | | Status CatalogManager::CompactSysCatalog( |
2444 | | const CompactSysCatalogRequestPB* req, |
2445 | | CompactSysCatalogResponsePB* resp, |
2446 | 0 | rpc::RpcContext* context) { |
2447 | 0 | return PerformOnSysCatalogTablet(req, resp, [](auto shared_tablet) { |
2448 | 0 | return shared_tablet->ForceFullRocksDBCompact(); |
2449 | 0 | }); |
2450 | 0 | } |
2451 | | |
2452 | | namespace { |
2453 | | |
2454 | | Result<std::array<PartitionPB, kNumSplitParts>> CreateNewTabletsPartition( |
2455 | 45 | const TabletInfo& tablet_info, const std::string& split_partition_key) { |
2456 | 45 | const auto& source_partition = tablet_info.LockForRead()->pb.partition(); |
2457 | | |
2458 | 45 | if (split_partition_key <= source_partition.partition_key_start() || |
2459 | 45 | (!source_partition.partition_key_end().empty() && |
2460 | 32 | split_partition_key >= source_partition.partition_key_end())) { |
2461 | 0 | return STATUS_FORMAT( |
2462 | 0 | InvalidArgument, |
2463 | 0 | "Can't split tablet $0 (partition_key_start: $1 partition_key_end: $2) by partition " |
2464 | 0 | "boundary (split_key: $3)", |
2465 | 0 | tablet_info.tablet_id(), source_partition.partition_key_start(), |
2466 | 0 | source_partition.partition_key_end(), split_partition_key); |
2467 | 0 | } |
2468 | | |
2469 | 45 | std::array<PartitionPB, kNumSplitParts> new_tablets_partition; |
2470 | | |
2471 | 45 | new_tablets_partition.fill(source_partition); |
2472 | | |
2473 | 45 | new_tablets_partition[0].set_partition_key_end(split_partition_key); |
2474 | 45 | new_tablets_partition[1].set_partition_key_start(split_partition_key); |
2475 | 45 | static_assert(kNumSplitParts == 2, "We expect tablet to be split into 2 new tablets here"); |
2476 | | |
2477 | 45 | return new_tablets_partition; |
2478 | 45 | } |
2479 | | |
2480 | | } // namespace |
2481 | | |
2482 | | CHECKED_STATUS CatalogManager::TEST_SplitTablet( |
2483 | | const TabletId& tablet_id, const std::string& split_encoded_key, |
2484 | 0 | const std::string& split_partition_key) { |
2485 | 0 | auto source_tablet_info = VERIFY_RESULT(GetTabletInfo(tablet_id)); |
2486 | 0 | return DoSplitTablet(source_tablet_info, split_encoded_key, split_partition_key, |
2487 | 0 | true /* select_all_tablets_for_split */); |
2488 | 0 | } |
2489 | | |
2490 | | Status CatalogManager::TEST_SplitTablet( |
2491 | 0 | const scoped_refptr<TabletInfo>& source_tablet_info, docdb::DocKeyHash split_hash_code) { |
2492 | 0 | return DoSplitTablet(source_tablet_info, split_hash_code, |
2493 | 0 | true /* select_all_tablets_for_split */); |
2494 | 0 | } |
2495 | | |
2496 | 0 | Status CatalogManager::TEST_IncrementTablePartitionListVersion(const TableId& table_id) { |
2497 | 0 | auto table_info = GetTableInfo(table_id); |
2498 | 0 | SCHECK(table_info != nullptr, NotFound, Format("Table $0 not found", table_id)); |
2499 | |
|
2500 | 0 | LockGuard lock(mutex_); |
2501 | 0 | auto table_lock = table_info->LockForWrite(); |
2502 | 0 | auto& table_pb = table_lock.mutable_data()->pb; |
2503 | 0 | table_pb.set_partition_list_version(table_pb.partition_list_version() + 1); |
2504 | 0 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), table_info)); |
2505 | 0 | table_lock.Commit(); |
2506 | 0 | return Status::OK(); |
2507 | 0 | } |
2508 | | |
2509 | | Result<ReplicationInfoPB> CatalogManager::GetTableReplicationInfo( |
2510 | 81 | const TabletInfo& tablet_info) const { |
2511 | 81 | auto table = tablet_info.table(); |
2512 | 81 | { |
2513 | 81 | auto table_lock = table->LockForRead(); |
2514 | 81 | if (table_lock->pb.has_replication_info()) { |
2515 | 0 | return table_lock->pb.replication_info(); |
2516 | 0 | } |
2517 | 81 | } |
2518 | | |
2519 | 81 | auto replication_info_opt = VERIFY_RESULT( |
2520 | 81 | GetTablespaceManager()->GetTableReplicationInfo(table)); |
2521 | 81 | if (replication_info_opt) { |
2522 | 0 | return replication_info_opt.value(); |
2523 | 0 | } |
2524 | | |
2525 | 81 | return cluster_config_->LockForRead()->pb.replication_info(); |
2526 | 81 | } |
2527 | | |
2528 | | bool CatalogManager::ShouldSplitValidCandidate( |
2529 | 545k | const TabletInfo& tablet_info, const TabletReplicaDriveInfo& drive_info) const { |
2530 | 545k | if (drive_info.may_have_orphaned_post_split_data) { |
2531 | 87.0k | return false; |
2532 | 87.0k | } |
2533 | 458k | ssize_t size = drive_info.sst_files_size; |
2534 | 0 | DCHECK(size >= 0) << "Detected overflow in casting sst_files_size to signed int."; |
2535 | 458k | if (size < FLAGS_tablet_split_low_phase_size_threshold_bytes) { |
2536 | 458k | return false; |
2537 | 458k | } |
2538 | 81 | TSDescriptorVector ts_descs = GetAllLiveNotBlacklistedTServers(); |
2539 | | |
2540 | 81 | size_t num_servers = 0; |
2541 | 81 | auto table_replication_info_or_status = GetTableReplicationInfo(tablet_info); |
2542 | | |
2543 | | // If there is custom placement information present then |
2544 | | // only count the tservers which the table has access to |
2545 | | // according to the placement policy |
2546 | 81 | if (table_replication_info_or_status.ok() |
2547 | 81 | && table_replication_info_or_status->has_live_replicas()) { |
2548 | 0 | auto pb = table_replication_info_or_status->live_replicas(); |
2549 | 0 | auto valid_tservers_res = FindTServersForPlacementInfo( |
2550 | 0 | table_replication_info_or_status->live_replicas(), ts_descs); |
2551 | 0 | if (!valid_tservers_res.ok()) { |
2552 | 0 | num_servers = ts_descs.size(); |
2553 | 0 | } else { |
2554 | 0 | num_servers = valid_tservers_res.get().size(); |
2555 | 0 | } |
2556 | 81 | } else { |
2557 | 81 | num_servers = ts_descs.size(); |
2558 | 81 | } |
2559 | | |
2560 | 81 | int64 num_tablets_per_server = tablet_info.table()->NumPartitions() / num_servers; |
2561 | | |
2562 | 81 | if (num_tablets_per_server < FLAGS_tablet_split_low_phase_shard_count_per_node) { |
2563 | 0 | return size > FLAGS_tablet_split_low_phase_size_threshold_bytes; |
2564 | 0 | } |
2565 | 81 | if (num_tablets_per_server < FLAGS_tablet_split_high_phase_shard_count_per_node) { |
2566 | 0 | return size > FLAGS_tablet_split_high_phase_size_threshold_bytes; |
2567 | 0 | } |
2568 | 81 | return size > FLAGS_tablet_force_split_threshold_bytes; |
2569 | 81 | } |
2570 | | |
2571 | | Status CatalogManager::DoSplitTablet( |
2572 | | const scoped_refptr<TabletInfo>& source_tablet_info, std::string split_encoded_key, |
2573 | 45 | std::string split_partition_key, bool select_all_tablets_for_split) { |
2574 | 45 | auto source_table_lock = source_tablet_info->table()->LockForWrite(); |
2575 | 45 | auto source_tablet_lock = source_tablet_info->LockForWrite(); |
2576 | | |
2577 | | // We must re-validate the split candidate here *after* grabbing locks on the table and tablet to |
2578 | | // ensure a backfill does not happen before we modify catalog metadata to include new subtablets. |
2579 | | // This process adds new subtablets in the CREATING state, which if encountered by backfill code |
2580 | | // will block the backfill process. |
2581 | 45 | RETURN_NOT_OK(tablet_split_manager_.ValidateSplitCandidateTable(*source_tablet_info->table())); |
2582 | 45 | RETURN_NOT_OK(tablet_split_manager_.ValidateSplitCandidateTablet(*source_tablet_info)); |
2583 | | |
2584 | 45 | auto drive_info = VERIFY_RESULT(source_tablet_info->GetLeaderReplicaDriveInfo()); |
2585 | 45 | if (!select_all_tablets_for_split && |
2586 | 35 | !ShouldSplitValidCandidate(*source_tablet_info, drive_info)) { |
2587 | | // It is possible that we queued up a split candidate in TabletSplitManager which was, at the |
2588 | | // time, a valid split candidate, but by the time the candidate was actually processed here, the |
2589 | | // cluster may have changed, putting us in a new split threshold phase, and it may no longer be |
2590 | | // a valid candidate. This is not an unexpected error, but we should bail out of splitting this |
2591 | | // tablet regardless. |
2592 | 0 | return STATUS_FORMAT( |
2593 | 0 | InvalidArgument, |
2594 | 0 | "Tablet split candidate $0 is no longer a valid split candidate.", |
2595 | 0 | source_tablet_info->tablet_id()); |
2596 | 0 | } |
2597 | | |
2598 | | // Check if at least one child tablet already registered |
2599 | 45 | if (source_tablet_lock->pb.split_tablet_ids().size() > 0) { |
2600 | 27 | const auto child_tablet_id = source_tablet_lock->pb.split_tablet_ids(0); |
2601 | 27 | const auto child_tablet = VERIFY_RESULT(GetTabletInfo(child_tablet_id)); |
2602 | 27 | const auto parent_partition = source_tablet_lock->pb.partition(); |
2603 | 27 | const auto child_partition = child_tablet->LockForRead()->pb.partition(); |
2604 | | |
2605 | 27 | if (parent_partition.partition_key_start() == child_partition.partition_key_start()) { |
2606 | 27 | split_partition_key = child_partition.partition_key_end(); |
2607 | 0 | } else { |
2608 | 0 | SCHECK_EQ(parent_partition.partition_key_end(), child_partition.partition_key_end(), |
2609 | 0 | IllegalState, "Parent partion key end does not equal child partition key end"); |
2610 | 0 | split_partition_key = child_partition.partition_key_start(); |
2611 | 0 | } |
2612 | | |
2613 | | // Re-compute the encoded key |
2614 | | // to ensure we use the same partition boundary for both child tablets |
2615 | 27 | split_encoded_key = PartitionSchema::GetEncodedKeyPrefix( |
2616 | 27 | split_partition_key, source_table_lock->pb.partition_schema()); |
2617 | 27 | } |
2618 | | |
2619 | 45 | LOG(INFO) << "Starting tablet split: " << source_tablet_info->ToString() |
2620 | 45 | << " by partition key: " << Slice(split_partition_key).ToDebugHexString(); |
2621 | | |
2622 | 45 | std::array<PartitionPB, kNumSplitParts> new_tablets_partition = VERIFY_RESULT( |
2623 | 45 | CreateNewTabletsPartition(*source_tablet_info, split_partition_key)); |
2624 | | |
2625 | 45 | std::array<TabletId, kNumSplitParts> new_tablet_ids; |
2626 | 133 | for (int i = 0; i < kNumSplitParts; ++i) { |
2627 | 88 | if (i < source_tablet_lock->pb.split_tablet_ids_size()) { |
2628 | | // Post-split tablet `i` has been already registered. |
2629 | 52 | new_tablet_ids[i] = source_tablet_lock->pb.split_tablet_ids(i); |
2630 | 36 | } else { |
2631 | 36 | auto new_tablet_info = VERIFY_RESULT(RegisterNewTabletForSplit( |
2632 | 36 | source_tablet_info.get(), new_tablets_partition[i], |
2633 | 36 | &source_table_lock, &source_tablet_lock)); |
2634 | | |
2635 | 36 | new_tablet_ids[i] = new_tablet_info->id(); |
2636 | 36 | } |
2637 | 88 | } |
2638 | 45 | source_tablet_lock.Commit(); |
2639 | 45 | source_table_lock.Commit(); |
2640 | | |
2641 | | // TODO(tsplit): what if source tablet will be deleted before or during TS leader is processing |
2642 | | // split? Add unit-test. |
2643 | 45 | RETURN_NOT_OK(SendSplitTabletRequest( |
2644 | 45 | source_tablet_info, new_tablet_ids, split_encoded_key, split_partition_key)); |
2645 | | |
2646 | 45 | return Status::OK(); |
2647 | 45 | } |
2648 | | |
2649 | | Status CatalogManager::DoSplitTablet( |
2650 | | const scoped_refptr<TabletInfo>& source_tablet_info, docdb::DocKeyHash split_hash_code, |
2651 | 0 | bool select_all_tablets_for_split) { |
2652 | 0 | docdb::KeyBytes split_encoded_key; |
2653 | 0 | docdb::DocKeyEncoderAfterTableIdStep(&split_encoded_key) |
2654 | 0 | .Hash(split_hash_code, std::vector<docdb::PrimitiveValue>()); |
2655 | |
|
2656 | 0 | const auto split_partition_key = PartitionSchema::EncodeMultiColumnHashValue(split_hash_code); |
2657 | |
|
2658 | 0 | return DoSplitTablet(source_tablet_info, split_encoded_key.ToStringBuffer(), split_partition_key, |
2659 | 0 | select_all_tablets_for_split); |
2660 | 0 | } |
2661 | | |
2662 | 80 | Result<scoped_refptr<TabletInfo>> CatalogManager::GetTabletInfo(const TabletId& tablet_id) { |
2663 | 80 | LockGuard lock(mutex_); |
2664 | 80 | TRACE("Acquired catalog manager lock"); |
2665 | | |
2666 | 80 | const auto tablet_info = FindPtrOrNull(*tablet_map_, tablet_id); |
2667 | 80 | SCHECK(tablet_info != nullptr, NotFound, Format("Tablet $0 not found", tablet_id)); |
2668 | | |
2669 | 80 | return tablet_info; |
2670 | 80 | } |
2671 | | |
2672 | | void CatalogManager::SplitTabletWithKey( |
2673 | | const scoped_refptr<TabletInfo>& tablet, const std::string& split_encoded_key, |
2674 | 45 | const std::string& split_partition_key, const bool select_all_tablets_for_split) { |
2675 | | // Note that DoSplitTablet() will trigger an async SplitTablet task, and will only return not OK() |
2676 | | // if it failed to submit that task. In other words, any failures here are not retriable, and |
2677 | | // success indicates that an async and automatically retrying task was submitted. |
2678 | 45 | auto s = DoSplitTablet( |
2679 | 45 | tablet, split_encoded_key, split_partition_key, select_all_tablets_for_split); |
2680 | 45 | WARN_NOT_OK(s, Format("Failed to split tablet with GetSplitKey result for tablet: $0", |
2681 | 45 | tablet->tablet_id())); |
2682 | 45 | } |
2683 | | |
2684 | 45 | Status CatalogManager::SplitTablet(const TabletId& tablet_id, bool select_all_tablets_for_split) { |
2685 | 45 | LOG(INFO) << "Got tablet to split: " << tablet_id; |
2686 | | |
2687 | 45 | const auto tablet = VERIFY_RESULT(GetTabletInfo(tablet_id)); |
2688 | | |
2689 | 0 | VLOG(2) << "Scheduling GetSplitKey request to leader tserver for source tablet ID: " |
2690 | 0 | << tablet->tablet_id(); |
2691 | 45 | auto call = std::make_shared<AsyncGetTabletSplitKey>( |
2692 | 45 | master_, AsyncTaskPool(), tablet, |
2693 | 45 | [this, tablet, select_all_tablets_for_split] |
2694 | 45 | (const Result<AsyncGetTabletSplitKey::Data>& result) { |
2695 | 45 | if (result.ok()) { |
2696 | 45 | SplitTabletWithKey(tablet, result->split_encoded_key, result->split_partition_key, |
2697 | 45 | select_all_tablets_for_split); |
2698 | 0 | } else { |
2699 | 0 | LOG(WARNING) << "AsyncGetTabletSplitKey task failed with status: " << result.status(); |
2700 | 0 | } |
2701 | 45 | }); |
2702 | 45 | tablet->table()->AddTask(call); |
2703 | 45 | return ScheduleTask(call); |
2704 | 45 | } |
2705 | | |
2706 | | Status CatalogManager::SplitTablet( |
2707 | 10 | const SplitTabletRequestPB* req, SplitTabletResponsePB* resp, rpc::RpcContext* rpc) { |
2708 | 10 | const auto source_tablet_id = req->tablet_id(); |
2709 | 10 | return SplitTablet(source_tablet_id, true /* select_all_tablets_for_split */); |
2710 | 10 | } |
2711 | | |
2712 | | Status CatalogManager::DeleteNotServingTablet( |
2713 | | const DeleteNotServingTabletRequestPB* req, DeleteNotServingTabletResponsePB* resp, |
2714 | 8 | rpc::RpcContext* rpc) { |
2715 | 8 | const auto& tablet_id = req->tablet_id(); |
2716 | 8 | const auto tablet_info = VERIFY_RESULT(GetTabletInfo(tablet_id)); |
2717 | | |
2718 | 8 | if (PREDICT_FALSE(FLAGS_TEST_reject_delete_not_serving_tablet_rpc)) { |
2719 | 0 | TEST_SYNC_POINT("CatalogManager::DeleteNotServingTablet:Reject"); |
2720 | 0 | return STATUS( |
2721 | 0 | InvalidArgument, "Rejecting due to FLAGS_TEST_reject_delete_not_serving_tablet_rpc"); |
2722 | 0 | } |
2723 | | |
2724 | 8 | const auto& table_info = tablet_info->table(); |
2725 | | |
2726 | 8 | RETURN_NOT_OK(CheckIfForbiddenToDeleteTabletOf(table_info)); |
2727 | | |
2728 | 8 | RETURN_NOT_OK(CatalogManagerUtil::CheckIfCanDeleteSingleTablet(tablet_info)); |
2729 | | |
2730 | 7 | auto schedules_to_tables_map = VERIFY_RESULT( |
2731 | 7 | MakeSnapshotSchedulesToObjectIdsMap(SysRowEntryType::TABLE)); |
2732 | 7 | RepeatedBytes retained_by_snapshot_schedules; |
2733 | 7 | FillRetainedBySnapshotSchedules( |
2734 | 7 | schedules_to_tables_map, table_info->id(), &retained_by_snapshot_schedules); |
2735 | | |
2736 | 7 | return DeleteTabletListAndSendRequests( |
2737 | 7 | { tablet_info }, "Not serving tablet deleted upon request at " + LocalTimeAsString(), |
2738 | 7 | retained_by_snapshot_schedules); |
2739 | 7 | } |
2740 | | |
2741 | | Status CatalogManager::DdlLog( |
2742 | 0 | const DdlLogRequestPB* req, DdlLogResponsePB* resp, rpc::RpcContext* rpc) { |
2743 | 0 | return sys_catalog_->FetchDdlLog(resp->mutable_entries()); |
2744 | 0 | } |
2745 | | |
2746 | | namespace { |
2747 | | |
2748 | 6.29k | CHECKED_STATUS ValidateCreateTableSchema(const Schema& schema, CreateTableResponsePB* resp) { |
2749 | 6.29k | if (schema.num_key_columns() <= 0) { |
2750 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, |
2751 | 0 | STATUS(InvalidArgument, "Must specify at least one key column")); |
2752 | 0 | } |
2753 | 17.4k | for (size_t i = 0; i < schema.num_key_columns(); i++) { |
2754 | 11.1k | if (!IsTypeAllowableInKey(schema.column(i).type_info())) { |
2755 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, |
2756 | 0 | STATUS(InvalidArgument, "Invalid datatype for primary key column")); |
2757 | 0 | } |
2758 | 11.1k | } |
2759 | 6.29k | return Status::OK(); |
2760 | 6.29k | } |
2761 | | |
2762 | | } // namespace |
2763 | | |
2764 | | Status CatalogManager::CreateYsqlSysTable(const CreateTableRequestPB* req, |
2765 | 2.26k | CreateTableResponsePB* resp) { |
2766 | 2.26k | LOG(INFO) << "CreateYsqlSysTable: " << req->name(); |
2767 | | // Lookup the namespace and verify if it exists. |
2768 | 2.26k | TRACE("Looking up namespace"); |
2769 | 2.26k | auto ns = VERIFY_RESULT(FindNamespace(req->namespace_())); |
2770 | 2.26k | const NamespaceId& namespace_id = ns->id(); |
2771 | 2.26k | const NamespaceName& namespace_name = ns->name(); |
2772 | | |
2773 | 2.26k | Schema schema; |
2774 | 2.26k | RETURN_NOT_OK(SchemaFromPB(req->schema(), &schema)); |
2775 | | // If the schema contains column ids, we are copying a Postgres table from one namespace to |
2776 | | // another. Anyway, validate the schema. |
2777 | 2.26k | RETURN_NOT_OK(ValidateCreateTableSchema(schema, resp)); |
2778 | 2.26k | if (!schema.has_column_ids()) { |
2779 | 0 | schema.InitColumnIdsByDefault(); |
2780 | 0 | } |
2781 | 2.26k | schema.mutable_table_properties()->set_is_ysql_catalog_table(true); |
2782 | | |
2783 | | // Verify no hash partition schema is specified. |
2784 | 2.26k | if (req->partition_schema().has_hash_schema()) { |
2785 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, |
2786 | 0 | STATUS(InvalidArgument, |
2787 | 0 | "PostgreSQL system catalog tables are non-partitioned")); |
2788 | 0 | } |
2789 | | |
2790 | 2.26k | if (req->table_type() != TableType::PGSQL_TABLE_TYPE) { |
2791 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, |
2792 | 0 | STATUS_FORMAT( |
2793 | 0 | InvalidArgument, |
2794 | 0 | "Expected table type to be PGSQL_TABLE_TYPE ($0), got $1 ($2)", |
2795 | 0 | PGSQL_TABLE_TYPE, |
2796 | 0 | TableType_Name(req->table_type()))); |
2797 | |
|
2798 | 0 | } |
2799 | | |
2800 | | // Create partition schema and one partition. |
2801 | 2.26k | PartitionSchema partition_schema; |
2802 | 2.26k | vector<Partition> partitions; |
2803 | 2.26k | RETURN_NOT_OK(partition_schema.CreatePartitions(1, &partitions)); |
2804 | | |
2805 | | // Create table info in memory. |
2806 | 2.26k | scoped_refptr<TableInfo> table; |
2807 | 2.26k | scoped_refptr<TabletInfo> sys_catalog_tablet; |
2808 | 2.26k | { |
2809 | 2.26k | LockGuard lock(mutex_); |
2810 | 2.26k | TRACE("Acquired catalog manager lock"); |
2811 | | |
2812 | | // Verify that the table does not exist, or has been deleted. |
2813 | 2.26k | table = FindPtrOrNull(*table_ids_map_, req->table_id()); |
2814 | 2.26k | if (table != nullptr && !table->is_deleted()) { |
2815 | 0 | Status s = STATUS_SUBSTITUTE(AlreadyPresent, |
2816 | 0 | "YSQL table '$0.$1' (ID: $2) already exists", ns->name(), table->name(), table->id()); |
2817 | 0 | LOG(WARNING) << "Found table: " << table->ToStringWithState() |
2818 | 0 | << ". Failed creating YSQL system table with error: " |
2819 | 0 | << s.ToString() << " Request:\n" << req->DebugString(); |
2820 | | // Technically, client already knows table ID, but we set it anyway for unified handling of |
2821 | | // AlreadyPresent errors. See comment in CreateTable() |
2822 | 0 | resp->set_table_id(table->id()); |
2823 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s); |
2824 | 0 | } |
2825 | | |
2826 | 2.26k | RETURN_NOT_OK(CreateTableInMemory( |
2827 | 2.26k | *req, schema, partition_schema, namespace_id, namespace_name, |
2828 | 2.26k | partitions, nullptr /* index_info */, nullptr /* tablets */, resp, &table)); |
2829 | | |
2830 | 2.26k | sys_catalog_tablet = tablet_map_->find(kSysCatalogTabletId)->second; |
2831 | 2.26k | } |
2832 | | |
2833 | | // Tables with a transaction should be rolled back if the transaction does not get committed. |
2834 | | // Store this on the table persistent state until the transaction has been a verified success. |
2835 | 2.26k | TransactionMetadata txn; |
2836 | 2.26k | if (req->has_transaction() && FLAGS_enable_transactional_ddl_gc) { |
2837 | 0 | table->mutable_metadata()->mutable_dirty()->pb.mutable_transaction()-> |
2838 | 0 | CopyFrom(req->transaction()); |
2839 | 0 | txn = VERIFY_RESULT(TransactionMetadata::FromPB(req->transaction())); |
2840 | 0 | RSTATUS_DCHECK(!txn.status_tablet.empty(), Corruption, "Given incomplete Transaction"); |
2841 | 0 | } |
2842 | | |
2843 | 2.26k | { |
2844 | 2.26k | auto tablet_lock = sys_catalog_tablet->LockForWrite(); |
2845 | 2.26k | tablet_lock.mutable_data()->pb.add_table_ids(table->id()); |
2846 | | |
2847 | 2.26k | Status s = sys_catalog_->Upsert(leader_ready_term(), sys_catalog_tablet); |
2848 | 2.26k | if (PREDICT_FALSE(!s.ok())) { |
2849 | 1 | return AbortTableCreation(table.get(), {}, s.CloneAndPrepend( |
2850 | 1 | "An error occurred while inserting to sys-tablets: "), resp); |
2851 | 1 | } |
2852 | 2.26k | table->set_is_system(); |
2853 | 2.26k | table->AddTablet(sys_catalog_tablet.get()); |
2854 | 2.26k | tablet_lock.Commit(); |
2855 | 2.26k | } |
2856 | 2.26k | TRACE("Inserted new table info into CatalogManager maps"); |
2857 | | |
2858 | | // Update the on-disk table state to "running". |
2859 | 2.26k | table->mutable_metadata()->mutable_dirty()->pb.set_state(SysTablesEntryPB::RUNNING); |
2860 | 2.26k | Status s = sys_catalog_->Upsert(leader_ready_term(), table); |
2861 | 2.26k | if (PREDICT_FALSE(!s.ok())) { |
2862 | 0 | return AbortTableCreation(table.get(), {}, s.CloneAndPrepend( |
2863 | 0 | "An error occurred while inserting to sys-tablets: "), resp); |
2864 | 0 | } |
2865 | 2.26k | TRACE("Wrote table to system table"); |
2866 | | |
2867 | | // Commit the in-memory state. |
2868 | 2.26k | table->mutable_metadata()->CommitMutation(); |
2869 | | |
2870 | | // Verify Transaction gets committed, which occurs after table create finishes. |
2871 | 2.26k | if (req->has_transaction() && PREDICT_TRUE(FLAGS_enable_transactional_ddl_gc)) { |
2872 | 0 | LOG(INFO) << "Enqueuing table for Transaction Verification: " << req->name(); |
2873 | 0 | std::function<Status(bool)> when_done = |
2874 | 0 | std::bind(&CatalogManager::VerifyTablePgLayer, this, table, _1); |
2875 | 0 | WARN_NOT_OK(background_tasks_thread_pool_->SubmitFunc( |
2876 | 0 | std::bind(&YsqlTransactionDdl::VerifyTransaction, ysql_transaction_.get(), txn, when_done)), |
2877 | 0 | "Could not submit VerifyTransaction to thread pool"); |
2878 | 0 | } |
2879 | | |
2880 | 2.26k | tablet::ChangeMetadataRequestPB change_req; |
2881 | 2.26k | change_req.set_tablet_id(kSysCatalogTabletId); |
2882 | 2.26k | auto& add_table = *change_req.mutable_add_table(); |
2883 | | |
2884 | 2.26k | add_table.set_table_id(req->table_id()); |
2885 | 2.26k | add_table.set_table_type(TableType::PGSQL_TABLE_TYPE); |
2886 | 2.26k | add_table.set_table_name(req->name()); |
2887 | 2.26k | SchemaToPB(schema, add_table.mutable_schema()); |
2888 | 2.26k | add_table.set_schema_version(0); |
2889 | | |
2890 | 2.26k | partition_schema.ToPB(add_table.mutable_partition_schema()); |
2891 | | |
2892 | 2.26k | RETURN_NOT_OK(tablet::SyncReplicateChangeMetadataOperation( |
2893 | 2.26k | &change_req, sys_catalog_->tablet_peer().get(), leader_ready_term())); |
2894 | | |
2895 | 2.26k | if (initial_snapshot_writer_) { |
2896 | 0 | initial_snapshot_writer_->AddMetadataChange(change_req); |
2897 | 0 | } |
2898 | 2.26k | return Status::OK(); |
2899 | 2.26k | } |
2900 | | |
2901 | | Status CatalogManager::ReservePgsqlOids(const ReservePgsqlOidsRequestPB* req, |
2902 | | ReservePgsqlOidsResponsePB* resp, |
2903 | 380 | rpc::RpcContext* rpc) { |
2904 | 0 | VLOG(1) << "ReservePgsqlOids request: " << req->ShortDebugString(); |
2905 | | |
2906 | | // Lookup namespace |
2907 | 380 | scoped_refptr<NamespaceInfo> ns; |
2908 | 380 | { |
2909 | 380 | SharedLock lock(mutex_); |
2910 | 380 | ns = FindPtrOrNull(namespace_ids_map_, req->namespace_id()); |
2911 | 380 | } |
2912 | 380 | if (!ns) { |
2913 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, |
2914 | 0 | STATUS(NotFound, "Namespace not found", req->namespace_id())); |
2915 | 0 | } |
2916 | | |
2917 | | // Reserve oids. |
2918 | 380 | auto l = ns->LockForWrite(); |
2919 | | |
2920 | 380 | uint32_t begin_oid = l->pb.next_pg_oid(); |
2921 | 380 | if (begin_oid < req->next_oid()) { |
2922 | 355 | begin_oid = req->next_oid(); |
2923 | 355 | } |
2924 | 380 | if (begin_oid == std::numeric_limits<uint32_t>::max()) { |
2925 | 0 | LOG(WARNING) << Format("No more object identifier is available for Postgres database $0 ($1)", |
2926 | 0 | l->pb.name(), req->namespace_id()); |
2927 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::UNKNOWN_ERROR, |
2928 | 0 | STATUS(InvalidArgument, "No more object identifier is available")); |
2929 | 0 | } |
2930 | | |
2931 | 380 | uint32_t end_oid = begin_oid + req->count(); |
2932 | 380 | if (end_oid < begin_oid) { |
2933 | 0 | end_oid = std::numeric_limits<uint32_t>::max(); // Handle wraparound. |
2934 | 0 | } |
2935 | | |
2936 | 380 | resp->set_begin_oid(begin_oid); |
2937 | 380 | resp->set_end_oid(end_oid); |
2938 | 380 | l.mutable_data()->pb.set_next_pg_oid(end_oid); |
2939 | | |
2940 | | // Update the on-disk state. |
2941 | 380 | const Status s = sys_catalog_->Upsert(leader_ready_term(), ns); |
2942 | 380 | if (!s.ok()) { |
2943 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::UNKNOWN_ERROR, s); |
2944 | 0 | } |
2945 | | |
2946 | | // Commit the in-memory state. |
2947 | 380 | l.Commit(); |
2948 | | |
2949 | 0 | VLOG(1) << "ReservePgsqlOids response: " << resp->ShortDebugString(); |
2950 | | |
2951 | 380 | return Status::OK(); |
2952 | 380 | } |
2953 | | |
2954 | | Status CatalogManager::GetYsqlCatalogConfig(const GetYsqlCatalogConfigRequestPB* req, |
2955 | | GetYsqlCatalogConfigResponsePB* resp, |
2956 | 0 | rpc::RpcContext* rpc) { |
2957 | 0 | VLOG(1) << "GetYsqlCatalogConfig request: " << req->ShortDebugString(); |
2958 | 0 | auto l = CHECK_NOTNULL(ysql_catalog_config_.get())->LockForRead(); |
2959 | 0 | resp->set_version(l->pb.ysql_catalog_config().version()); |
2960 | |
|
2961 | 0 | return Status::OK(); |
2962 | 0 | } |
2963 | | |
2964 | | Status CatalogManager::CopyPgsqlSysTables(const NamespaceId& namespace_id, |
2965 | 22 | const std::vector<scoped_refptr<TableInfo>>& tables) { |
2966 | 22 | const uint32_t database_oid = CHECK_RESULT(GetPgsqlDatabaseOid(namespace_id)); |
2967 | 22 | vector<TableId> source_table_ids; |
2968 | 22 | vector<TableId> target_table_ids; |
2969 | 2.68k | for (const auto& table : tables) { |
2970 | 2.68k | CreateTableRequestPB table_req; |
2971 | 2.68k | CreateTableResponsePB table_resp; |
2972 | | |
2973 | 2.68k | const uint32_t table_oid = VERIFY_RESULT(GetPgsqlTableOid(table->id())); |
2974 | 2.68k | const TableId table_id = GetPgsqlTableId(database_oid, table_oid); |
2975 | | |
2976 | | // Hold read lock until rows from the table are copied also. |
2977 | 2.68k | auto l = table->LockForRead(); |
2978 | | |
2979 | | // Skip shared table. |
2980 | 2.68k | if (l->pb.is_pg_shared_table()) { |
2981 | 420 | continue; |
2982 | 420 | } |
2983 | | |
2984 | 2.26k | table_req.set_name(l->pb.name()); |
2985 | 2.26k | table_req.mutable_namespace_()->set_id(namespace_id); |
2986 | 2.26k | table_req.set_table_type(PGSQL_TABLE_TYPE); |
2987 | 2.26k | table_req.mutable_schema()->CopyFrom(l->schema()); |
2988 | 2.26k | table_req.set_is_pg_catalog_table(true); |
2989 | 2.26k | table_req.set_table_id(table_id); |
2990 | | |
2991 | 2.26k | if (IsIndex(l->pb)) { |
2992 | 1.00k | const uint32_t indexed_table_oid = |
2993 | 1.00k | VERIFY_RESULT(GetPgsqlTableOid(GetIndexedTableId(l->pb))); |
2994 | 1.00k | const TableId indexed_table_id = GetPgsqlTableId(database_oid, indexed_table_oid); |
2995 | | |
2996 | | // Set index_info. |
2997 | | // Previously created INDEX wouldn't have the attribute index_info. |
2998 | 1.00k | if (l->pb.has_index_info()) { |
2999 | 1.00k | table_req.mutable_index_info()->CopyFrom(l->pb.index_info()); |
3000 | 1.00k | table_req.mutable_index_info()->set_indexed_table_id(indexed_table_id); |
3001 | 1.00k | } |
3002 | | |
3003 | | // Set deprecated field for index_info. |
3004 | 1.00k | table_req.set_indexed_table_id(indexed_table_id); |
3005 | 1.00k | table_req.set_is_local_index(PROTO_GET_IS_LOCAL(l->pb)); |
3006 | 1.00k | table_req.set_is_unique_index(PROTO_GET_IS_UNIQUE(l->pb)); |
3007 | 1.00k | } |
3008 | | |
3009 | 2.26k | auto s = CreateYsqlSysTable(&table_req, &table_resp); |
3010 | 2.26k | if (!s.ok()) { |
3011 | 1 | return s.CloneAndPrepend(Substitute( |
3012 | 1 | "Failure when creating PGSQL System Tables: $0", table_resp.error().ShortDebugString())); |
3013 | 1 | } |
3014 | | |
3015 | 2.26k | source_table_ids.push_back(table->id()); |
3016 | 2.26k | target_table_ids.push_back(table_id); |
3017 | 2.26k | } |
3018 | 21 | RETURN_NOT_OK( |
3019 | 21 | sys_catalog_->CopyPgsqlTables(source_table_ids, target_table_ids, leader_ready_term())); |
3020 | 21 | return Status::OK(); |
3021 | 21 | } |
3022 | | |
3023 | 718 | size_t CatalogManager::GetNumLiveTServersForPlacement(const PlacementId& placement_id) { |
3024 | 718 | BlacklistSet blacklist = BlacklistSetFromPB(); |
3025 | 718 | TSDescriptorVector ts_descs; |
3026 | 718 | master_->ts_manager()->GetAllLiveDescriptorsInCluster(&ts_descs, placement_id, blacklist); |
3027 | 718 | return ts_descs.size(); |
3028 | 718 | } |
3029 | | |
3030 | 116k | TSDescriptorVector CatalogManager::GetAllLiveNotBlacklistedTServers() const { |
3031 | 116k | TSDescriptorVector ts_descs; |
3032 | 116k | BlacklistSet blacklist = BlacklistSetFromPB(); |
3033 | 116k | master_->ts_manager()->GetAllLiveDescriptors(&ts_descs, blacklist); |
3034 | 116k | return ts_descs; |
3035 | 116k | } |
3036 | | |
3037 | | namespace { |
3038 | | |
3039 | 306k | size_t GetNumReplicasFromPlacementInfo(const PlacementInfoPB& placement_info) { |
3040 | 306k | return placement_info.num_replicas() > 0 ? |
3041 | 301k | placement_info.num_replicas() : FLAGS_replication_factor; |
3042 | 306k | } |
3043 | | |
3044 | | Status CheckNumReplicas(const PlacementInfoPB& placement_info, |
3045 | | const TSDescriptorVector& ts_descs, |
3046 | | const vector<Partition>& partitions, |
3047 | 4.01k | CreateTableResponsePB* resp) { |
3048 | 4.01k | auto max_tablets = FLAGS_max_create_tablets_per_ts * ts_descs.size(); |
3049 | 4.01k | auto num_replicas = GetNumReplicasFromPlacementInfo(placement_info); |
3050 | 4.01k | if (num_replicas > 1 && max_tablets > 0 && partitions.size() > max_tablets) { |
3051 | 0 | std::string msg = Substitute("The requested number of tablets ($0) is over the permitted " |
3052 | 0 | "maximum ($1)", partitions.size(), max_tablets); |
3053 | 0 | Status s = STATUS(InvalidArgument, msg); |
3054 | 0 | LOG(WARNING) << msg; |
3055 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::TOO_MANY_TABLETS, s); |
3056 | 0 | } |
3057 | | |
3058 | 4.01k | return Status::OK(); |
3059 | 4.01k | } |
3060 | | |
3061 | | } // namespace |
3062 | | |
3063 | | // Create a new table. |
3064 | | // See README file in this directory for a description of the design. |
3065 | | Status CatalogManager::CreateTable(const CreateTableRequestPB* orig_req, |
3066 | | CreateTableResponsePB* resp, |
3067 | 3.65k | rpc::RpcContext* rpc) { |
3068 | 0 | DVLOG(3) << __PRETTY_FUNCTION__ << " Begin. " << orig_req->DebugString(); |
3069 | | |
3070 | 3.65k | const bool is_pg_table = orig_req->table_type() == PGSQL_TABLE_TYPE; |
3071 | 3.65k | const bool is_pg_catalog_table = is_pg_table && orig_req->is_pg_catalog_table(); |
3072 | 3.65k | if (!is_pg_catalog_table || !FLAGS_hide_pg_catalog_table_creation_logs) { |
3073 | 3.65k | LOG(INFO) << "CreateTable from " << RequestorString(rpc) |
3074 | 3.65k | << ":\n" << orig_req->DebugString(); |
3075 | 0 | } else { |
3076 | 0 | LOG(INFO) << "CreateTable from " << RequestorString(rpc) << ": " << orig_req->name(); |
3077 | 0 | } |
3078 | | |
3079 | 3.65k | const bool is_transactional = orig_req->schema().table_properties().is_transactional(); |
3080 | | // If this is a transactional table, we need to create the transaction status table (if it does |
3081 | | // not exist already). |
3082 | 3.65k | if (is_transactional && (!is_pg_catalog_table || !FLAGS_create_initial_sys_catalog_snapshot)) { |
3083 | 1.87k | Status s = CreateGlobalTransactionStatusTableIfNeeded(rpc); |
3084 | 1.87k | if (!s.ok()) { |
3085 | 0 | return s.CloneAndPrepend("Error while creating transaction status table"); |
3086 | 0 | } |
3087 | 1.78k | } else { |
3088 | 0 | VLOG(1) |
3089 | 0 | << "Not attempting to create a transaction status table:\n" |
3090 | 0 | << " " << EXPR_VALUE_FOR_LOG(is_transactional) << "\n " |
3091 | 0 | << " " << EXPR_VALUE_FOR_LOG(is_pg_catalog_table) << "\n " |
3092 | 0 | << " " << EXPR_VALUE_FOR_LOG(FLAGS_create_initial_sys_catalog_snapshot); |
3093 | 1.78k | } |
3094 | | |
3095 | | // If this is a transactional table and there is an associated tablespace, try to create a |
3096 | | // local transaction status table for the tablespace if there is a placement attached to it |
3097 | | // (and if it does not exist already). |
3098 | 3.65k | if (GetAtomicFlag(&FLAGS_auto_create_local_transaction_tables)) { |
3099 | 3.65k | if (is_transactional && orig_req->has_tablespace_id()) { |
3100 | 1 | const auto& tablespace_id = orig_req->tablespace_id(); |
3101 | 1 | auto tablespace_pb = VERIFY_RESULT(GetTablespaceReplicationInfoWithRetry(tablespace_id)); |
3102 | 1 | if (tablespace_pb) { |
3103 | 0 | RETURN_NOT_OK(CreateLocalTransactionStatusTableIfNeeded(rpc, tablespace_id)); |
3104 | 1 | } else { |
3105 | 0 | VLOG(1) |
3106 | 0 | << "Not attempting to create a local transaction status table: " |
3107 | 0 | << "tablespace " << EXPR_VALUE_FOR_LOG(tablespace_id) << " has no placement\n"; |
3108 | 1 | } |
3109 | 3.65k | } else { |
3110 | 0 | VLOG(1) |
3111 | 0 | << "Not attempting to create a local transaction status table:\n" |
3112 | 0 | << " " << EXPR_VALUE_FOR_LOG(is_transactional) << "\n " |
3113 | 0 | << " " << EXPR_VALUE_FOR_LOG(orig_req->has_tablespace_id()); |
3114 | 3.65k | } |
3115 | 3.65k | } |
3116 | | |
3117 | 3.65k | if (is_pg_catalog_table) { |
3118 | 0 | return CreateYsqlSysTable(orig_req, resp); |
3119 | 0 | } |
3120 | | |
3121 | 3.65k | Status s; |
3122 | 3.65k | const char* const object_type = PROTO_PTR_IS_TABLE(orig_req) ? "table" : "index"; |
3123 | | |
3124 | | // Copy the request, so we can fill in some defaults. |
3125 | 3.65k | CreateTableRequestPB req = *orig_req; |
3126 | | |
3127 | | // Lookup the namespace and verify if it exists. |
3128 | 3.65k | TRACE("Looking up namespace"); |
3129 | 3.65k | auto ns = VERIFY_RESULT(FindNamespace(req.namespace_())); |
3130 | 3.65k | bool colocated; |
3131 | 3.65k | NamespaceId namespace_id; |
3132 | 3.65k | NamespaceName namespace_name; |
3133 | 3.65k | { |
3134 | 3.65k | auto ns_lock = ns->LockForRead(); |
3135 | 3.65k | if (ns->database_type() != GetDatabaseTypeForTable(req.table_type())) { |
3136 | 0 | Status s = STATUS(NotFound, "Namespace not found"); |
3137 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s); |
3138 | 0 | } |
3139 | 3.65k | namespace_id = ns->id(); |
3140 | 3.65k | namespace_name = ns->name(); |
3141 | 3.65k | colocated = ns->colocated(); |
3142 | 3.65k | } |
3143 | | |
3144 | | // For index table, find the table info |
3145 | 3.65k | scoped_refptr<TableInfo> indexed_table; |
3146 | 3.65k | if (IsIndex(req)) { |
3147 | 548 | TRACE("Looking up indexed table"); |
3148 | 548 | indexed_table = GetTableInfo(req.indexed_table_id()); |
3149 | 548 | if (indexed_table == nullptr) { |
3150 | 0 | return STATUS_SUBSTITUTE( |
3151 | 0 | NotFound, "The indexed table $0 does not exist", req.indexed_table_id()); |
3152 | 0 | } |
3153 | | |
3154 | 548 | TRACE("Locking indexed table"); |
3155 | 548 | RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(indexed_table->LockForRead(), resp)); |
3156 | 548 | } |
3157 | | |
3158 | | // Determine if this table should be colocated. If not specified, the table should be colocated if |
3159 | | // and only if the namespace is colocated. |
3160 | 3.65k | if (!req.colocated()) { |
3161 | | // Opt out of colocation if the request says so. |
3162 | 1.25k | colocated = false; |
3163 | 2.39k | } else if (indexed_table && !indexed_table->colocated()) { |
3164 | | // Opt out of colocation if the indexed table opted out of colocation. |
3165 | 542 | colocated = false; |
3166 | 542 | } |
3167 | | |
3168 | | // TODO: If this is a colocated index table in a colocated database, convert any hash partition |
3169 | | // columns into range partition columns. This is because postgres does not know that this index |
3170 | | // table is in a colocated database. When we get to the "tablespaces" step where we store this |
3171 | | // into PG metadata, then PG will know if db/table is colocated and do the work there. |
3172 | 3.65k | if ((colocated || req.has_tablegroup_id()) && IsIndex(req)) { |
3173 | 13 | for (auto& col_pb : *req.mutable_schema()->mutable_columns()) { |
3174 | 13 | col_pb.set_is_hash_key(false); |
3175 | 13 | } |
3176 | 5 | } |
3177 | | |
3178 | | // Validate schema. |
3179 | 3.65k | Schema schema; |
3180 | 3.65k | RETURN_NOT_OK(SchemaFromPB(req.schema(), &schema)); |
3181 | 3.65k | RETURN_NOT_OK(ValidateCreateTableSchema(schema, resp)); |
3182 | | |
3183 | | // checking that referenced user-defined types (if any) exist. |
3184 | 3.65k | { |
3185 | 3.65k | SharedLock lock(mutex_); |
3186 | 14.4k | for (size_t i = 0; i < schema.num_columns(); i++) { |
3187 | 72 | for (const auto &udt_id : schema.column(i).type()->GetUserDefinedTypeIds()) { |
3188 | 72 | if (FindPtrOrNull(udtype_ids_map_, udt_id) == nullptr) { |
3189 | 0 | Status s = STATUS(InvalidArgument, "Referenced user-defined type not found"); |
3190 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
3191 | 0 | } |
3192 | 72 | } |
3193 | 10.7k | } |
3194 | 3.65k | } |
3195 | | // TODO (ENG-1860) The referenced namespace and types retrieved/checked above could be deleted |
3196 | | // some time between this point and table creation below. |
3197 | | |
3198 | | // Usually the column ids are available if it's called on the backup-restoring code path |
3199 | | // (from CatalogManager::RecreateTable). Else the column ids must be empty in the client schema. |
3200 | 3.65k | if (!schema.has_column_ids()) { |
3201 | 3.65k | schema.InitColumnIdsByDefault(); |
3202 | 3.65k | } |
3203 | | |
3204 | 3.65k | if (schema.table_properties().HasCopartitionTableId()) { |
3205 | 0 | return CreateCopartitionedTable(req, resp, rpc, schema, ns); |
3206 | 0 | } |
3207 | | |
3208 | 3.65k | if (colocated || req.has_tablegroup_id()) { |
3209 | | // If the table is colocated, then there should be no hash partition columns. |
3210 | | // Do the same for tables that are being placed in tablegroups. |
3211 | 28 | if (schema.num_hash_key_columns() > 0) { |
3212 | 1 | Status s = STATUS(InvalidArgument, "Cannot colocate hash partitioned table"); |
3213 | 1 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
3214 | 1 | } |
3215 | 3.62k | } else if ( |
3216 | 3.62k | !req.partition_schema().has_hash_schema() && !req.partition_schema().has_range_schema()) { |
3217 | | // If neither hash nor range schema have been specified by the protobuf request, we assume the |
3218 | | // table uses a hash schema, and we use the table_type and hash_key to determine the hashing |
3219 | | // scheme (redis or multi-column) that should be used. |
3220 | 2.21k | if (req.table_type() == REDIS_TABLE_TYPE) { |
3221 | 107 | req.mutable_partition_schema()->set_hash_schema(PartitionSchemaPB::REDIS_HASH_SCHEMA); |
3222 | 2.10k | } else if (schema.num_hash_key_columns() > 0) { |
3223 | 2.10k | req.mutable_partition_schema()->set_hash_schema(PartitionSchemaPB::MULTI_COLUMN_HASH_SCHEMA); |
3224 | 0 | } else { |
3225 | 0 | Status s = STATUS(InvalidArgument, "Unknown table type or partitioning method"); |
3226 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
3227 | 0 | } |
3228 | 3.65k | } |
3229 | | |
3230 | | // Verify that custom placement policy has not been specified for colocated table. |
3231 | 3.65k | const bool is_replication_info_set = IsReplicationInfoSet(req.replication_info()); |
3232 | 3.65k | if (is_replication_info_set && colocated) { |
3233 | 0 | Status s = STATUS(InvalidArgument, "Custom placement policy should not be set for " |
3234 | 0 | "colocated tables"); |
3235 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_TABLE_REPLICATION_INFO, s); |
3236 | 0 | } |
3237 | | |
3238 | 3.65k | if (is_replication_info_set && req.table_type() == PGSQL_TABLE_TYPE) { |
3239 | 0 | const Status s = STATUS(InvalidArgument, "Cannot set placement policy for YSQL tables " |
3240 | 0 | "use Tablespaces instead"); |
3241 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s); |
3242 | 0 | } |
3243 | | |
3244 | | // Get placement info. |
3245 | 3.65k | const ReplicationInfoPB& replication_info = VERIFY_RESULT( |
3246 | 3.65k | GetTableReplicationInfo(req.replication_info(), req.tablespace_id())); |
3247 | 3.65k | const PlacementInfoPB& placement_info = replication_info.live_replicas(); |
3248 | | |
3249 | | // Calculate number of tablets to be used. Priorities: |
3250 | | // 1. Use Internally specified value from 'CreateTableRequestPB::num_tablets'. |
3251 | | // 2. Use User specified value from |
3252 | | // 'CreateTableRequestPB::SchemaPB::TablePropertiesPB::num_tablets'. |
3253 | | // Note, that the number will be saved in schema stored in the master persistent |
3254 | | // SysCatalog irrespective of which way we choose the number of tablets to create. |
3255 | | // If nothing is specified in this field, nothing will be stored in the table |
3256 | | // TablePropertiesPB for number of tablets |
3257 | | // 3. Calculate own value. |
3258 | 3.65k | int num_tablets = 0; |
3259 | 3.65k | if (req.has_num_tablets()) { |
3260 | 2.93k | num_tablets = req.num_tablets(); // Internal request. |
3261 | 2.93k | } |
3262 | | |
3263 | 3.65k | if (num_tablets <= 0 && schema.table_properties().HasNumTablets()) { |
3264 | 503 | num_tablets = schema.table_properties().num_tablets(); // User request. |
3265 | 503 | } |
3266 | | |
3267 | 3.65k | if (num_tablets <= 0) { |
3268 | | // Use default as client could have gotten the value before any tserver had heartbeated |
3269 | | // to (a new) master leader. |
3270 | 215 | const auto num_live_tservers = |
3271 | 215 | GetNumLiveTServersForPlacement(placement_info.placement_uuid()); |
3272 | 215 | num_tablets = narrow_cast<int>( |
3273 | 3 | num_live_tservers * (is_pg_table ? FLAGS_ysql_num_shards_per_tserver |
3274 | 212 | : FLAGS_yb_num_shards_per_tserver)); |
3275 | 215 | LOG(INFO) << "Setting default tablets to " << num_tablets << " with " |
3276 | 215 | << num_live_tservers << " primary servers"; |
3277 | 215 | } |
3278 | | |
3279 | | // Create partitions. |
3280 | 3.65k | PartitionSchema partition_schema; |
3281 | 3.65k | vector<Partition> partitions; |
3282 | 3.65k | if (colocated || req.has_tablegroup_id()) { |
3283 | 27 | RETURN_NOT_OK(partition_schema.CreatePartitions(1, &partitions)); |
3284 | 27 | req.clear_partition_schema(); |
3285 | 27 | num_tablets = 1; |
3286 | 3.62k | } else { |
3287 | 3.62k | RETURN_NOT_OK(PartitionSchema::FromPB(req.partition_schema(), schema, &partition_schema)); |
3288 | 3.62k | if (req.partitions_size() > 0) { |
3289 | 0 | if (req.partitions_size() != num_tablets) { |
3290 | 0 | Status s = STATUS(InvalidArgument, "Partitions are not defined for all tablets"); |
3291 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
3292 | 0 | } |
3293 | 0 | string last; |
3294 | 0 | for (const auto& p : req.partitions()) { |
3295 | 0 | Partition np; |
3296 | 0 | Partition::FromPB(p, &np); |
3297 | 0 | if (np.partition_key_start() != last) { |
3298 | 0 | Status s = STATUS(InvalidArgument, |
3299 | 0 | "Partitions does not cover the full partition keyspace"); |
3300 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
3301 | 0 | } |
3302 | 0 | last = np.partition_key_end(); |
3303 | 0 | partitions.push_back(std::move(np)); |
3304 | 0 | } |
3305 | 3.62k | } else { |
3306 | | // Supplied number of partitions is merely a suggestion, actual number of |
3307 | | // created partitions might differ. |
3308 | 3.62k | RETURN_NOT_OK(partition_schema.CreatePartitions(num_tablets, &partitions)); |
3309 | 3.62k | } |
3310 | | // The vector 'partitions' contains real setup partitions, so the variable |
3311 | | // should be updated. |
3312 | 3.62k | num_tablets = narrow_cast<int>(partitions.size()); |
3313 | 3.62k | } |
3314 | | |
3315 | 3.64k | TSDescriptorVector all_ts_descs; |
3316 | 3.64k | master_->ts_manager()->GetAllLiveDescriptors(&all_ts_descs); |
3317 | 3.64k | RETURN_NOT_OK(CheckNumReplicas(placement_info, all_ts_descs, partitions, resp)); |
3318 | | |
3319 | 3.64k | if (!FLAGS_TEST_skip_placement_validation_createtable_api) { |
3320 | 3.64k | ValidateReplicationInfoRequestPB validate_req; |
3321 | 3.64k | validate_req.mutable_replication_info()->CopyFrom(replication_info); |
3322 | 3.64k | ValidateReplicationInfoResponsePB validate_resp; |
3323 | 3.64k | RETURN_NOT_OK(ValidateReplicationInfo(&validate_req, &validate_resp)); |
3324 | 3.64k | } |
3325 | | |
3326 | 3.64k | LOG(INFO) << "Set number of tablets: " << num_tablets; |
3327 | 3.64k | req.set_num_tablets(num_tablets); |
3328 | | |
3329 | | // For index table, populate the index info. |
3330 | 3.64k | IndexInfoPB index_info; |
3331 | | |
3332 | 3.64k | const bool index_backfill_enabled = |
3333 | 3.64k | IsIndexBackfillEnabled(orig_req->table_type(), is_transactional); |
3334 | 3.64k | if (req.has_index_info()) { |
3335 | | // Current message format. |
3336 | 547 | index_info.CopyFrom(req.index_info()); |
3337 | | |
3338 | | // Assign column-ids that have just been computed and assigned to "index_info". |
3339 | 547 | if (!is_pg_table) { |
3340 | 0 | DCHECK_EQ(index_info.columns().size(), schema.num_columns()) |
3341 | 0 | << "Number of columns are not the same between index_info and index_schema"; |
3342 | 2.01k | for (size_t colidx = 0; colidx < schema.num_columns(); colidx++) { |
3343 | 1.62k | index_info.mutable_columns(narrow_cast<int>(colidx))->set_column_id( |
3344 | 1.62k | schema.column_id(colidx)); |
3345 | 1.62k | } |
3346 | 396 | } |
3347 | 3.10k | } else if (req.has_indexed_table_id()) { |
3348 | | // Old client message format when rolling upgrade (Not having "index_info"). |
3349 | 0 | IndexInfoBuilder index_info_builder(&index_info); |
3350 | 0 | index_info_builder.ApplyProperties(req.indexed_table_id(), |
3351 | 0 | req.is_local_index(), req.is_unique_index()); |
3352 | 0 | if (orig_req->table_type() != PGSQL_TABLE_TYPE) { |
3353 | 0 | Schema indexed_schema; |
3354 | 0 | RETURN_NOT_OK(indexed_table->GetSchema(&indexed_schema)); |
3355 | 0 | RETURN_NOT_OK(index_info_builder.ApplyColumnMapping(indexed_schema, schema)); |
3356 | 0 | } |
3357 | 0 | } |
3358 | | |
3359 | 3.64k | if ((req.has_index_info() || req.has_indexed_table_id()) && |
3360 | 547 | index_backfill_enabled && |
3361 | 488 | !req.skip_index_backfill()) { |
3362 | | // Start off the index table with major compactions disabled. We need this to retain the delete |
3363 | | // markers until the backfill process is completed. No need to set index_permissions in the |
3364 | | // index table. |
3365 | 427 | schema.SetRetainDeleteMarkers(true); |
3366 | 427 | } |
3367 | | |
3368 | 3.64k | LOG(INFO) << "CreateTable with IndexInfo " << AsString(index_info); |
3369 | | |
3370 | 3.64k | scoped_refptr<TableInfo> table; |
3371 | 3.64k | TabletInfos tablets; |
3372 | 3.64k | bool tablets_exist; |
3373 | 3.64k | bool tablegroup_tablets_exist = false; |
3374 | | |
3375 | 3.64k | { |
3376 | 3.64k | LockGuard lock(mutex_); |
3377 | 3.64k | auto ns_lock = ns->LockForRead(); |
3378 | 3.64k | TRACE("Acquired catalog manager lock"); |
3379 | | |
3380 | 3.64k | tablets_exist = |
3381 | 3.64k | colocated && colocated_tablet_ids_map_.find(ns->id()) != colocated_tablet_ids_map_.end(); |
3382 | | // Verify that the table does not exist. |
3383 | 3.64k | table = FindPtrOrNull(table_names_map_, {namespace_id, req.name()}); |
3384 | | |
3385 | 3.64k | if (table != nullptr) { |
3386 | 4 | s = STATUS_SUBSTITUTE(AlreadyPresent, |
3387 | 4 | "Object '$0.$1' already exists", ns->name(), table->name()); |
3388 | 4 | LOG(WARNING) << "Found table: " << table->ToStringWithState() |
3389 | 4 | << ". Failed creating table with error: " |
3390 | 4 | << s.ToString() << " Request:\n" << orig_req->DebugString(); |
3391 | | // If the table already exists, we set the response table_id field to the id of the table that |
3392 | | // already exists. This is necessary because before we return the error to the client (or |
3393 | | // success in case of a "CREATE TABLE IF NOT EXISTS" request) we want to wait for the existing |
3394 | | // table to be available to receive requests. And we need the table id for that. |
3395 | 4 | resp->set_table_id(table->id()); |
3396 | 4 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s); |
3397 | 4 | } |
3398 | | |
3399 | | // Namespace state validity check: |
3400 | | // 1. Allow Namespaces that are RUNNING |
3401 | | // 2. Allow Namespaces that are PREPARING under 2 situations |
3402 | | // 2a. System Namespaces. |
3403 | | // 2b. The parent table from a Colocated Namespace. |
3404 | 3.64k | const auto parent_table_name = ns->id() + kColocatedParentTableNameSuffix; |
3405 | 3.64k | bool valid_ns_state = (ns->state() == SysNamespaceEntryPB::RUNNING) || |
3406 | 2 | (ns->state() == SysNamespaceEntryPB::PREPARING && |
3407 | 2 | (ns->name() == kSystemNamespaceName || req.name() == parent_table_name)); |
3408 | 3.64k | if (!valid_ns_state) { |
3409 | 0 | Status s = STATUS_SUBSTITUTE(TryAgain, "Invalid Namespace State ($0). Cannot create $1.$2", |
3410 | 0 | SysNamespaceEntryPB::State_Name(ns->state()), ns->name(), req.name()); |
3411 | 0 | return SetupError(resp->mutable_error(), NamespaceMasterError(ns->state()), s); |
3412 | 0 | } |
3413 | | |
3414 | | // Check whether this CREATE TABLE request which has a tablegroup_id is for a normal user table |
3415 | | // or the request to create the parent table for the tablegroup. This is done by checking the |
3416 | | // catalog manager maps. |
3417 | 3.64k | if (req.has_tablegroup_id() && |
3418 | 8 | tablegroup_tablet_ids_map_.find(ns->id()) != tablegroup_tablet_ids_map_.end() && |
3419 | 7 | tablegroup_tablet_ids_map_[ns->id()].find(req.tablegroup_id()) != |
3420 | 7 | tablegroup_tablet_ids_map_[ns->id()].end()) { |
3421 | 7 | tablegroup_tablets_exist = true; |
3422 | 7 | } |
3423 | | |
3424 | 3.64k | RETURN_NOT_OK(CreateTableInMemory( |
3425 | 3.64k | req, schema, partition_schema, namespace_id, namespace_name, partitions, &index_info, |
3426 | 3.64k | (!tablets_exist && !tablegroup_tablets_exist) ? &tablets : nullptr, resp, &table)); |
3427 | | |
3428 | | // Section is executed when a table is either the parent table or a user table in a tablegroup. |
3429 | | // It additionally sets the table metadata (and tablet metadata if this is the parent table) |
3430 | | // to have the colocated property so we can take advantage of code reuse. |
3431 | 3.64k | if (req.has_tablegroup_id()) { |
3432 | 8 | table->mutable_metadata()->mutable_dirty()->pb.set_colocated(true); |
3433 | 8 | if (tablegroup_tablets_exist) { |
3434 | | // If the table is not a tablegroup parent table, it performs a lookup for the proper tablet |
3435 | | // to place the table on as a child table. |
3436 | 7 | auto tablet = tablegroup_tablet_ids_map_[ns->id()][req.tablegroup_id()]; |
3437 | 7 | RSTATUS_DCHECK( |
3438 | 7 | tablet->colocated(), InternalError, |
3439 | 7 | "The tablet for tablegroup should be colocated."); |
3440 | 7 | tablets.push_back(tablet.get()); |
3441 | 7 | auto tablet_lock = tablet->LockForWrite(); |
3442 | 7 | tablet_lock.mutable_data()->pb.add_table_ids(table->id()); |
3443 | 7 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), tablet)); |
3444 | 7 | tablet_lock.Commit(); |
3445 | | |
3446 | 7 | tablet->mutable_metadata()->StartMutation(); |
3447 | 7 | table->AddTablet(tablet); |
3448 | 7 | tablegroup_ids_map_[req.tablegroup_id()]->AddChildTable(table->id()); |
3449 | 1 | } else { |
3450 | | // If the table is a tablegroup parent table, it creates a dummy tablet for the tablegroup |
3451 | | // along with updating the catalog manager maps. |
3452 | 1 | RSTATUS_DCHECK_EQ( |
3453 | 1 | tablets.size(), 1U, InternalError, |
3454 | 1 | "Only one tablet should be created for each tablegroup"); |
3455 | 1 | tablets[0]->mutable_metadata()->mutable_dirty()->pb.set_colocated(true); |
3456 | | // Update catalog manager maps for tablegroups |
3457 | 1 | tablegroup_tablet_ids_map_[ns->id()][req.tablegroup_id()] = |
3458 | 1 | tablet_map_->find(tablets[0]->id())->second; |
3459 | 1 | } |
3460 | 3.63k | } else if (colocated) { |
3461 | 19 | table->mutable_metadata()->mutable_dirty()->pb.set_colocated(true); |
3462 | | // if the tablet already exists, add the tablet to tablets |
3463 | 19 | if (tablets_exist) { |
3464 | 17 | auto tablet = colocated_tablet_ids_map_[ns->id()]; |
3465 | 17 | RSTATUS_DCHECK( |
3466 | 17 | tablet->colocated(), InternalError, |
3467 | 17 | "The tablet for colocated database should be colocated."); |
3468 | 17 | tablets.push_back(tablet.get()); |
3469 | 17 | auto tablet_lock = tablet->LockForWrite(); |
3470 | 17 | tablet_lock.mutable_data()->pb.add_table_ids(table->id()); |
3471 | 17 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), tablet)); |
3472 | 17 | tablet_lock.Commit(); |
3473 | | |
3474 | 17 | tablet->mutable_metadata()->StartMutation(); |
3475 | 17 | table->AddTablet(tablet); |
3476 | 2 | } else { // Record the tablet |
3477 | 2 | RSTATUS_DCHECK_EQ( |
3478 | 2 | tablets.size(), 1U, InternalError, |
3479 | 2 | "Only one tablet should be created for each colocated database"); |
3480 | 2 | tablets[0]->mutable_metadata()->mutable_dirty()->pb.set_colocated(true); |
3481 | 2 | colocated_tablet_ids_map_[ns->id()] = tablet_map_->find(tablets[0]->id())->second; |
3482 | 2 | } |
3483 | 19 | } |
3484 | 3.64k | if (req.has_matview_pg_table_id()) { |
3485 | 0 | matview_pg_table_ids_map_[req.table_id()] = req.matview_pg_table_id(); |
3486 | 0 | } |
3487 | 3.64k | } |
3488 | | |
3489 | | // For create transaction table requests with tablespace id, save the tablespace id. |
3490 | 3.64k | const auto is_transaction_status_table = |
3491 | 3.64k | orig_req->table_type() == TableType::TRANSACTION_STATUS_TABLE_TYPE; |
3492 | 3.64k | if (is_transaction_status_table && req.has_tablespace_id()) { |
3493 | 0 | table->mutable_metadata()->mutable_dirty()->pb.set_transaction_table_tablespace_id( |
3494 | 0 | req.tablespace_id()); |
3495 | 0 | } |
3496 | | |
3497 | | // Tables with a transaction should be rolled back if the transaction does not get committed. |
3498 | | // Store this on the table persistent state until the transaction has been a verified success. |
3499 | 3.64k | TransactionMetadata txn; |
3500 | 3.64k | if (req.has_transaction() && FLAGS_enable_transactional_ddl_gc) { |
3501 | 1.41k | table->mutable_metadata()->mutable_dirty()->pb.mutable_transaction()-> |
3502 | 1.41k | CopyFrom(req.transaction()); |
3503 | 1.41k | txn = VERIFY_RESULT(TransactionMetadata::FromPB(req.transaction())); |
3504 | 1.41k | RSTATUS_DCHECK(!txn.status_tablet.empty(), Corruption, "Given incomplete Transaction"); |
3505 | 1.41k | } |
3506 | | |
3507 | 3.64k | if (PREDICT_FALSE(FLAGS_TEST_simulate_slow_table_create_secs > 0) && |
3508 | 11 | req.table_type() != TableType::TRANSACTION_STATUS_TABLE_TYPE) { |
3509 | 8 | LOG(INFO) << "Simulating slow table creation"; |
3510 | 8 | SleepFor(MonoDelta::FromSeconds(FLAGS_TEST_simulate_slow_table_create_secs)); |
3511 | 8 | } |
3512 | | |
3513 | | // NOTE: the table and tablets are already locked for write at this point, |
3514 | | // since the CreateTableInfo/CreateTabletInfo functions leave them in that state. |
3515 | | // They will get committed at the end of this function. |
3516 | | // Sanity check: the tables and tablets should all be in "preparing" state. |
3517 | 3.64k | CHECK_EQ(SysTablesEntryPB::PREPARING, table->metadata().dirty().pb.state()); |
3518 | | // Update the on-disk table state to "running". |
3519 | 3.64k | table->mutable_metadata()->mutable_dirty()->pb.set_state(SysTablesEntryPB::RUNNING); |
3520 | 3.64k | TRACE("Inserted new table and tablet info into CatalogManager maps"); |
3521 | 0 | VLOG_WITH_PREFIX(1) << "Inserted new table and tablet info into CatalogManager maps"; |
3522 | | |
3523 | 3.64k | if (!tablets_exist && !tablegroup_tablets_exist) { |
3524 | | // Write Tablets to sys-tablets (in "preparing" state). |
3525 | 27.1k | for (const auto& tablet : tablets) { |
3526 | 27.1k | CHECK_EQ(SysTabletsEntryPB::PREPARING, tablet->metadata().dirty().pb.state()); |
3527 | 27.1k | } |
3528 | 3.62k | } |
3529 | | |
3530 | 3.64k | s = sys_catalog_->Upsert(leader_ready_term(), table, tablets); |
3531 | 3.64k | if (PREDICT_FALSE(!s.ok())) { |
3532 | 3 | return AbortTableCreation( |
3533 | 3 | table.get(), tablets, s.CloneAndPrepend("An error occurred while inserting to sys-tablets"), |
3534 | 3 | resp); |
3535 | 3 | } |
3536 | 3.64k | TRACE("Wrote table and tablets to system table"); |
3537 | | |
3538 | | // For index table, insert index info in the indexed table. |
3539 | 3.64k | if ((req.has_index_info() || req.has_indexed_table_id())) { |
3540 | 542 | if (index_backfill_enabled && !req.skip_index_backfill()) { |
3541 | 422 | if (is_pg_table) { |
3542 | | // YSQL: start at some permission before backfill. The real enforcement happens with |
3543 | | // pg_index system table's indislive and indisready columns. Choose WRITE_AND_DELETE |
3544 | | // because it will probably be less confusing. |
3545 | 89 | index_info.set_index_permissions(INDEX_PERM_WRITE_AND_DELETE); |
3546 | 333 | } else { |
3547 | | // YCQL |
3548 | 333 | index_info.set_index_permissions(INDEX_PERM_DELETE_ONLY); |
3549 | 333 | } |
3550 | 422 | } |
3551 | 542 | s = AddIndexInfoToTable(indexed_table, index_info, resp); |
3552 | 542 | if (PREDICT_FALSE(!s.ok())) { |
3553 | 0 | return AbortTableCreation( |
3554 | 0 | table.get(), tablets, s.CloneAndPrepend("An error occurred while inserting index info"), |
3555 | 0 | resp); |
3556 | 0 | } |
3557 | 3.64k | } |
3558 | | |
3559 | | // Commit the in-memory state. |
3560 | 3.64k | table->mutable_metadata()->CommitMutation(); |
3561 | | |
3562 | 27.1k | for (const auto& tablet : tablets) { |
3563 | 27.1k | tablet->mutable_metadata()->CommitMutation(); |
3564 | 27.1k | } |
3565 | | |
3566 | 3.64k | if ((colocated && tablets_exist) || (req.has_tablegroup_id() && tablegroup_tablets_exist)) { |
3567 | 24 | auto call = |
3568 | 24 | std::make_shared<AsyncAddTableToTablet>(master_, AsyncTaskPool(), tablets[0], table); |
3569 | 24 | table->AddTask(call); |
3570 | 24 | WARN_NOT_OK(ScheduleTask(call), "Failed to send AddTableToTablet request"); |
3571 | 24 | } |
3572 | | |
3573 | 3.64k | if (req.has_creator_role_name()) { |
3574 | 242 | const NamespaceName& keyspace_name = req.namespace_().name(); |
3575 | 242 | const TableName& table_name = req.name(); |
3576 | 242 | RETURN_NOT_OK(permissions_manager_->GrantPermissions( |
3577 | 242 | req.creator_role_name(), |
3578 | 242 | get_canonical_table(keyspace_name, table_name), |
3579 | 242 | table_name, |
3580 | 242 | keyspace_name, |
3581 | 242 | all_permissions_for_resource(ResourceType::TABLE), |
3582 | 242 | ResourceType::TABLE, |
3583 | 242 | resp)); |
3584 | 242 | } |
3585 | | |
3586 | | // Verify Transaction gets committed, which occurs after table create finishes. |
3587 | 3.64k | if (req.has_transaction() && PREDICT_TRUE(FLAGS_enable_transactional_ddl_gc)) { |
3588 | 1.41k | LOG(INFO) << "Enqueuing table for Transaction Verification: " << req.name(); |
3589 | 1.41k | std::function<Status(bool)> when_done = |
3590 | 1.41k | std::bind(&CatalogManager::VerifyTablePgLayer, this, table, _1); |
3591 | 1.41k | WARN_NOT_OK(background_tasks_thread_pool_->SubmitFunc( |
3592 | 1.41k | std::bind(&YsqlTransactionDdl::VerifyTransaction, ysql_transaction_.get(), txn, when_done)), |
3593 | 1.41k | "Could not submit VerifyTransaction to thread pool"); |
3594 | 1.41k | } |
3595 | | |
3596 | 3.64k | LOG(INFO) << "Successfully created " << object_type << " " << table->ToString() << " in " |
3597 | 3.64k | << ns->ToString() << " per request from " << RequestorString(rpc); |
3598 | 3.64k | background_tasks_->Wake(); |
3599 | | |
3600 | 3.64k | if (FLAGS_master_enable_metrics_snapshotter && |
3601 | 0 | !(req.table_type() == TableType::YQL_TABLE_TYPE && |
3602 | 0 | namespace_id == kSystemNamespaceId && |
3603 | 0 | req.name() == kMetricsSnapshotsTableName)) { |
3604 | 0 | Status s = CreateMetricsSnapshotsTableIfNeeded(rpc); |
3605 | 0 | if (!s.ok()) { |
3606 | 0 | return s.CloneAndPrepend("Error while creating metrics snapshots table"); |
3607 | 0 | } |
3608 | 3.64k | } |
3609 | | |
3610 | | // Increment transaction status version if needed. |
3611 | 3.64k | if (is_transaction_status_table) { |
3612 | 492 | RETURN_NOT_OK(IncrementTransactionTablesVersion()); |
3613 | 492 | } |
3614 | | |
3615 | 0 | DVLOG(3) << __PRETTY_FUNCTION__ << " Done."; |
3616 | 3.64k | return Status::OK(); |
3617 | 3.64k | } |
3618 | | // Completion callback that CreateTable binds for YsqlTransactionDdl::VerifyTransaction: if the table's entry exists in the pg catalog (or rpc_success is false) the transaction is cleared from the table metadata and persisted; otherwise an asynchronous DeleteTable is submitted for the table. |
3619 | 1.41k | Status CatalogManager::VerifyTablePgLayer(scoped_refptr<TableInfo> table, bool rpc_success) { |
3620 | | // Upon transaction completion, look up the table's OID in the pg system table identified by kPgClassTableOid (in the table's own database) to determine whether the DDL succeeded. If matview_pg_table_ids_map_ has an entry for this table, the OID of the mapped id is looked up instead. |
3621 | 1.41k | const uint32_t database_oid = VERIFY_RESULT(GetPgsqlDatabaseOidByTableId(table->id())); |
3622 | 1.41k | const auto pg_table_id = GetPgsqlTableId(database_oid, kPgClassTableOid); |
3623 | 1.41k | auto table_storage_id = GetPgsqlTableOid(table->id()); |
3624 | 1.41k | { |
3625 | 1.41k | SharedLock lock(mutex_); |
3626 | 1.41k | if (matview_pg_table_ids_map_.find(table->id()) != matview_pg_table_ids_map_.end()) { |
3627 | 0 | table_storage_id = GetPgsqlTableOid(matview_pg_table_ids_map_[table->id()]); |
3628 | 0 | } |
3629 | 1.41k | } |
3630 | 1.41k | auto entry_exists = VERIFY_RESULT( |
3631 | 1.41k | ysql_transaction_->PgEntryExists(pg_table_id, table_storage_id)); |
3632 | 1.41k | auto l = table->LockForWrite(); |
3633 | 1.41k | auto& metadata = table->mutable_metadata()->mutable_dirty()->pb; |
3634 | | 
3635 | 1.41k | SCHECK(metadata.state() == SysTablesEntryPB::RUNNING || |
3636 | 1.41k | metadata.state() == SysTablesEntryPB::ALTERING, Aborted, |
3637 | 1.41k | Substitute("Unexpected table state ($0), abandoning transaction GC work for $1", |
3638 | 1.41k | SysTablesEntryPB_State_Name(metadata.state()), table->ToString())); |
3639 | | 
3640 | | // #5981: Mark un-retryable rpc failures as pass to avoid infinite retry of GC'd txns. Consequently the check only fails when the RPC succeeded and the pg entry is absent. |
3641 | 1.39k | const bool txn_check_passed = entry_exists || !rpc_success; |
3642 | | 
3643 | 1.39k | if (txn_check_passed) { |
3644 | | // Remove the transaction from the entry since we're done processing it, and persist the change before committing it in memory. |
3645 | 1.37k | metadata.clear_transaction(); |
3646 | 1.37k | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), table)); |
3647 | 1.37k | if (entry_exists) { |
3648 | 1.37k | LOG_WITH_PREFIX(INFO) << "Table transaction succeeded: " << table->ToString(); |
3649 | 0 | } else { |
3650 | 0 | LOG_WITH_PREFIX(WARNING) |
3651 | 0 | << "Unknown RPC failure, removing transaction on table: " << table->ToString(); |
3652 | 0 | } |
3653 | | // Commit the in-memory state. |
3654 | 1.37k | l.Commit(); |
3655 | 23 | } else { |
3656 | 23 | LOG(INFO) << "Table transaction failed, deleting: " << table->ToString(); |
3657 | | // Enqueue the delete asynchronously on background_tasks_thread_pool_; the request is captured by value and a failed DeleteTable is only logged (WARN_NOT_OK). |
3658 | 23 | DeleteTableRequestPB del_tbl_req; |
3659 | 23 | del_tbl_req.mutable_table()->set_table_name(table->name()); |
3660 | 23 | del_tbl_req.mutable_table()->set_table_id(table->id()); |
3661 | 23 | del_tbl_req.set_is_index_table(table->is_index()); |
3662 | | 
3663 | 23 | RETURN_NOT_OK(background_tasks_thread_pool_->SubmitFunc( [this, del_tbl_req]() { |
3664 | 23 | DeleteTableResponsePB del_tbl_resp; |
3665 | 23 | WARN_NOT_OK(DeleteTable(&del_tbl_req, &del_tbl_resp, nullptr), |
3666 | 23 | "Failed to Delete Table with failed transaction"); |
3667 | 23 | })); |
3668 | 23 | } |
3669 | 1.39k | return Status::OK(); |
3670 | 1.39k | } |
3671 | | // Creates one TabletInfo per entry of `partitions` for `table`, attaches them to the table, registers each in tablet_map_ under its tablet id, and returns the created tablets. |
3672 | | Result<TabletInfos> CatalogManager::CreateTabletsFromTable(const vector<Partition>& partitions, |
3673 | 29.3k | const TableInfoPtr& table) { |
3674 | 29.3k | TabletInfos tablets; |
3675 | | // Create the TabletInfo objects in state PREPARING (each partition is first serialized to a PartitionPB for CreateTabletInfo). |
3676 | 53.7k | for (const Partition& partition : partitions) { |
3677 | 53.7k | PartitionPB partition_pb; |
3678 | 53.7k | partition.ToPB(&partition_pb); |
3679 | 53.7k | tablets.push_back(CreateTabletInfo(table.get(), partition_pb)); |
3680 | 53.7k | } |
3681 | | 
3682 | | // Add the table/tablets to the in-memory map for the assignment. InsertOrDie is used for the tablet map, so a duplicate tablet id is presumably fatal - confirm against InsertOrDie. |
3683 | 29.3k | table->AddTablets(tablets); |
3684 | 29.3k | auto tablet_map_checkout = tablet_map_.CheckOut(); |
3685 | 53.7k | for (const TabletInfoPtr& tablet : tablets) { |
3686 | 53.7k | InsertOrDie(tablet_map_checkout.get_ptr(), tablet->tablet_id(), tablet); |
3687 | 53.7k | } |
3688 | | 
3689 | 29.3k | return tablets; |
3690 | 29.3k | } |
3691 | | // Validates `placement_info` against the tablet servers in `ts_descs`: enough tservers for a quorum (num_replicas / 2 + 1), cloud info present on every placement block, per-block minimums not exceeding num_replicas, and a quorum of replicas placeable within the requested blocks. On a failed check the error is recorded in resp via SetupError and its result is returned; otherwise returns OK. |
3692 | | Status CatalogManager::CheckValidPlacementInfo(const PlacementInfoPB& placement_info, |
3693 | | const TSDescriptorVector& ts_descs, |
3694 | 32.3k | ValidateReplicationInfoResponsePB* resp) { |
3695 | 32.3k | size_t num_live_tservers = ts_descs.size(); |
3696 | 32.3k | size_t num_replicas = GetNumReplicasFromPlacementInfo(placement_info); |
3697 | 32.3k | Status s; |
3698 | 32.3k | string msg; |
3699 | | 
3700 | | // Verify that the number of replicas isn't larger than the required number of live tservers. |
3701 | | // To ensure quorum, we need n/2 + 1 live tservers. This check is skipped when FLAGS_catalog_manager_check_ts_count_for_create_table is false. |
3702 | 32.3k | size_t replica_quorum_needed = num_replicas / 2 + 1; |
3703 | 32.3k | if (FLAGS_catalog_manager_check_ts_count_for_create_table && |
3704 | 31.9k | replica_quorum_needed > num_live_tservers) { |
3705 | 2 | msg = Substitute("Not enough live tablet servers to create table with replication factor $0. " |
3706 | 2 | "Need at least $1 tablet servers whereas $2 are alive.", |
3707 | 2 | num_replicas, replica_quorum_needed, num_live_tservers); |
3708 | 2 | LOG(WARNING) << msg |
3709 | 2 | << ". Placement info: " << placement_info.ShortDebugString() |
3710 | 2 | << ", replication factor flag: " << FLAGS_replication_factor; |
3711 | 2 | s = STATUS(InvalidArgument, msg); |
3712 | 2 | return SetupError(resp->mutable_error(), MasterErrorPB::REPLICATION_FACTOR_TOO_HIGH, s); |
3713 | 2 | } |
3714 | | 
3715 | | // Verify that placement requests are reasonable. All remaining checks apply only when placement blocks were specified. |
3716 | 32.3k | if (!placement_info.placement_blocks().empty()) { |
3717 | 107 | size_t minimum_sum = 0; |
3718 | 242 | for (const auto& pb : placement_info.placement_blocks()) { |
3719 | 242 | minimum_sum += pb.min_num_replicas(); |
3720 | 242 | if (!pb.has_cloud_info()) { |
3721 | 1 | msg = Substitute("Got placement info without cloud info set: $0", pb.ShortDebugString()); |
3722 | 1 | s = STATUS(InvalidArgument, msg); |
3723 | 1 | LOG(WARNING) << msg; |
3724 | 1 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
3725 | 1 | } |
3726 | 242 | } |
3727 | | // Total replicas requested should be at least the sum of minimums |
3728 | | // requested in individual placement blocks. |
3729 | 106 | if (minimum_sum > num_replicas) { |
3730 | 1 | msg = Substitute("Sum of minimum replicas per placement ($0) is greater than num_replicas " |
3731 | 1 | " ($1)", minimum_sum, num_replicas); |
3732 | 1 | s = STATUS(InvalidArgument, msg); |
3733 | 1 | LOG(WARNING) << msg; |
3734 | 1 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
3735 | 1 | } |
3736 | | 
3737 | | // Verify that there are enough TServers in the requested placements |
3738 | | // to match the total required replication factor. |
3739 | 105 | auto allowed_ts = VERIFY_RESULT(FindTServersForPlacementInfo(placement_info, ts_descs)); |
3740 | | 
3741 | | // Fail if we don't have enough tablet servers in the areas requested. |
3742 | | // We need n/2 + 1 for quorum. |
3743 | 105 | if (allowed_ts.size() < replica_quorum_needed) { |
3744 | 1 | msg = Substitute("Not enough tablet servers in the requested placements. " |
3745 | 1 | "Need at least $0, have $1", |
3746 | 1 | replica_quorum_needed, allowed_ts.size()); |
3747 | 1 | s = STATUS(InvalidArgument, msg); |
3748 | 1 | LOG(WARNING) << msg; |
3749 | 1 | return SetupError(resp->mutable_error(), MasterErrorPB::REPLICATION_FACTOR_TOO_HIGH, s); |
3750 | 1 | } |
3751 | | 
3752 | | // Try allocating tservers for the replicas and see if we can place a quorum |
3753 | | // number of replicas. |
3754 | | // Essentially, the logic is: |
3755 | | // 1. We satisfy whatever we can from the minimums. |
3756 | | // 2. We then satisfy whatever we can from the slack. |
3757 | | // Here it doesn't matter where we put the slack replicas as long as |
3758 | | // the tservers are chosen from any of the valid placement blocks. |
3759 | | // Overall, if in this process we are able to place n/2 + 1 replicas |
3760 | | // then we succeed otherwise we fail. |
3761 | 104 | size_t total_extra_replicas = num_replicas - minimum_sum; |
3762 | 104 | size_t total_feasible_replicas = 0; |
3763 | 104 | size_t total_extra_servers = 0; |
3764 | 239 | for (const auto& pb : placement_info.placement_blocks()) { |
3765 | 239 | auto allowed_ts = VERIFY_RESULT(FindTServersForPlacementBlock(pb, ts_descs)); |
3766 | 239 | size_t allowed_ts_size = allowed_ts.size(); |
3767 | 239 | size_t min_num_replicas = pb.min_num_replicas(); |
3768 | | // For every placement block, we can only satisfy up to the number of |
3769 | | // tservers present in that particular placement block. |
3770 | 239 | total_feasible_replicas += min(allowed_ts_size, min_num_replicas); |
3771 | | // Extra tablet servers beyond min_num_replicas will be used to place |
3772 | | // the extra replicas over and above the minimums. |
3773 | 239 | if (allowed_ts_size > min_num_replicas) { |
3774 | 168 | total_extra_servers += allowed_ts_size - min_num_replicas; |
3775 | 168 | } |
3776 | 239 | } |
3777 | | // The total number of extra replicas that we can put cannot be more than |
3778 | | // the total tablet servers that are extra. |
3779 | 104 | total_feasible_replicas += min(total_extra_replicas, total_extra_servers); |
3780 | | 
3781 | | // If we place the replicas in accordance with above, we should be able to place |
3782 | | // at least replica_quorum_needed otherwise we fail. |
3783 | 104 | if (total_feasible_replicas < replica_quorum_needed) { |
3784 | 1 | msg = Substitute("Not enough tablet servers in the requested placements. " |
3785 | 1 | "Can only find $0 tablet servers for the replicas but need at least " |
3786 | 1 | "$1.", total_feasible_replicas, replica_quorum_needed); |
3787 | 1 | s = STATUS(InvalidArgument, msg); |
3788 | 1 | LOG(WARNING) << msg; |
3789 | 1 | return SetupError(resp->mutable_error(), MasterErrorPB::REPLICATION_FACTOR_TOO_HIGH, s); |
3790 | 1 | } |
3791 | 32.3k | } |
3792 | | 
3793 | 32.3k | return Status::OK(); |
3794 | 32.3k | } |
3795 | | // Creates the TableInfo for `req` and registers it in the in-memory catalog maps; when `tablets` is non-null also creates the tablets for `partitions`, and when `resp` is non-null records the new table id in it. NOTE(review): the CreateTable call site invokes this with mutex_ held; presumably every caller must - confirm. |
3796 | | Status CatalogManager::CreateTableInMemory(const CreateTableRequestPB& req, |
3797 | | const Schema& schema, |
3798 | | const PartitionSchema& partition_schema, |
3799 | | const NamespaceId& namespace_id, |
3800 | | const NamespaceName& namespace_name, |
3801 | | const std::vector<Partition>& partitions, |
3802 | | IndexInfoPB* index_info, |
3803 | | TabletInfos* tablets, |
3804 | | CreateTableResponsePB* resp, |
3805 | 31.6k | scoped_refptr<TableInfo>* table) { |
3806 | | // Add the new table in "preparing" state. The result is written through the `table` out-parameter. |
3807 | 31.6k | *table = CreateTableInfo(req, schema, partition_schema, namespace_id, namespace_name, index_info); |
3808 | 31.6k | const TableId& table_id = (*table)->id(); |
3809 | | 
3810 | 0 | VLOG_WITH_PREFIX_AND_FUNC(2) |
3811 | 0 | << "Table: " << (**table).ToString() << ", create_tablets: " << (tablets ? "YES" : "NO"); |
3812 | | 
3813 | 31.6k | auto table_ids_map_checkout = table_ids_map_.CheckOut(); |
3814 | 31.6k | (*table_ids_map_checkout)[table_id] = *table; |
3815 | | // Do not add Postgres tables to the name map as the table name is not unique in a namespace. |
3816 | 31.6k | if (req.table_type() != PGSQL_TABLE_TYPE) { |
3817 | 27.9k | table_names_map_[{namespace_id, req.name()}] = *table; |
3818 | 27.9k | } |
3819 | | 
3820 | 31.6k | if (req.table_type() == TRANSACTION_STATUS_TABLE_TYPE) { |
3821 | 563 | transaction_table_ids_set_.insert(table_id); |
3822 | 563 | } |
3823 | | 
3824 | 31.6k | if (tablets) { |
3825 | 29.3k | *tablets = VERIFY_RESULT(CreateTabletsFromTable(partitions, *table)); |
3826 | 29.3k | } |
3827 | | 
3828 | 31.6k | if (resp != nullptr) { |
3829 | 6.27k | resp->set_table_id(table_id); |
3830 | 6.27k | } |
3831 | | 
3832 | 31.6k | HandleNewTableId(table_id); |
3833 | | 
3834 | 31.6k | return Status::OK(); |
3835 | 31.6k | } |
3836 | | // Builds a TableIdentifierPB from `namespace_name` and `table_name` and returns DoesTableExist applied to the result of the FindTable lookup. |
3837 | | Result<bool> CatalogManager::TableExists( |
3838 | 2.50k | const std::string& namespace_name, const std::string& table_name) const { |
3839 | 2.50k | TableIdentifierPB table_id_pb; |
3840 | 2.50k | table_id_pb.set_table_name(table_name); |
3841 | 2.50k | table_id_pb.mutable_namespace_()->set_name(namespace_name); |
3842 | 2.50k | return DoesTableExist(FindTable(table_id_pb)); |
3843 | 2.50k | } |
3844 | | // Creates the transaction status table named req->table_name() with no tablespace id. An AlreadyPresent status is reported in resp as OBJECT_ALREADY_PRESENT and any other failure as INTERNAL_ERROR; returns OK on success. |
3845 | | CHECKED_STATUS CatalogManager::CreateTransactionStatusTable( |
3846 | | const CreateTransactionStatusTableRequestPB* req, CreateTransactionStatusTableResponsePB* resp, |
3847 | 0 | rpc::RpcContext *rpc) { |
3848 | 0 | const string& table_name = req->table_name(); |
3849 | 0 | Status s = CreateTransactionStatusTableInternal(rpc, table_name, nullptr /* tablespace_id */); |
3850 | 0 | if (s.IsAlreadyPresent()) { |
3851 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s); |
3852 | 0 | } |
3853 | 0 | if (!s.ok()) { |
3854 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INTERNAL_ERROR, s); |
3855 | 0 | } |
3856 | 0 | return Status::OK(); |
3857 | 0 | } |
3858 | | // Creates transaction status table `table_name` in the system namespace by issuing an internal CreateTable request, tagged with *tablespace_id when that pointer is non-null. Returns AlreadyPresent if TableExists reports the table up front; an AlreadyPresent coming back from CreateTable itself is treated as success. |
3859 | | CHECKED_STATUS CatalogManager::CreateTransactionStatusTableInternal( |
3860 | 2.34k | rpc::RpcContext *rpc, const string& table_name, const TablespaceId* tablespace_id) { |
3861 | 2.34k | if (VERIFY_RESULT(TableExists(kSystemNamespaceName, table_name))) { |
3862 | 1.78k | return STATUS_SUBSTITUTE(AlreadyPresent, "Table already exists: $0", table_name); |
3863 | 1.78k | } |
3864 | | 
3865 | 565 | LOG(INFO) << "Creating transaction status table " << table_name; |
3866 | | // Set up a CreateTable request internally. |
3867 | 565 | CreateTableRequestPB req; |
3868 | 565 | CreateTableResponsePB resp; |
3869 | 565 | req.set_name(table_name); |
3870 | 565 | req.mutable_namespace_()->set_name(kSystemNamespaceName); |
3871 | 565 | req.set_table_type(TableType::TRANSACTION_STATUS_TABLE_TYPE); |
3872 | 565 | if (tablespace_id) { |
3873 | 0 | req.set_tablespace_id(*tablespace_id); |
3874 | 0 | } |
3875 | | 
3876 | | // Use FLAGS_transaction_table_num_tablets when it is positive; otherwise derive the tablet count from |
3877 | | // the number of live tservers in the cluster's live placement times FLAGS_transaction_table_num_tablets_per_tserver. |
3878 | 565 | int num_tablets; |
3879 | 565 | if (FLAGS_transaction_table_num_tablets > 0) { |
3880 | 71 | num_tablets = FLAGS_transaction_table_num_tablets; |
3881 | 494 | } else { |
3882 | 494 | auto placement_uuid = |
3883 | 494 | cluster_config_->LockForRead()->pb.replication_info().live_replicas().placement_uuid(); |
3884 | 494 | num_tablets = narrow_cast<int>(GetNumLiveTServersForPlacement(placement_uuid) * |
3885 | 494 | FLAGS_transaction_table_num_tablets_per_tserver); |
3886 | 494 | } |
3887 | 565 | req.mutable_schema()->mutable_table_properties()->set_num_tablets(num_tablets); |
3888 | | 
3889 | 565 | ColumnSchema hash(kRedisKeyColumnName, BINARY, /* is_nullable */ false, /* is_hash_key */ true); |
3890 | 565 | ColumnSchemaToPB(hash, req.mutable_schema()->mutable_columns()->Add()); |
3891 | | 
3892 | 565 | Status s = CreateTable(&req, &resp, rpc); |
3893 | | // We do not lock here so it is technically possible that the table was already created. |
3894 | | // If so, there is nothing to do so we just ignore the "AlreadyPresent" error. |
3895 | 565 | if (!s.ok() && !s.IsAlreadyPresent()) { |
3896 | 2 | return s; |
3897 | 2 | } |
3898 | | 
3899 | 563 | return Status::OK(); |
3900 | 563 | } |
3901 | | // Returns true if any table listed in transaction_table_ids_set_ reports (via GetTransactionStatusTableTablespace) a tablespace equal to `tablespace_id`. The scan runs under a SharedLock on mutex_; an id missing from table_ids_map_ is logged with DFATAL and skipped. |
3902 | 0 | bool CatalogManager::DoesTransactionTableExistForTablespace(const TablespaceId& tablespace_id) { |
3903 | 0 | SharedLock lock(mutex_); |
3904 | 0 | for (const auto& table_id : transaction_table_ids_set_) { |
3905 | 0 | auto table = table_ids_map_->find(table_id); |
3906 | 0 | if (table == table_ids_map_->end()) { |
3907 | 0 | LOG(DFATAL) << "Table uuid " << table_id |
3908 | 0 | << " in transaction_table_ids_set_ but not in table_ids_map_"; |
3909 | 0 | continue; |
3910 | 0 | } |
3911 | 0 | auto this_tablespace_id = GetTransactionStatusTableTablespace(table->second); |
3912 | 0 | if (this_tablespace_id && *this_tablespace_id == tablespace_id) { |
3913 | 0 | return true; |
3914 | 0 | } |
3915 | 0 | } |
3916 | 0 | return false; |
3917 | 0 | } |
3918 | | // Creates a transaction status table for `tablespace_id` unless one already exists. The existence check and the creation both run under tablespace_transaction_table_creation_mutex_. The table name is kTransactionTablePrefix followed by the tablespace OID when FLAGS_TEST_name_transaction_tables_with_tablespace_id is set, or by a newly generated UUID otherwise. |
3919 | | CHECKED_STATUS CatalogManager::CreateLocalTransactionStatusTableIfNeeded( |
3920 | 0 | rpc::RpcContext *rpc, const TablespaceId& tablespace_id) { |
3921 | 0 | std::lock_guard<std::mutex> lock(tablespace_transaction_table_creation_mutex_); |
3922 | 

3923 | 0 | if (DoesTransactionTableExistForTablespace(tablespace_id)) { |
3924 | 0 | VLOG(1) << "Transaction status table already exists, not creating."; |
3925 | 0 | return Status::OK(); |
3926 | 0 | } |
3927 | | 
3928 | 0 | std::string table_name; |
3929 | 0 | if (FLAGS_TEST_name_transaction_tables_with_tablespace_id) { |
3930 | 0 | uint32_t tablespace_oid = VERIFY_RESULT(GetPgsqlTablespaceOid(tablespace_id)); |
3931 | 0 | table_name = kTransactionTablePrefix + std::to_string(tablespace_oid); |
3932 | 0 | } else { |
3933 | 0 | std::string uuid; |
3934 | 0 | RETURN_NOT_OK(yb::Uuid::Generate().ToString(&uuid)); |
3935 | 0 | table_name = kTransactionTablePrefix + uuid; |
3936 | 0 | } |
3937 | 

3938 | 0 | return CreateTransactionStatusTableInternal(rpc, table_name, &tablespace_id); |
3939 | 0 | } |
3940 | | // Creates the global transaction status table (kGlobalTransactionsTableName, no tablespace id) if it does not exist yet. An AlreadyPresent result is converted to OK; any other status is returned unchanged. |
3941 | 2.34k | CHECKED_STATUS CatalogManager::CreateGlobalTransactionStatusTableIfNeeded(rpc::RpcContext *rpc) { |
3942 | 2.34k | Status s = CreateTransactionStatusTableInternal( |
3943 | 2.34k | rpc, kGlobalTransactionsTableName, nullptr /* tablespace_id */); |
3944 | 2.34k | if (s.IsAlreadyPresent()) { |
3945 | 0 | VLOG(1) << "Transaction status table already exists, not creating."; |
3946 | 1.78k | return Status::OK(); |
3947 | 1.78k | } |
3948 | 565 | return s; |
3949 | 565 | } |
3950 | | |
3951 | | CHECKED_STATUS CatalogManager::GetGlobalTransactionStatusTablets( |
3952 | 2.01k | GetTransactionStatusTabletsResponsePB* resp) { |
3953 | 2.01k | TableIdentifierPB global_txn_table_identifier; |
3954 | 2.01k | global_txn_table_identifier.set_table_name(kGlobalTransactionsTableName); |
3955 | 2.01k | global_txn_table_identifier.mutable_namespace_()->set_name(kSystemNamespaceName); |
3956 | 2.01k | scoped_refptr<TableInfo> global_txn_table = VERIFY_RESULT(FindTable(global_txn_table_identifier)); |
3957 | | |
3958 | 2.01k | RETURN_NOT_OK(WaitForCreateTableToFinish(global_txn_table->id())); |
3959 | | |
3960 | 2.01k | auto l = global_txn_table->LockForRead(); |
3961 | 2.01k | RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp)); |
3962 | | |
3963 | 39.4k | for (const auto& tablet : global_txn_table->GetTablets()) { |
3964 | 39.4k | TabletLocationsPB locs_pb; |
3965 | 39.4k | RETURN_NOT_OK(BuildLocationsForTablet(tablet, &locs_pb)); |
3966 | 39.4k | resp->add_global_tablet_id(tablet->tablet_id()); |
3967 | 39.4k | } |
3968 | | |
3969 | 2.01k | return Status::OK(); |
3970 | 2.01k | } |
3971 | | |
3972 | | Result<std::vector<TableId>> CatalogManager::GetPlacementLocalTransactionStatusTables( |
3973 | 2.01k | const CloudInfoPB& placement) { |
3974 | 2.01k | std::vector<TableId> same_placement_transaction_tables; |
3975 | 2.01k | auto tablespace_manager = GetTablespaceManager(); |
3976 | | |
3977 | 2.01k | SharedLock lock(mutex_); |
3978 | 1.98k | for (const auto& table_id : transaction_table_ids_set_) { |
3979 | 1.98k | auto table = table_ids_map_->find(table_id); |
3980 | 1.98k | if (table == table_ids_map_->end()) { |
3981 | 0 | LOG(DFATAL) << "Table uuid " << table_id |
3982 | 0 | << " in transaction_table_ids_set_ but not in table_ids_map_"; |
3983 | 0 | continue; |
3984 | 0 | } |
3985 | | // system.transaction is filtered out because it cannot have a placement set. |
3986 | 1.98k | auto table_info = table->second; |
3987 | 1.98k | auto lock = table_info->LockForRead(); |
3988 | 1.98k | auto tablespace_id = GetTransactionStatusTableTablespace(table_info); |
3989 | 1.98k | auto cloud_info = lock->pb.replication_info(); |
3990 | 2.00k | if (!IsReplicationInfoSet(cloud_info)) { |
3991 | 2.00k | if (tablespace_id) { |
3992 | 0 | const auto result = tablespace_manager->GetTablespaceReplicationInfo(*tablespace_id); |
3993 | 0 | if (!result.ok() || !*result || !IsReplicationInfoSet(**result)) { |
3994 | 0 | continue; |
3995 | 0 | } |
3996 | 0 | cloud_info = **result; |
3997 | 0 | } |
3998 | 2.00k | } |
3999 | 1.98k | const auto& txn_table_replicas = cloud_info.live_replicas(); |
4000 | | // Skip transaction tables spanning multiple regions, since using them will incur global |
4001 | | // latencies. See #11268. |
4002 | 1.98k | if (CatalogManagerUtil::DoesPlacementInfoSpanMultipleRegions(txn_table_replicas)) { |
4003 | 0 | continue; |
4004 | 0 | } |
4005 | 1.98k | if (CatalogManagerUtil::DoesPlacementInfoContainCloudInfo(txn_table_replicas, placement)) { |
4006 | 0 | same_placement_transaction_tables.push_back(table_id); |
4007 | 0 | } |
4008 | 1.98k | } |
4009 | | |
4010 | 2.01k | return same_placement_transaction_tables; |
4011 | 2.01k | } |
4012 | | |
4013 | | CHECKED_STATUS CatalogManager::GetPlacementLocalTransactionStatusTablets( |
4014 | | const CloudInfoPB& placement, |
4015 | 2.01k | GetTransactionStatusTabletsResponsePB* resp) { |
4016 | 2.01k | auto same_placement_transaction_tables = VERIFY_RESULT(GetPlacementLocalTransactionStatusTables( |
4017 | 2.01k | placement)); |
4018 | | |
4019 | 2.01k | if (!same_placement_transaction_tables.empty()) { |
4020 | 0 | for (const auto& table_id : same_placement_transaction_tables) { |
4021 | 0 | RETURN_NOT_OK(WaitForCreateTableToFinish(table_id)); |
4022 | 0 | } |
4023 | |
|
4024 | 0 | SharedLock lock(mutex_); |
4025 | 0 | for (const auto& table_id : same_placement_transaction_tables) { |
4026 | 0 | if (!table_ids_map_->count(table_id)) { |
4027 | 0 | Status s = STATUS_FORMAT( |
4028 | 0 | NotFound, "Transaction table with id $0 does not exist", table_id); |
4029 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); |
4030 | 0 | } |
4031 | | |
4032 | 0 | auto& table_info = *table_ids_map_->at(table_id); |
4033 | 0 | auto lock = table_info.LockForRead(); |
4034 | 0 | for (const auto& tablet : table_info.GetTablets()) { |
4035 | 0 | TabletLocationsPB locs_pb; |
4036 | 0 | RETURN_NOT_OK(BuildLocationsForTablet(tablet, &locs_pb)); |
4037 | 0 | resp->add_placement_local_tablet_id(tablet->tablet_id()); |
4038 | 0 | } |
4039 | 0 | } |
4040 | 0 | } |
4041 | | |
4042 | 2.01k | return Status::OK(); |
4043 | 2.01k | } |
4044 | | |
4045 | | CHECKED_STATUS CatalogManager::GetTransactionStatusTablets( |
4046 | | const GetTransactionStatusTabletsRequestPB* req, |
4047 | | GetTransactionStatusTabletsResponsePB* resp, |
4048 | 2.01k | rpc::RpcContext *rpc) { |
4049 | | |
4050 | 2.01k | RETURN_NOT_OK(GetGlobalTransactionStatusTablets(resp)); |
4051 | | |
4052 | 2.01k | if (req->has_placement()) { |
4053 | 2.01k | RETURN_NOT_OK(GetPlacementLocalTransactionStatusTablets(req->placement(), resp)); |
4054 | 2.01k | } |
4055 | | |
4056 | 2.01k | return Status::OK(); |
4057 | 2.01k | } |
4058 | | |
4059 | 0 | Status CatalogManager::CreateMetricsSnapshotsTableIfNeeded(rpc::RpcContext *rpc) { |
4060 | 0 | if (VERIFY_RESULT(TableExists(kSystemNamespaceName, kMetricsSnapshotsTableName))) { |
4061 | 0 | return Status::OK(); |
4062 | 0 | } |
4063 | | |
4064 | | // Set up a CreateTable request internally. |
4065 | 0 | CreateTableRequestPB req; |
4066 | 0 | CreateTableResponsePB resp; |
4067 | 0 | req.set_name(kMetricsSnapshotsTableName); |
4068 | 0 | req.mutable_namespace_()->set_name(kSystemNamespaceName); |
4069 | 0 | req.set_table_type(TableType::YQL_TABLE_TYPE); |
4070 | | |
4071 | | // Explicitly set the number tablets if the corresponding flag is set, otherwise CreateTable |
4072 | | // will use the same defaults as for regular tables. |
4073 | 0 | if (FLAGS_metrics_snapshots_table_num_tablets > 0) { |
4074 | 0 | req.mutable_schema()->mutable_table_properties()->set_num_tablets( |
4075 | 0 | FLAGS_metrics_snapshots_table_num_tablets); |
4076 | 0 | } |
4077 | | |
4078 | | // Schema description: "node" refers to tserver uuid. "entity_type" can be either |
4079 | | // "tserver" or "table". "entity_id" is uuid of corresponding tserver or table. |
4080 | | // "metric" is the name of the metric and "value" is its val. "ts" is time at |
4081 | | // which the snapshot was recorded. "details" is a json column for future extensibility. |
4082 | |
|
4083 | 0 | YBSchemaBuilder schemaBuilder; |
4084 | 0 | schemaBuilder.AddColumn("node")->Type(STRING)->HashPrimaryKey()->NotNull(); |
4085 | 0 | schemaBuilder.AddColumn("entity_type")->Type(STRING)->PrimaryKey()->NotNull(); |
4086 | 0 | schemaBuilder.AddColumn("entity_id")->Type(STRING)->PrimaryKey()->NotNull(); |
4087 | 0 | schemaBuilder.AddColumn("metric")->Type(STRING)->PrimaryKey()->NotNull(); |
4088 | 0 | schemaBuilder.AddColumn("ts")->Type(TIMESTAMP)->PrimaryKey()->NotNull()-> |
4089 | 0 | SetSortingType(SortingType::kDescending); |
4090 | 0 | schemaBuilder.AddColumn("value")->Type(INT64); |
4091 | 0 | schemaBuilder.AddColumn("details")->Type(JSONB); |
4092 | |
|
4093 | 0 | YBSchema ybschema; |
4094 | 0 | CHECK_OK(schemaBuilder.Build(&ybschema)); |
4095 | |
|
4096 | 0 | auto schema = yb::client::internal::GetSchema(ybschema); |
4097 | 0 | SchemaToPB(schema, req.mutable_schema()); |
4098 | |
|
4099 | 0 | Status s = CreateTable(&req, &resp, rpc); |
4100 | | // We do not lock here so it is technically possible that the table was already created. |
4101 | | // If so, there is nothing to do so we just ignore the "AlreadyPresent" error. |
4102 | 0 | if (s.IsAlreadyPresent()) { |
4103 | 0 | return Status::OK(); |
4104 | 0 | } |
4105 | 0 | return s; |
4106 | 0 | } |
4107 | | |
4108 | | Status CatalogManager::IsCreateTableDone(const IsCreateTableDoneRequestPB* req, |
4109 | 20.3k | IsCreateTableDoneResponsePB* resp) { |
4110 | 20.3k | TRACE("Looking up table"); |
4111 | | // 1. Lookup the table and verify if it exists. |
4112 | 20.3k | scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table())); |
4113 | | |
4114 | 20.3k | TRACE("Locking table"); |
4115 | 20.3k | auto l = table->LockForRead(); |
4116 | 20.3k | RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp)); |
4117 | 20.3k | const auto& pb = l->pb; |
4118 | | |
4119 | | // 2. Verify if the create is in-progress. |
4120 | 20.3k | TRACE("Verify if the table creation is in progress for $0", table->ToString()); |
4121 | 20.3k | resp->set_done(!table->IsCreateInProgress()); |
4122 | | |
4123 | | // 3. Set any current errors, if we are experiencing issues creating the table. This will be |
4124 | | // bubbled up to the MasterService layer. If it is an error, it gets wrapped around in |
4125 | | // MasterErrorPB::UNKNOWN_ERROR. |
4126 | 20.3k | RETURN_NOT_OK(table->GetCreateTableErrorStatus()); |
4127 | | |
4128 | | // 4. If this is an index, we are not done until the index is in the indexed table's schema. An |
4129 | | // exception is YSQL system table indexes, which don't get added to their indexed tables' schemas. |
4130 | 20.3k | if (resp->done() && IsIndex(pb)) { |
4131 | 690 | auto& indexed_table_id = GetIndexedTableId(pb); |
4132 | | // For user indexes (which add index info to indexed table's schema), |
4133 | | // - if this index is created without backfill, |
4134 | | // - waiting for the index to be in the indexed table's schema is sufficient, and, by that |
4135 | | // point, things are fully created. |
4136 | | // - if this index is created with backfill |
4137 | | // - and it's YCQL, |
4138 | | // - waiting for the index to be in the indexed table's schema means waiting for the |
4139 | | // DELETE_ONLY index permission, and it's fine to return to the client before the index |
4140 | | // gets the rest of the permissions because the expectation is that backfill will be |
4141 | | // completed asynchronously. |
4142 | | // - and it's YSQL, |
4143 | | // - waiting for the index to be in the indexed table's schema means just that (DocDB index |
4144 | | // permissions don't really matter for YSQL besides being used for backfill purposes), and |
4145 | | // it's a signal for postgres to continue the index backfill process, activating index |
4146 | | // state flags then later triggering backfill and so on. |
4147 | | // For YSQL system indexes (which don't add index info to indexed table's schema), |
4148 | | // - there's nothing additional to wait on. |
4149 | | // Therefore, the only thing needed here is to check whether the index info is in the indexed |
4150 | | // table's schema for user indexes. |
4151 | 690 | if (pb.table_type() == YQL_TABLE_TYPE || |
4152 | 690 | (pb.table_type() == PGSQL_TABLE_TYPE && IsUserCreatedTable(*table))) { |
4153 | 690 | GetTableSchemaRequestPB get_schema_req; |
4154 | 690 | GetTableSchemaResponsePB get_schema_resp; |
4155 | 690 | get_schema_req.mutable_table()->set_table_id(indexed_table_id); |
4156 | 690 | const bool get_fully_applied_indexes = true; |
4157 | 690 | const Status s = GetTableSchemaInternal(&get_schema_req, |
4158 | 690 | &get_schema_resp, |
4159 | 690 | get_fully_applied_indexes); |
4160 | 690 | if (!s.ok()) { |
4161 | 0 | resp->mutable_error()->Swap(get_schema_resp.mutable_error()); |
4162 | 0 | return s; |
4163 | 0 | } |
4164 | | |
4165 | 690 | resp->set_done(false); |
4166 | 1.30k | for (const auto& index : get_schema_resp.indexes()) { |
4167 | 1.30k | if (index.has_table_id() && index.table_id() == table->id()) { |
4168 | 606 | resp->set_done(true); |
4169 | 606 | break; |
4170 | 606 | } |
4171 | 1.30k | } |
4172 | 690 | } |
4173 | 690 | } |
4174 | | |
4175 | | // Sanity check that this table is present in system.partitions if it is a YCQL table. |
4176 | | // Only check if we are automatically generating the vtable on changes. If we are creating via |
4177 | | // the bg task, then there may be a delay. |
4178 | 20.3k | if (DCHECK_IS_ON() && |
4179 | 20.3k | resp->done() && |
4180 | 7.46k | IsYcqlTable(*table) && |
4181 | 1.91k | YQLPartitionsVTable::GeneratePartitionsVTableOnChanges() && |
4182 | 1.91k | FLAGS_TEST_catalog_manager_check_yql_partitions_exist_for_is_create_table_done) { |
4183 | 1.91k | Schema schema; |
4184 | 1.91k | RETURN_NOT_OK(table->GetSchema(&schema)); |
4185 | | // Copartitioned tables don't actually create tablets currently (unimplemented), so ignore them. |
4186 | 1.91k | if (!schema.table_properties().HasCopartitionTableId()) { |
4187 | 1.91k | DCHECK(GetYqlPartitionsVtable().CheckTableIsPresent(table->id(), table->NumPartitions())); |
4188 | 1.91k | } |
4189 | 1.91k | } |
4190 | | |
4191 | | // If this is a transactional table we are not done until the transaction status table is created. |
4192 | | // However, if we are currently initializing the system catalog snapshot, we don't create the |
4193 | | // transactions table. |
4194 | 20.3k | if (!FLAGS_create_initial_sys_catalog_snapshot && |
4195 | 20.3k | resp->done() && pb.schema().table_properties().is_transactional()) { |
4196 | 2.08k | RETURN_NOT_OK(IsTransactionStatusTableCreated(resp)); |
4197 | 2.08k | } |
4198 | | |
4199 | | // We are not done until the metrics snapshots table is created. |
4200 | 20.3k | if (FLAGS_master_enable_metrics_snapshotter && resp->done() && |
4201 | 0 | !(table->GetTableType() == TableType::YQL_TABLE_TYPE && |
4202 | 0 | table->namespace_id() == kSystemNamespaceId && |
4203 | 0 | table->name() == kMetricsSnapshotsTableName)) { |
4204 | 0 | RETURN_NOT_OK(IsMetricsSnapshotsTableCreated(resp)); |
4205 | 0 | } |
4206 | | |
4207 | | // If this is a colocated table and there is a pending AddTableToTablet task then we are not done. |
4208 | 20.3k | if (resp->done() && pb.colocated()) { |
4209 | 57 | resp->set_done(!table->HasTasks(MonitoredTask::Type::ASYNC_ADD_TABLE_TO_TABLET)); |
4210 | 57 | } |
4211 | | |
4212 | 20.3k | return Status::OK(); |
4213 | 20.3k | } |
4214 | | |
4215 | | Status CatalogManager::IsCreateTableInProgress(const TableId& table_id, |
4216 | | CoarseTimePoint deadline, |
4217 | 1.99k | bool* create_in_progress) { |
4218 | 1.99k | DCHECK_ONLY_NOTNULL(create_in_progress); |
4219 | 1.99k | DCHECK(!table_id.empty()); |
4220 | | |
4221 | 1.99k | IsCreateTableDoneRequestPB req; |
4222 | 1.99k | IsCreateTableDoneResponsePB resp; |
4223 | 1.99k | req.mutable_table()->set_table_id(table_id); |
4224 | 1.99k | RETURN_NOT_OK(IsCreateTableDone(&req, &resp)); |
4225 | | |
4226 | 1.99k | if (resp.has_error()) { |
4227 | 0 | return StatusFromPB(resp.error().status()); |
4228 | 0 | } |
4229 | | |
4230 | 1.99k | *create_in_progress = !resp.done(); |
4231 | 1.99k | return Status::OK(); |
4232 | 1.99k | } |
4233 | | |
4234 | 2.01k | Status CatalogManager::WaitForCreateTableToFinish(const TableId& table_id) { |
4235 | 2.01k | MonoDelta default_admin_operation_timeout( |
4236 | 2.01k | MonoDelta::FromSeconds(FLAGS_yb_client_admin_operation_timeout_sec)); |
4237 | 2.01k | auto deadline = CoarseMonoClock::Now() + default_admin_operation_timeout; |
4238 | | |
4239 | 2.01k | return client::RetryFunc( |
4240 | 2.01k | deadline, "Waiting on Create Table to be completed", "Timed out waiting for Table Creation", |
4241 | 2.01k | std::bind(&CatalogManager::IsCreateTableInProgress, this, table_id, _1, _2)); |
4242 | 2.01k | } |
4243 | | |
4244 | 2.08k | Status CatalogManager::IsTransactionStatusTableCreated(IsCreateTableDoneResponsePB* resp) { |
4245 | 2.08k | IsCreateTableDoneRequestPB req; |
4246 | | |
4247 | 2.08k | req.mutable_table()->set_table_name(kGlobalTransactionsTableName); |
4248 | 2.08k | req.mutable_table()->mutable_namespace_()->set_name(kSystemNamespaceName); |
4249 | | |
4250 | 2.08k | return IsCreateTableDone(&req, resp); |
4251 | 2.08k | } |
4252 | | |
4253 | 0 | Status CatalogManager::IsMetricsSnapshotsTableCreated(IsCreateTableDoneResponsePB* resp) { |
4254 | 0 | IsCreateTableDoneRequestPB req; |
4255 | |
|
4256 | 0 | req.mutable_table()->set_table_name(kMetricsSnapshotsTableName); |
4257 | 0 | req.mutable_table()->mutable_namespace_()->set_name(kSystemNamespaceName); |
4258 | 0 | req.mutable_table()->mutable_namespace_()->set_database_type(YQLDatabase::YQL_DATABASE_CQL); |
4259 | |
|
4260 | 0 | return IsCreateTableDone(&req, resp); |
4261 | 0 | } |
4262 | | |
4263 | 7 | std::string CatalogManager::GenerateId(boost::optional<const SysRowEntryType> entity_type) { |
4264 | 7 | SharedLock lock(mutex_); |
4265 | 7 | return GenerateIdUnlocked(entity_type); |
4266 | 7 | } |
4267 | | |
4268 | | std::string CatalogManager::GenerateIdUnlocked( |
4269 | 83.9k | boost::optional<const SysRowEntryType> entity_type) { |
4270 | 83.9k | while (true) { |
4271 | | // Generate id and make sure it is unique within its category. |
4272 | 83.9k | std::string id = GenerateObjectId(); |
4273 | 83.9k | if (!entity_type) { |
4274 | 7 | return id; |
4275 | 7 | } |
4276 | 83.9k | switch (*entity_type) { |
4277 | 2.03k | case SysRowEntryType::NAMESPACE: |
4278 | 2.03k | if (FindPtrOrNull(namespace_ids_map_, id) == nullptr) return id; |
4279 | 0 | break; |
4280 | 27.9k | case SysRowEntryType::TABLE: |
4281 | 27.9k | if (FindPtrOrNull(*table_ids_map_, id) == nullptr) return id; |
4282 | 0 | break; |
4283 | 53.7k | case SysRowEntryType::TABLET: |
4284 | 53.7k | if (FindPtrOrNull(*tablet_map_, id) == nullptr) return id; |
4285 | 0 | break; |
4286 | 45 | case SysRowEntryType::UDTYPE: |
4287 | 45 | if (FindPtrOrNull(udtype_ids_map_, id) == nullptr) return id; |
4288 | 0 | break; |
4289 | 0 | case SysRowEntryType::SNAPSHOT: |
4290 | 0 | return id; |
4291 | 157 | case SysRowEntryType::CDC_STREAM: |
4292 | 157 | if (!CDCStreamExistsUnlocked(id)) return id; |
4293 | 0 | break; |
4294 | 0 | case SysRowEntryType::CLUSTER_CONFIG: FALLTHROUGH_INTENDED; |
4295 | 0 | case SysRowEntryType::ROLE: FALLTHROUGH_INTENDED; |
4296 | 0 | case SysRowEntryType::REDIS_CONFIG: FALLTHROUGH_INTENDED; |
4297 | 0 | case SysRowEntryType::UNIVERSE_REPLICATION: FALLTHROUGH_INTENDED; |
4298 | 0 | case SysRowEntryType::SYS_CONFIG: FALLTHROUGH_INTENDED; |
4299 | 0 | case SysRowEntryType::SNAPSHOT_SCHEDULE: FALLTHROUGH_INTENDED; |
4300 | 0 | case SysRowEntryType::DDL_LOG_ENTRY: FALLTHROUGH_INTENDED; |
4301 | 0 | case SysRowEntryType::UNKNOWN: |
4302 | 0 | LOG(DFATAL) << "Invalid id type: " << *entity_type; |
4303 | 0 | return id; |
4304 | 83.9k | } |
4305 | 83.9k | } |
4306 | 83.9k | } |
4307 | | |
4308 | | scoped_refptr<TableInfo> CatalogManager::CreateTableInfo(const CreateTableRequestPB& req, |
4309 | | const Schema& schema, |
4310 | | const PartitionSchema& partition_schema, |
4311 | | const NamespaceId& namespace_id, |
4312 | | const NamespaceName& namespace_name, |
4313 | 31.6k | IndexInfoPB* index_info) { |
4314 | 31.6k | DCHECK(schema.has_column_ids()); |
4315 | 31.6k | TableId table_id |
4316 | 27.9k | = !req.table_id().empty() ? req.table_id() : GenerateIdUnlocked(SysRowEntryType::TABLE); |
4317 | 31.6k | scoped_refptr<TableInfo> table = NewTableInfo(table_id); |
4318 | 31.6k | if (req.has_tablespace_id()) { |
4319 | 3 | table->SetTablespaceIdForTableCreation(req.tablespace_id()); |
4320 | 3 | } |
4321 | 31.6k | table->mutable_metadata()->StartMutation(); |
4322 | 31.6k | SysTablesEntryPB *metadata = &table->mutable_metadata()->mutable_dirty()->pb; |
4323 | 31.6k | metadata->set_state(SysTablesEntryPB::PREPARING); |
4324 | 31.6k | metadata->set_name(req.name()); |
4325 | 31.6k | metadata->set_table_type(req.table_type()); |
4326 | 31.6k | metadata->set_namespace_id(namespace_id); |
4327 | 31.6k | metadata->set_namespace_name(namespace_name); |
4328 | 31.6k | metadata->set_version(0); |
4329 | 31.6k | metadata->set_next_column_id(ColumnId(schema.max_col_id() + 1)); |
4330 | 31.6k | if (req.has_replication_info()) { |
4331 | 1 | metadata->mutable_replication_info()->CopyFrom(req.replication_info()); |
4332 | 1 | } |
4333 | | // Use the Schema object passed in, since it has the column IDs already assigned, |
4334 | | // whereas the user request PB does not. |
4335 | 31.6k | SchemaToPB(schema, metadata->mutable_schema()); |
4336 | 31.6k | partition_schema.ToPB(metadata->mutable_partition_schema()); |
4337 | | // For index table, set index details (indexed table id and whether the index is local). |
4338 | 31.6k | if (req.has_index_info()) { |
4339 | 1.59k | metadata->mutable_index_info()->CopyFrom(req.index_info()); |
4340 | | |
4341 | | // Set the deprecated fields also for compatibility reasons. |
4342 | 1.59k | metadata->set_indexed_table_id(req.index_info().indexed_table_id()); |
4343 | 1.59k | metadata->set_is_local_index(req.index_info().is_local()); |
4344 | 1.59k | metadata->set_is_unique_index(req.index_info().is_unique()); |
4345 | | |
4346 | | // Setup index info. |
4347 | 1.59k | if (index_info != nullptr) { |
4348 | 586 | index_info->set_table_id(table->id()); |
4349 | 586 | metadata->mutable_index_info()->CopyFrom(*index_info); |
4350 | 586 | } |
4351 | 30.0k | } else if (req.has_indexed_table_id()) { |
4352 | | // Read data from the deprecated field and update the new fields. |
4353 | 18 | metadata->mutable_index_info()->set_indexed_table_id(req.indexed_table_id()); |
4354 | 18 | metadata->mutable_index_info()->set_is_local(req.is_local_index()); |
4355 | 18 | metadata->mutable_index_info()->set_is_unique(req.is_unique_index()); |
4356 | | |
4357 | | // Set the deprecated fields also for compatibility reasons. |
4358 | 18 | metadata->set_indexed_table_id(req.indexed_table_id()); |
4359 | 18 | metadata->set_is_local_index(req.is_local_index()); |
4360 | 18 | metadata->set_is_unique_index(req.is_unique_index()); |
4361 | | |
4362 | | // Setup index info. |
4363 | 18 | if (index_info != nullptr) { |
4364 | 18 | index_info->set_table_id(table->id()); |
4365 | 18 | metadata->mutable_index_info()->CopyFrom(*index_info); |
4366 | 18 | } |
4367 | 18 | } |
4368 | | |
4369 | 31.6k | if (req.is_pg_shared_table()) { |
4370 | 0 | metadata->set_is_pg_shared_table(true); |
4371 | 0 | } |
4372 | | |
4373 | 31.6k | return table; |
4374 | 31.6k | } |
4375 | | |
4376 | | TabletInfoPtr CatalogManager::CreateTabletInfo(TableInfo* table, |
4377 | 53.7k | const PartitionPB& partition) { |
4378 | 53.7k | auto tablet = make_scoped_refptr<TabletInfo>(table, GenerateIdUnlocked(SysRowEntryType::TABLET)); |
4379 | 0 | VLOG_WITH_PREFIX_AND_FUNC(2) |
4380 | 0 | << "Table: " << table->ToString() << ", tablet: " << tablet->ToString(); |
4381 | | |
4382 | 53.7k | tablet->mutable_metadata()->StartMutation(); |
4383 | 53.7k | SysTabletsEntryPB *metadata = &tablet->mutable_metadata()->mutable_dirty()->pb; |
4384 | 53.7k | metadata->set_state(SysTabletsEntryPB::PREPARING); |
4385 | 53.7k | metadata->mutable_partition()->CopyFrom(partition); |
4386 | 53.7k | metadata->set_table_id(table->id()); |
4387 | | // This is important: we are setting the first table id in the table_ids list |
4388 | | // to be the id of the original table that creates the tablet. |
4389 | 53.7k | metadata->add_table_ids(table->id()); |
4390 | 53.7k | return tablet; |
4391 | 53.7k | } |
4392 | | |
4393 | | Status CatalogManager::RemoveTableIdsFromTabletInfo( |
4394 | | TabletInfoPtr tablet_info, |
4395 | 48 | std::unordered_set<TableId> tables_to_remove) { |
4396 | 48 | auto tablet_lock = tablet_info->LockForWrite(); |
4397 | | |
4398 | 48 | google::protobuf::RepeatedPtrField<std::string> new_table_ids; |
4399 | 14.1k | for (const auto& table_id : tablet_lock->pb.table_ids()) { |
4400 | 14.1k | if (tables_to_remove.find(table_id) == tables_to_remove.end()) { |
4401 | 11.9k | *new_table_ids.Add() = std::move(table_id); |
4402 | 11.9k | } |
4403 | 14.1k | } |
4404 | 48 | tablet_lock.mutable_data()->pb.mutable_table_ids()->Swap(&new_table_ids); |
4405 | | |
4406 | 48 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), tablet_info)); |
4407 | 44 | tablet_lock.Commit(); |
4408 | 44 | return Status::OK(); |
4409 | 48 | } |
4410 | | |
4411 | | Result<scoped_refptr<TableInfo>> CatalogManager::FindTable( |
4412 | 434k | const TableIdentifierPB& table_identifier) const { |
4413 | 434k | SharedLock lock(mutex_); |
4414 | 434k | return FindTableUnlocked(table_identifier); |
4415 | 434k | } |
4416 | | |
4417 | | Result<scoped_refptr<TableInfo>> CatalogManager::FindTableUnlocked( |
4418 | 434k | const TableIdentifierPB& table_identifier) const { |
4419 | 434k | if (table_identifier.has_table_id()) { |
4420 | 320k | return FindTableByIdUnlocked(table_identifier.table_id()); |
4421 | 320k | } |
4422 | | |
4423 | 114k | if (table_identifier.has_table_name()) { |
4424 | 114k | auto namespace_info = VERIFY_RESULT(FindNamespaceUnlocked(table_identifier.namespace_())); |
4425 | | |
4426 | | // We can't lookup YSQL table by name because Postgres concept of "schemas" |
4427 | | // introduces ambiguity. |
4428 | 114k | if (namespace_info->database_type() == YQL_DATABASE_PGSQL) { |
4429 | 0 | return STATUS(InvalidArgument, "Cannot lookup YSQL table by name"); |
4430 | 0 | } |
4431 | | |
4432 | 114k | auto it = table_names_map_.find({namespace_info->id(), table_identifier.table_name()}); |
4433 | 114k | if (it == table_names_map_.end()) { |
4434 | 3.01k | return STATUS_EC_FORMAT( |
4435 | 3.01k | NotFound, MasterError(MasterErrorPB::OBJECT_NOT_FOUND), |
4436 | 3.01k | "Table $0.$1 not found", namespace_info->name(), table_identifier.table_name()); |
4437 | 3.01k | } |
4438 | 111k | return it->second; |
4439 | 111k | } |
4440 | | |
4441 | 11 | return STATUS(InvalidArgument, "Neither table id or table name are specified", |
4442 | 11 | table_identifier.ShortDebugString()); |
4443 | 11 | } |
4444 | | |
4445 | | Result<scoped_refptr<TableInfo>> CatalogManager::FindTableById( |
4446 | 3.53k | const TableId& table_id) const { |
4447 | 3.53k | SharedLock lock(mutex_); |
4448 | 3.53k | return FindTableByIdUnlocked(table_id); |
4449 | 3.53k | } |
4450 | | |
4451 | | Result<scoped_refptr<TableInfo>> CatalogManager::FindTableByIdUnlocked( |
4452 | 323k | const TableId& table_id) const { |
4453 | 323k | auto it = table_ids_map_->find(table_id); |
4454 | 323k | if (it == table_ids_map_->end()) { |
4455 | 76 | return STATUS_EC_FORMAT( |
4456 | 76 | NotFound, MasterError(MasterErrorPB::OBJECT_NOT_FOUND), |
4457 | 76 | "Table with identifier $0 not found", table_id); |
4458 | 76 | } |
4459 | 323k | return it->second; |
4460 | 323k | } |
4461 | | |
4462 | | Result<scoped_refptr<NamespaceInfo>> CatalogManager::FindNamespaceById( |
4463 | 623k | const NamespaceId& id) const { |
4464 | 623k | SharedLock lock(mutex_); |
4465 | 623k | return FindNamespaceByIdUnlocked(id); |
4466 | 623k | } |
4467 | | |
4468 | | Result<scoped_refptr<NamespaceInfo>> CatalogManager::FindNamespaceByIdUnlocked( |
4469 | 659k | const NamespaceId& id) const { |
4470 | 659k | auto it = namespace_ids_map_.find(id); |
4471 | 659k | if (it == namespace_ids_map_.end()) { |
4472 | 0 | VLOG_WITH_FUNC(4) << "Not found: " << id << "\n" << GetStackTrace(); |
4473 | 3 | return STATUS(NotFound, "Keyspace identifier not found", id, |
4474 | 3 | MasterError(MasterErrorPB::NAMESPACE_NOT_FOUND)); |
4475 | 3 | } |
4476 | 659k | return it->second; |
4477 | 659k | } |
4478 | | |
4479 | | Result<scoped_refptr<NamespaceInfo>> CatalogManager::FindNamespaceUnlocked( |
4480 | 156k | const NamespaceIdentifierPB& ns_identifier) const { |
4481 | 156k | if (ns_identifier.has_id()) { |
4482 | 35.9k | return FindNamespaceByIdUnlocked(ns_identifier.id()); |
4483 | 35.9k | } |
4484 | | |
4485 | 121k | if (ns_identifier.has_name()) { |
4486 | 120k | auto db = GetDatabaseType(ns_identifier); |
4487 | 120k | auto it = namespace_names_mapper_[db].find(ns_identifier.name()); |
4488 | 120k | if (it == namespace_names_mapper_[db].end()) { |
4489 | 1.70k | return STATUS(NotFound, "Keyspace name not found", ns_identifier.name(), |
4490 | 1.70k | MasterError(MasterErrorPB::NAMESPACE_NOT_FOUND)); |
4491 | 1.70k | } |
4492 | 119k | return it->second; |
4493 | 119k | } |
4494 | | |
4495 | 4 | LOG(DFATAL) << __func__ << ": " << ns_identifier.ShortDebugString() << ", \n" << GetStackTrace(); |
4496 | 4 | return STATUS(NotFound, "Neither keyspace id nor keyspace name is specified", |
4497 | 4 | ns_identifier.ShortDebugString(), MasterError(MasterErrorPB::NAMESPACE_NOT_FOUND)); |
4498 | 4 | } |
4499 | | |
4500 | | Result<scoped_refptr<NamespaceInfo>> CatalogManager::FindNamespace( |
4501 | 14.4k | const NamespaceIdentifierPB& ns_identifier) const { |
4502 | 14.4k | SharedLock lock(mutex_); |
4503 | 14.4k | return FindNamespaceUnlocked(ns_identifier); |
4504 | 14.4k | } |
4505 | | |
4506 | | Result<TableDescription> CatalogManager::DescribeTable( |
4507 | 0 | const TableIdentifierPB& table_identifier, bool succeed_if_create_in_progress) { |
4508 | 0 | TRACE("Looking up table"); |
4509 | 0 | return DescribeTable(VERIFY_RESULT(FindTable(table_identifier)), succeed_if_create_in_progress); |
4510 | 0 | } |
4511 | | |
4512 | | Result<TableDescription> CatalogManager::DescribeTable( |
4513 | 7 | const TableInfoPtr& table_info, bool succeed_if_create_in_progress) { |
4514 | 7 | TableDescription result; |
4515 | 7 | result.table_info = table_info; |
4516 | 7 | NamespaceId namespace_id; |
4517 | 7 | { |
4518 | 7 | TRACE("Locking table"); |
4519 | 7 | auto l = table_info->LockForRead(); |
4520 | | |
4521 | 7 | if (!succeed_if_create_in_progress && table_info->IsCreateInProgress()) { |
4522 | 0 | return STATUS(IllegalState, "Table creation is in progress", table_info->ToString(), |
4523 | 0 | MasterError(MasterErrorPB::TABLE_CREATION_IS_IN_PROGRESS)); |
4524 | 0 | } |
4525 | | |
4526 | 7 | result.tablet_infos = table_info->GetTablets(); |
4527 | | |
4528 | 7 | namespace_id = table_info->namespace_id(); |
4529 | 7 | } |
4530 | | |
4531 | 7 | TRACE("Looking up namespace"); |
4532 | 7 | result.namespace_info = VERIFY_RESULT(FindNamespaceById(namespace_id)); |
4533 | | |
4534 | 7 | return result; |
4535 | 7 | } |
4536 | | |
4537 | 0 | Result<string> CatalogManager::GetPgSchemaName(const TableInfoPtr& table_info) { |
4538 | 0 | RSTATUS_DCHECK_EQ(table_info->GetTableType(), PGSQL_TABLE_TYPE, InternalError, |
4539 | 0 | Format("Expected YSQL table, got: $0", table_info->GetTableType())); |
4540 | |
|
4541 | 0 | const uint32_t database_oid = VERIFY_RESULT(GetPgsqlDatabaseOid(table_info->namespace_id())); |
4542 | 0 | uint32_t table_oid = VERIFY_RESULT(GetPgsqlTableOid(table_info->id())); |
4543 | 0 | { |
4544 | 0 | if (matview_pg_table_ids_map_.find(table_info->id()) != matview_pg_table_ids_map_.end()) { |
4545 | 0 | table_oid = VERIFY_RESULT(GetPgsqlTableOid(matview_pg_table_ids_map_[table_info->id()])); |
4546 | 0 | } |
4547 | 0 | } |
4548 | 0 | const uint32_t relnamespace_oid = VERIFY_RESULT( |
4549 | 0 | sys_catalog_->ReadPgClassRelnamespace(database_oid, table_oid)); |
4550 | 0 | return sys_catalog_->ReadPgNamespaceNspname(database_oid, relnamespace_oid); |
4551 | 0 | } |
4552 | | |
4553 | | // Truncate a Table. |
4554 | | Status CatalogManager::TruncateTable(const TruncateTableRequestPB* req, |
4555 | | TruncateTableResponsePB* resp, |
4556 | 11.8k | rpc::RpcContext* rpc) { |
4557 | 11.8k | LOG(INFO) << "Servicing TruncateTable request from " << RequestorString(rpc) |
4558 | 11.8k | << ": " << req->ShortDebugString(); |
4559 | | |
4560 | 14.9k | for (int i = 0; i < req->table_ids_size(); i++) { |
4561 | 3.05k | RETURN_NOT_OK(TruncateTable(req->table_ids(i), resp, rpc)); |
4562 | 3.05k | } |
4563 | | |
4564 | 11.8k | return Status::OK(); |
4565 | 11.8k | } |
4566 | | // Truncates one table by id: finds it in table_ids_map_ under a shared lock on mutex_, rejects deleted / not-visible tables and (unless FLAGS_enable_delete_truncate_xcluster_replicated_table is set) CDC-enabled tables, sends a truncate request to every tablet, wakes the background tasks, and then recurses into each index listed in the table's pb. A missing table is reported as OBJECT_NOT_FOUND through resp. |
4567 | | Status CatalogManager::TruncateTable(const TableId& table_id, |
4568 | | TruncateTableResponsePB* resp, |
4569 | 6.43k | rpc::RpcContext* rpc) { |
4570 | | // Lookup the table and verify if it exists. |
4571 | 6.43k | TRACE(Substitute("Looking up object by id $0", table_id)); |
4572 | 6.43k | scoped_refptr<TableInfo> table; |
4573 | 6.43k | { |
4574 | 6.43k | SharedLock lock(mutex_); |
4575 | 6.43k | table = FindPtrOrNull(*table_ids_map_, table_id); |
4576 | 6.43k | if (table == nullptr) { |
4577 | 0 | Status s = STATUS_SUBSTITUTE(NotFound, "The object with id $0 does not exist", table_id); |
4578 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); |
4579 | 0 | } |
4580 | 6.43k | } |
4581 | | // The read lock `l` taken below stays in scope until the function returns, including during the recursive index truncation. |
4582 | 6.43k | TRACE(Substitute("Locking object with id $0", table_id)); |
4583 | 6.43k | auto l = table->LockForRead(); |
4584 | 6.43k | RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp)); |
4585 | |
4586 | | // Truncate on a colocated table should not hit master because it should be handled by a write |
4587 | | // DML that creates a table-level tombstone. |
4588 | 0 | LOG_IF(WARNING, table->IsColocatedUserTable()) << "cannot truncate a colocated table on master"; |
4589 | | // Note: the warning above does not abort the request; execution continues with the checks below. |
4590 | 6.43k | if (!FLAGS_enable_delete_truncate_xcluster_replicated_table && IsCdcEnabled(*table)) { |
4591 | 0 | return STATUS(NotSupported, |
4592 | 0 | "Cannot truncate a table in replication.", |
4593 | 0 | table_id, |
4594 | 0 | MasterError(MasterErrorPB::INVALID_REQUEST)); |
4595 | 0 | } |
4596 | |
4597 | | // Send a Truncate() request to each tablet in the table. |
4598 | 6.43k | SendTruncateTableRequest(table); |
4599 | |
4600 | 6.43k | LOG(INFO) << "Successfully initiated TRUNCATE for " << table->ToString() << " per request from " |
4601 | 6.43k | << RequestorString(rpc); |
4602 | 6.43k | background_tasks_->Wake(); |
4603 | |
4604 | | // Truncate indexes also. |
4605 | | // Note: PG table does not have references to indexes in the base table, so associated indexes |
4606 | | // must be truncated from the PG code separately. |
4607 | 6.43k | const bool is_index = IsIndex(l->pb); |
4608 | 0 | DCHECK(!is_index || l->pb.indexes().empty()) << "indexes should be empty for index table"; |
4609 | 3.37k | for (const auto& index_info : l->pb.indexes()) { |
4610 | 3.37k | RETURN_NOT_OK(TruncateTable(index_info.table_id(), resp, rpc)); |
4611 | 3.37k | } |
4612 | |
4613 | 6.43k | return Status::OK(); |
4614 | 6.43k | } |
4615 | | // Sends a truncate request for every tablet returned by table->GetTablets(). |
4616 | 6.43k | void CatalogManager::SendTruncateTableRequest(const scoped_refptr<TableInfo>& table) { |
4617 | 53.6k | for (const auto& tablet : table->GetTablets()) { |
4618 | 53.6k | SendTruncateTabletRequest(tablet); |
4619 | 53.6k | } |
4620 | 6.43k | } |
4621 | | // Creates an AsyncTruncate task for the tablet, registers it on the tablet's table via AddTask, and schedules it. A scheduling failure is only logged (WARN_NOT_OK); nothing is returned to the caller. |
4622 | 53.6k | void CatalogManager::SendTruncateTabletRequest(const scoped_refptr<TabletInfo>& tablet) { |
4623 | 53.6k | LOG_WITH_PREFIX(INFO) << "Truncating tablet " << tablet->id(); |
4624 | 53.6k | auto call = std::make_shared<AsyncTruncate>(master_, AsyncTaskPool(), tablet); |
4625 | 53.6k | tablet->table()->AddTask(call); |
4626 | 53.6k | WARN_NOT_OK( |
4627 | 53.6k | ScheduleTask(call), |
4628 | 53.6k | Substitute("Failed to send truncate request for tablet $0", tablet->id())); |
4629 | 53.6k | } |
4630 | | // Reports through resp->set_done() whether the table has no remaining ASYNC_TRUNCATE_TABLET tasks. A table id missing from table_ids_map_ is reported as OBJECT_NOT_FOUND in resp; a deleted or not-visible table fails CheckIfTableDeletedOrNotVisibleToClient. |
4631 | | Status CatalogManager::IsTruncateTableDone(const IsTruncateTableDoneRequestPB* req, |
4632 | 8.89k | IsTruncateTableDoneResponsePB* resp) { |
4633 | 8.89k | LOG(INFO) << "Servicing IsTruncateTableDone request for table id " << req->table_id(); |
4634 | |
4635 | | // Lookup the truncated table. |
4636 | 8.89k | TRACE("Looking up table $0", req->table_id()); |
4637 | 8.89k | scoped_refptr<TableInfo> table; |
4638 | 8.89k | { |
4639 | 8.89k | SharedLock lock(mutex_); |
4640 | 8.89k | table = FindPtrOrNull(*table_ids_map_, req->table_id()); |
4641 | 8.89k | } |
4642 | |
4643 | 8.89k | if (table == nullptr) { |
4644 | 0 | Status s = STATUS(NotFound, "The object does not exist: table with id", req->table_id()); |
4645 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); |
4646 | 0 | } |
4647 | |
4648 | 8.89k | TRACE("Locking table"); |
4649 | 8.89k | RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(table->LockForRead(), resp)); |
4650 | | // Done means no truncate task is still registered on the table. |
4651 | 8.89k | resp->set_done(!table->HasTasks(MonitoredTask::Type::ASYNC_TRUNCATE_TABLET)); |
4652 | 8.89k | return Status::OK(); |
4653 | 8.89k | } |
4654 | | |
4655 | | // Note: only used by YSQL as of 2020-10-29. Starts backfilling the index named by req->index_identifier(): resolves the index and its indexed table, requires the index to be at INDEX_PERM_WRITE_AND_DELETE, then hands off to MultiStageAlterTable::StartBackfillingData. The indexed table id is written to resp->table_identifier(). `rpc` is not used. |
4656 | | Status CatalogManager::BackfillIndex( |
4657 | | const BackfillIndexRequestPB* req, |
4658 | | BackfillIndexResponsePB* resp, |
4659 | 89 | rpc::RpcContext* rpc) { |
4660 | 89 | const TableIdentifierPB& index_table_identifier = req->index_identifier(); |
4661 | |
4662 | 89 | scoped_refptr<TableInfo> index_table = VERIFY_RESULT(FindTable(index_table_identifier)); |
4663 | |
4664 | 89 | if (index_table->GetTableType() != PGSQL_TABLE_TYPE) { |
4665 | | // This request is only supported for YSQL for now. YCQL has its own mechanism. |
4666 | 0 | return STATUS( |
4667 | 0 | InvalidArgument, |
4668 | 0 | "Unexpected non-YSQL table", |
4669 | 0 | index_table_identifier.ShortDebugString()); |
4670 | 0 | } |
4671 | |
4672 | | // Collect indexed_table. |
4673 | 89 | scoped_refptr<TableInfo> indexed_table; |
4674 | 89 | { |
4675 | 89 | auto l = index_table->LockForRead(); |
4676 | 89 | TableId indexed_table_id = GetIndexedTableId(l->pb); |
4677 | 89 | resp->mutable_table_identifier()->set_table_id(indexed_table_id); |
4678 | 89 | indexed_table = GetTableInfo(indexed_table_id); |
4679 | 89 | } |
4680 | |
4681 | 89 | if (indexed_table == nullptr) { |
4682 | 0 | return STATUS(InvalidArgument, "Empty indexed table", |
4683 | 0 | index_table_identifier.ShortDebugString()); |
4684 | 0 | } |
4685 | |
4686 | | // TODO(jason): when ready to use INDEX_PERM_DO_BACKFILL for resuming backfill across master |
4687 | | // leader changes, replace the following (issue #6218). |
4688 | |
4689 | | // Collect index_info_pb. |
4690 | 89 | IndexInfoPB index_info_pb; |
4691 | 89 | indexed_table->GetIndexInfo(index_table->id()).ToPB(&index_info_pb); |
4692 | 89 | if (index_info_pb.index_permissions() != INDEX_PERM_WRITE_AND_DELETE) { |
4693 | 0 | return SetupError( |
4694 | 0 | resp->mutable_error(), |
4695 | 0 | MasterErrorPB::INVALID_SCHEMA, |
4696 | 0 | STATUS_FORMAT( |
4697 | 0 | InvalidArgument, |
4698 | 0 | "Expected WRITE_AND_DELETE perm, got $0", |
4699 | 0 | IndexPermissions_Name(index_info_pb.index_permissions()))); |
4700 | 0 | } |
4701 | |
4702 | 89 | return MultiStageAlterTable::StartBackfillingData( |
4703 | 89 | this, indexed_table, {index_info_pb}, boost::none); |
4704 | 89 | } |
4705 | | // Copies the backfill_jobs stored in the requested table's pb (read under LockForRead) into resp. FindTable errors propagate through VERIFY_RESULT; a null table is reported as OBJECT_NOT_FOUND in resp. `rpc` is not used. |
4706 | | Status CatalogManager::GetBackfillJobs( |
4707 | | const GetBackfillJobsRequestPB* req, |
4708 | | GetBackfillJobsResponsePB* resp, |
4709 | 695 | rpc::RpcContext* rpc) { |
4710 | 695 | const TableIdentifierPB& table_id = req->table_identifier(); |
4711 | | // STATUS_FORMAT (not STATUS) is needed below so that $0 in the message is actually substituted. |
4712 | 695 | scoped_refptr<TableInfo> indexed_table = VERIFY_RESULT(FindTable(table_id)); |
4713 | 695 | if (indexed_table == nullptr) { |
4714 | 0 | Status s = STATUS_FORMAT(NotFound, "Requested table $0 does not exist", table_id.ShortDebugString()); |
4715 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); |
4716 | 0 | } |
4717 | |
4718 | 695 | { |
4719 | 695 | auto l = indexed_table->LockForRead(); |
4720 | 695 | resp->mutable_backfill_jobs()->CopyFrom(l->pb.backfill_jobs()); |
4721 | 695 | } |
4722 | 695 | return Status::OK(); |
4723 | 695 | } |
4724 | | // Launches the next table-info version for a YCQL table via MultiStageAlterTable::LaunchNextTableInfoVersionIfNecessary, passing false for "respect deferrals for backfill". The table must be YQL_TABLE_TYPE and in state RUNNING; otherwise InvalidArgument is returned, or TryAgain is reported through resp. `rpc` is not used. |
4725 | | Status CatalogManager::LaunchBackfillIndexForTable( |
4726 | | const LaunchBackfillIndexForTableRequestPB* req, |
4727 | | LaunchBackfillIndexForTableResponsePB* resp, |
4728 | 1 | rpc::RpcContext* rpc) { |
4729 | 1 | const TableIdentifierPB& table_id = req->table_identifier(); |
4730 | | // The error statuses below use STATUS_FORMAT (not STATUS) so that $0 in the messages is substituted. |
4731 | 1 | scoped_refptr<TableInfo> indexed_table = VERIFY_RESULT(FindTable(table_id)); |
4732 | 1 | if (indexed_table == nullptr) { |
4733 | 0 | Status s = STATUS_FORMAT(NotFound, "Requested table $0 does not exist", table_id.ShortDebugString()); |
4734 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); |
4735 | 0 | } |
4736 | 1 | if (indexed_table->GetTableType() != YQL_TABLE_TYPE) { |
4737 | | // This request is only supported for YCQL for now. YSQL has its own mechanism. |
4738 | 0 | return STATUS_FORMAT(InvalidArgument, "Unexpected non-YCQL table $0", table_id.ShortDebugString()); |
4739 | 0 | } |
4740 | | // Read the state and version under the read lock, then release it before launching the next version. |
4741 | 1 | uint32_t current_version; |
4742 | 1 | { |
4743 | 1 | auto l = indexed_table->LockForRead(); |
4744 | 1 | if (l->pb.state() != SysTablesEntryPB::RUNNING) { |
4745 | 0 | Status s = STATUS_FORMAT(TryAgain, |
4746 | 0 | "The table is in state $0. An alter may already be in progress.", |
4747 | 0 | SysTablesEntryPB_State_Name(l->pb.state())); |
4748 | 0 | VLOG(2) << "Table " << indexed_table->ToString() << " is not running returning " << s; |
4749 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INTERNAL_ERROR, s); |
4750 | 0 | } |
4751 | 1 | current_version = l->pb.version(); |
4752 | 1 | } |
4753 | |
4754 | 1 | auto s = MultiStageAlterTable::LaunchNextTableInfoVersionIfNecessary( |
4755 | 1 | this, indexed_table, current_version, /* respect deferrals for backfill */ false); |
4756 | 1 | if (!s.ok()) { |
4757 | 0 | VLOG(3) << __func__ << " Done failed " << s; |
4758 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::UNKNOWN_ERROR, s); |
4759 | 0 | } |
4760 | 1 | return Status::OK(); |
4761 | 1 | } |
4762 | | // Starts removing an index's info from its indexed table. With multi_stage the index permission is moved to INDEX_PERM_WRITE_AND_DELETE_WHILE_REMOVING; otherwise the index info is deleted right away via DeleteIndexInfoFromTable. In both cases an alter-table request is then sent for the indexed table. When resp is non-null its indexed_table field is filled in. A missing indexed table only logs a warning and returns OK. |
4763 | | Status CatalogManager::MarkIndexInfoFromTableForDeletion( |
4764 | | const TableId& indexed_table_id, const TableId& index_table_id, bool multi_stage, |
4765 | 372 | DeleteTableResponsePB* resp) { |
4766 | | // Lookup the indexed table and verify if it exists. |
4767 | 372 | scoped_refptr<TableInfo> indexed_table = GetTableInfo(indexed_table_id); |
4768 | 372 | if (indexed_table == nullptr) { |
4769 | 0 | LOG(WARNING) << "Indexed table " << indexed_table_id << " for index " |
4770 | 0 | << index_table_id << " not found"; |
4771 | 0 | return Status::OK(); |
4772 | 0 | } |
4773 | | // Report the indexed table (namespace name, table name, id) back to the caller when a response is supplied. |
4774 | 372 | if (resp) { |
4775 | 372 | auto ns_info = VERIFY_RESULT(master_->catalog_manager()->FindNamespaceById( |
4776 | 372 | indexed_table->namespace_id())); |
4777 | 372 | auto* resp_indexed_table = resp->mutable_indexed_table(); |
4778 | 372 | resp_indexed_table->mutable_namespace_()->set_name(ns_info->name()); |
4779 | 372 | resp_indexed_table->set_table_name(indexed_table->name()); |
4780 | 372 | resp_indexed_table->set_table_id(indexed_table_id); |
4781 | 372 | } |
4782 | 372 | if (multi_stage) { |
4783 | 104 | RETURN_NOT_OK(MultiStageAlterTable::UpdateIndexPermission( |
4784 | 104 | this, indexed_table, |
4785 | 104 | {{index_table_id, IndexPermissions::INDEX_PERM_WRITE_AND_DELETE_WHILE_REMOVING}})); |
4786 | 268 | } else { |
4787 | 268 | RETURN_NOT_OK(DeleteIndexInfoFromTable(indexed_table_id, index_table_id)); |
4788 | 268 | } |
4789 | |
4790 | | // Actual Deletion of the index info will happen asynchronously after all the |
4791 | | // tablets move to the new IndexPermission of DELETE_ONLY_WHILE_REMOVING. |
4792 | 372 | RETURN_NOT_OK(SendAlterTableRequest(indexed_table)); |
4793 | 372 | return Status::OK(); |
4794 | 372 | } |
4795 | | // Removes the entry for index_table_id from the indexed table's pb.indexes(), bumps the table version, moves the table to ALTERING, persists the change with sys_catalog_->Upsert and commits the write lock. If the Upsert fails the function returns before Commit. A missing indexed table or a missing index entry only logs a warning and returns OK. |
4796 | | Status CatalogManager::DeleteIndexInfoFromTable( |
4797 | 268 | const TableId& indexed_table_id, const TableId& index_table_id) { |
4798 | 268 | scoped_refptr<TableInfo> indexed_table = GetTableInfo(indexed_table_id); |
4799 | 268 | if (indexed_table == nullptr) { |
4800 | 0 | LOG(WARNING) << "Indexed table " << indexed_table_id << " for index " << index_table_id |
4801 | 0 | << " not found"; |
4802 | 0 | return Status::OK(); |
4803 | 0 | } |
4804 | 268 | TRACE("Locking indexed table"); |
4805 | 268 | auto l = indexed_table->LockForWrite(); |
4806 | 268 | auto &indexed_table_data = *l.mutable_data(); |
4807 | |
4808 | | // Heed issue #6233. |
4809 | 268 | if (!l->pb.has_fully_applied_schema()) { |
4810 | 159 | MultiStageAlterTable::CopySchemaDetailsToFullyApplied(&indexed_table_data.pb); |
4811 | 159 | } |
4812 | 268 | auto *indexes = indexed_table_data.pb.mutable_indexes(); |
4813 | 290 | for (int i = 0; i < indexes->size(); i++) { |
4814 | 290 | if (indexes->Get(i).table_id() == index_table_id) { |
4815 | | // Found the index entry: drop it and return from inside the loop. |
4816 | 268 | indexes->DeleteSubrange(i, 1); |
4817 | |
4818 | 268 | indexed_table_data.pb.set_version(indexed_table_data.pb.version() + 1); |
4819 | | // TODO(Amit) : Is this compatible with the previous version? |
4820 | 268 | indexed_table_data.pb.set_updates_only_index_permissions(false); |
4821 | 268 | indexed_table_data.set_state( |
4822 | 268 | SysTablesEntryPB::ALTERING, |
4823 | 268 | Format("Delete index info version=$0 ts=$1", |
4824 | 268 | indexed_table_data.pb.version(), LocalTimeAsString())); |
4825 | |
4826 | | // Update sys-catalog with the deleted indexed table info. |
4827 | 268 | TRACE("Updating indexed table metadata on disk"); |
4828 | 268 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), indexed_table)); |
4829 | |
4830 | | // Update the in-memory state. |
4831 | 268 | TRACE("Committing in-memory state"); |
4832 | 268 | l.Commit(); |
4833 | 268 | return Status::OK(); |
4834 | 268 | } |
4835 | 290 | } |
4836 | |
4837 | 0 | LOG(WARNING) << "Index " << index_table_id << " not found in indexed table " << indexed_table_id; |
4838 | 0 | return Status::OK(); |
4839 | 268 | } |
4840 | | // Entry point for the DeleteTable RPC. Rejects CDC-enabled tables unless FLAGS_enable_delete_truncate_xcluster_replicated_table is set. For an index whose indexed table is not YSQL and for which index backfill is enabled, only marks the index info for multi-stage deletion; every other request goes to DeleteTableInternal. |
4841 | | Status CatalogManager::DeleteTable( |
4842 | 2.49k | const DeleteTableRequestPB* req, DeleteTableResponsePB* resp, rpc::RpcContext* rpc) { |
4843 | 2.49k | LOG(INFO) << "Servicing DeleteTable request from " << RequestorString(rpc) << ": " |
4844 | 2.49k | << req->ShortDebugString(); |
4845 | |
4846 | 2.47k | scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table())); |
4847 | 2.47k | bool result = IsCdcEnabled(*table); |
4848 | 2.47k | if (!FLAGS_enable_delete_truncate_xcluster_replicated_table && result) { |
4849 | 1 | return STATUS(NotSupported, |
4850 | 1 | "Cannot delete a table in replication.", |
4851 | 1 | req->ShortDebugString(), |
4852 | 1 | MasterError(MasterErrorPB::INVALID_REQUEST)); |
4853 | 1 | } |
4854 | |
4855 | 2.47k | if (req->is_index_table()) { |
4856 | 264 | TRACE("Looking up index"); |
4857 | 264 | TableId table_id = table->id(); |
4858 | 264 | resp->set_table_id(table_id); |
4859 | 264 | TableId indexed_table_id; |
4860 | 264 | { |
4861 | 264 | auto l = table->LockForRead(); |
4862 | 264 | indexed_table_id = GetIndexedTableId(l->pb); |
4863 | 264 | } |
4864 | 264 | scoped_refptr<TableInfo> indexed_table = GetTableInfo(indexed_table_id); |
4865 | 264 | const bool is_pg_table = indexed_table != nullptr && |
4866 | 264 | indexed_table->GetTableType() == PGSQL_TABLE_TYPE; |
4867 | 264 | bool is_transactional; |
4868 | 264 | { |
4869 | 264 | Schema index_schema; |
4870 | 264 | RETURN_NOT_OK(table->GetSchema(&index_schema)); |
4871 | 264 | is_transactional = index_schema.table_properties().is_transactional(); |
4872 | 264 | } |
4873 | 264 | const bool index_backfill_enabled = |
4874 | 264 | IsIndexBackfillEnabled(table->GetTableType(), is_transactional); |
4875 | 264 | if (!is_pg_table && index_backfill_enabled) { |
4876 | 104 | return MarkIndexInfoFromTableForDeletion( |
4877 | 104 | indexed_table_id, table_id, /* multi_stage */ true, resp); |
4878 | 104 | } |
4879 | 2.36k | } |
4880 | | // Regular tables, and indexes that did not take the multi-stage path above, are deleted here. |
4881 | 2.36k | return DeleteTableInternal(req, resp, rpc); |
4882 | 2.36k | } |
4883 | | |
4884 | | // Delete a Table |
4885 | | // - Update the table state to "DELETING". |
4886 | | // - Issue DeleteTablet tasks to all said tablets. |
4887 | | // - Update all the underlying tablet states as "DELETED". |
4888 | | // |
4889 | | // This order of events can help us guarantee that: |
4890 | | // - If a table is DELETING/DELETED, we do not add further tasks to it. |
4891 | | // - A DeleteTable is done when a table is either DELETING or DELETED and has no running tasks. |
4892 | | // - If a table is DELETING and it has no tasks on it, then it is safe to mark DELETED. |
4893 | | // |
4894 | | // We are lazy about deletions. |
4895 | | // |
4896 | | // IMPORTANT: If modifying, consider updating DeleteYsqlDBTables(), the bulk deletion API. |
4897 | | Status CatalogManager::DeleteTableInternal( |
4898 | 2.51k | const DeleteTableRequestPB* req, DeleteTableResponsePB* resp, rpc::RpcContext* rpc) { |
4899 | 2.51k | auto schedules_to_tables_map = VERIFY_RESULT( |
4900 | 2.51k | MakeSnapshotSchedulesToObjectIdsMap(SysRowEntryType::TABLE)); |
4901 | | // DeleteTableInMemory fills `tables` with the requested table and any indexes, each with its write lock still held. |
4902 | 2.51k | vector<DeletingTableData> tables; |
4903 | 2.51k | RETURN_NOT_OK(DeleteTableInMemory(req->table(), req->is_index_table(), |
4904 | 2.51k | true /* update_indexed_table */, schedules_to_tables_map, |
4905 | 2.51k | &tables, resp, rpc)); |
4906 | |
4907 | | // Update the in-memory state. |
4908 | 2.47k | TRACE("Committing in-memory state"); |
4909 | 2.47k | std::unordered_set<TableId> sys_table_ids; |
4910 | 2.74k | for (auto& table : tables) { |
4911 | 2.74k | if (IsSystemTable(*table.info)) { |
4912 | 0 | sys_table_ids.insert(table.info->id()); |
4913 | 0 | } |
4914 | 2.74k | table.write_lock.Commit(); |
4915 | 2.74k | } |
4916 | |
4917 | | // Delete any CDC streams that are set up on this table, after releasing the Table lock. |
4918 | 2.47k | TRACE("Deleting CDC streams on table"); |
4919 | | // table_id for the requested table will be added to the end of the response. |
4920 | 2.47k | RSTATUS_DCHECK_GE(resp->deleted_table_ids_size(), 1, IllegalState, |
4921 | 2.47k | "DeleteTableInMemory expected to add the index id to resp"); |
4922 | 2.47k | RETURN_NOT_OK( |
4923 | 2.47k | DeleteCDCStreamsForTable(resp->deleted_table_ids(resp->deleted_table_ids_size() - 1))); |
4924 | | // Latency injection; by default only active when the flag is set to a positive value. |
4925 | 2.47k | if (PREDICT_FALSE(FLAGS_catalog_manager_inject_latency_in_delete_table_ms > 0)) { |
4926 | 2 | LOG(INFO) << "Sleeping in CatalogManager::DeleteTable for " << |
4927 | 2 | FLAGS_catalog_manager_inject_latency_in_delete_table_ms << " ms"; |
4928 | 2 | SleepFor(MonoDelta::FromMilliseconds(FLAGS_catalog_manager_inject_latency_in_delete_table_ms)); |
4929 | 2 | } |
4930 | |
4931 | | // Update the internal table maps. Exclude Postgres tables which are not in the name map. |
4932 | | // Also exclude hidden tables, that were already removed from this map. |
4933 | 2.47k | if (std::any_of(tables.begin(), tables.end(), [](auto& t) { return t.remove_from_name_map; })) { |
4934 | 1.26k | TRACE("Removing tables from by-name map"); |
4935 | 1.26k | LockGuard lock(mutex_); |
4936 | 1.54k | for (const auto& table : tables) { |
4937 | 1.54k | if (table.remove_from_name_map) { |
4938 | 1.54k | TableInfoByNameMap::key_type key = {table.info->namespace_id(), table.info->name()}; |
4939 | 1.54k | if (table_names_map_.erase(key) != 1) { |
4940 | 0 | LOG(WARNING) << "Could not remove table from map: " << key.first << "." << key.second; |
4941 | 0 | } |
4942 | |
4943 | | // Also remove from the system.partitions table. |
4944 | 1.54k | GetYqlPartitionsVtable().RemoveFromCache(table.info->id()); |
4945 | |
4946 | | // Remove matviews from matview to pg table id map |
4947 | 1.54k | matview_pg_table_ids_map_.erase(table.info->id()); |
4948 | 1.54k | } |
4949 | 1.54k | } |
4950 | | // table_names_map_ has no version of its own, so we commit table_ids_map_ instead |
4951 | | // to increment its version and reset the cache. |
4952 | 1.26k | table_ids_map_.Commit(); |
4953 | 1.26k | } |
4954 | |
4955 | 2.74k | for (const auto& table : tables) { |
4956 | 2.74k | LOG(INFO) << "Deleting table: " << table.info->name() << ", retained by: " |
4957 | 2.74k | << AsString(table.retained_by_snapshot_schedules, &Uuid::TryFullyDecode); |
4958 | |
4959 | | // Send a DeleteTablet() request to each tablet replica in the table. |
4960 | 2.74k | RETURN_NOT_OK(DeleteTabletsAndSendRequests(table.info, table.retained_by_snapshot_schedules)); |
4961 | | // Send a RemoveTableFromTablet() request to each colocated parent tablet replica in the table. |
4962 | | // TODO(pitr) handle YSQL colocated tables. |
4963 | 2.74k | if (table.info->IsColocatedUserTable()) { |
4964 | 15 | auto call = std::make_shared<AsyncRemoveTableFromTablet>( |
4965 | 15 | master_, AsyncTaskPool(), table.info->GetColocatedTablet(), table.info); |
4966 | 15 | table.info->AddTask(call); |
4967 | 15 | WARN_NOT_OK(ScheduleTask(call), "Failed to send RemoveTableFromTablet request"); |
4968 | 15 | } |
4969 | 2.74k | } |
4970 | |
4971 | | // If there are any permissions granted on this table find them and delete them. This is necessary |
4972 | | // because we keep track of the permissions based on the canonical resource name which is a |
4973 | | // combination of the keyspace and table names, so if another table with the same name is created |
4974 | | // (in the same keyspace where the previous one existed), and the permissions were not deleted at |
4975 | | // the time of the previous table deletion, then the permissions that existed for the previous |
4976 | | // table will automatically be granted to the new table even though this wasn't the intention. |
4977 | 2.47k | string canonical_resource = get_canonical_table(req->table().namespace_().name(), |
4978 | 2.47k | req->table().table_name()); |
4979 | 2.47k | RETURN_NOT_OK(permissions_manager_->RemoveAllPermissionsForResource(canonical_resource, resp)); |
4980 | |
4981 | | // Remove the system tables from system catalog. |
4982 | 2.47k | if (!sys_table_ids.empty()) { |
4983 | | // We do not expect system tables deletion during initial snapshot forming. |
4984 | 0 | DCHECK(!initial_snapshot_writer_); |
4985 | 

4986 | 0 | TRACE("Sending system table delete RPCs"); |
4987 | 0 | for (auto& table_id : sys_table_ids) { |
4988 | | // "sys_catalog_->DeleteYsqlSystemTable(table_id)" won't work here |
4989 | | // as it only acts on the leader. |
4990 | 0 | tablet::ChangeMetadataRequestPB change_req; |
4991 | 0 | change_req.set_tablet_id(kSysCatalogTabletId); |
4992 | 0 | change_req.set_remove_table_id(table_id); |
4993 | 0 | RETURN_NOT_OK(tablet::SyncReplicateChangeMetadataOperation( |
4994 | 0 | &change_req, sys_catalog_->tablet_peer().get(), leader_ready_term())); |
4995 | 0 | } |
4996 | 2.47k | } else { |
4997 | 2.47k | TRACE("No system tables to delete"); |
4998 | 2.47k | } |
4999 | |
5000 | 2.47k | LOG(INFO) << "Successfully initiated deletion of " |
5001 | 2.20k | << (req->is_index_table() ? "index" : "table") << " with " |
5002 | 2.47k | << req->table().DebugString() << " per request from " << RequestorString(rpc); |
5003 | | // Asynchronously cleans up the final memory traces of the deleted table. |
5004 | 2.47k | background_tasks_->Wake(); |
5005 | 2.47k | return Status::OK(); |
5006 | 2.47k | } |
5007 | | // Marks one table or index as DELETING (or as HIDING when a snapshot schedule retains it), persists that state through sys_catalog_, and stores the still-held write lock in *tables for the caller to commit. For a base table it first recurses into every index in pb.indexes(); for an index with update_indexed_table it also removes the index info from the indexed table. The table id is appended to resp->deleted_table_ids() before any validation. Base tables are inserted at the front of *tables and indexes at the end. |
5008 | | Status CatalogManager::DeleteTableInMemory( |
5009 | | const TableIdentifierPB& table_identifier, |
5010 | | const bool is_index_table, |
5011 | | const bool update_indexed_table, |
5012 | | const SnapshotSchedulesToObjectIdsMap& schedules_to_tables_map, |
5013 | | vector<DeletingTableData>* tables, |
5014 | | DeleteTableResponsePB* resp, |
5015 | 2.80k | rpc::RpcContext* rpc) { |
5016 | | // TODO(NIC): How to handle a DeleteTable request when the namespace is being deleted? |
5017 | 2.20k | const char* const object_type = is_index_table ? "index" : "table"; |
5018 | 2.80k | const bool cascade_delete_index = is_index_table && !update_indexed_table; |
5019 | |
5020 | 0 | VLOG_WITH_PREFIX_AND_FUNC(1) << YB_STRUCT_TO_STRING( |
5021 | 0 | table_identifier, is_index_table, update_indexed_table) << "\n" << GetStackTrace(); |
5022 | |
5023 | | // Lookup the table and verify if it exists. |
5024 | 2.80k | TRACE(Substitute("Looking up $0", object_type)); |
5025 | 2.80k | auto table_result = FindTable(table_identifier); |
5026 | 2.80k | if (!VERIFY_RESULT(DoesTableExist(table_result))) { |
5027 | 0 | if (cascade_delete_index) { |
5028 | 0 | LOG(WARNING) << "Index " << table_identifier.DebugString() << " not found"; |
5029 | 0 | return Status::OK(); |
5030 | 0 | } else { |
5031 | 0 | return table_result.status(); |
5032 | 0 | } |
5033 | 2.80k | } |
5034 | 2.80k | auto table = std::move(*table_result); |
5035 | |
5036 | 2.80k | TRACE(Substitute("Locking $0", object_type)); |
5037 | 2.80k | auto data = DeletingTableData { |
5038 | 2.80k | .info = table, |
5039 | 2.80k | .write_lock = table->LockForWrite(), |
5040 | 2.80k | .retained_by_snapshot_schedules = RepeatedBytes(), |
5041 | 2.80k | .remove_from_name_map = false |
5042 | 2.80k | }; |
5043 | 2.80k | auto& l = data.write_lock; |
5044 | | // table_id for the requested table will be added to the end of the response. |
5045 | 2.80k | *resp->add_deleted_table_ids() = table->id(); |
5046 | | // The caller's is_index_table flag must match what the table's pb says it is. |
5047 | 2.80k | if (is_index_table == IsTable(l->pb)) { |
5048 | 0 | Status s = STATUS(NotFound, "The object does not exist"); |
5049 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); |
5050 | 0 | } |
5051 | |
5052 | 2.80k | FillRetainedBySnapshotSchedules( |
5053 | 2.80k | schedules_to_tables_map, table->id(), &data.retained_by_snapshot_schedules); |
5054 | 2.80k | bool hide_only = !data.retained_by_snapshot_schedules.empty(); |
5055 | |
5056 | 2.80k | if (l->started_deleting() || (hide_only && l->started_hiding())) { |
5057 | 34 | if (cascade_delete_index) { |
5058 | 0 | LOG(WARNING) << "Index " << table_identifier.ShortDebugString() << " was " |
5059 | 0 | << (l->started_deleting() ? "deleted" : "hidden"); |
5060 | 0 | return Status::OK(); |
5061 | 34 | } else { |
5062 | 34 | Status s = STATUS(NotFound, "The object was deleted", l->pb.state_msg()); |
5063 | 34 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); |
5064 | 34 | } |
5065 | 2.77k | } |
5066 | |
5067 | | // Determine if we have to remove from the name map here before we change the table state. |
5068 | 2.77k | data.remove_from_name_map = l.data().table_type() != PGSQL_TABLE_TYPE && !l->started_hiding(); |
5069 | |
5070 | 2.77k | TRACE("Updating metadata on disk"); |
5071 | | // Update the metadata for the on-disk state. |
5072 | 2.77k | if (hide_only) { |
5073 | 0 | l.mutable_data()->pb.set_hide_state(SysTablesEntryPB::HIDING); |
5074 | 2.77k | } else { |
5075 | 2.77k | l.mutable_data()->set_state(SysTablesEntryPB::DELETING, |
5076 | 2.77k | Substitute("Started deleting at $0", LocalTimeAsString())); |
5077 | 2.77k | } |
5078 | | // For an index, the DDL log entry is recorded against the indexed table (when it can be found) as "Drop index <name>". |
5079 | 2.77k | auto now = master_->clock()->Now(); |
5080 | 2.77k | DdlLogEntry ddl_log_entry(now, table->id(), l->pb, "Drop"); |
5081 | 2.77k | if (is_index_table) { |
5082 | 563 | const auto& indexed_table_id = GetIndexedTableId(l->pb); |
5083 | 563 | auto indexed_table = FindTableById(indexed_table_id); |
5084 | 563 | if (indexed_table.ok()) { |
5085 | 563 | auto lock = (**indexed_table).LockForRead(); |
5086 | 563 | ddl_log_entry = DdlLogEntry( |
5087 | 563 | now, indexed_table_id, lock->pb, Format("Drop index $0", l->name())); |
5088 | 563 | } |
5089 | 563 | } |
5090 | |
5091 | | // Update sys-catalog with the removed table state. |
5092 | 2.77k | Status s = sys_catalog_->Upsert(leader_ready_term(), &ddl_log_entry, table); |
5093 | | // Test-only hook: returns OK right after the Upsert, before its status is checked and before `data` reaches *tables. |
5094 | 2.77k | if (PREDICT_FALSE(FLAGS_TEST_simulate_crash_after_table_marked_deleting)) { |
5095 | 1 | return Status::OK(); |
5096 | 1 | } |
5097 | |
5098 | 2.77k | if (!s.ok()) { |
5099 | | // The mutation will be aborted when 'l' exits the scope on early return. |
5100 | 2 | s = s.CloneAndPrepend("An error occurred while updating sys tables"); |
5101 | 2 | LOG(WARNING) << s; |
5102 | 2 | return CheckIfNoLongerLeaderAndSetupError(s, resp); |
5103 | 2 | } |
5104 | |
5105 | | // For regular (indexed) table, delete all its index tables if any. Else for index table, delete |
5106 | | // index info from the indexed table. |
5107 | 2.76k | if (!is_index_table) { |
5108 | 2.19k | TableIdentifierPB index_identifier; |
5109 | 294 | for (const auto& index : l->pb.indexes()) { |
5110 | 294 | index_identifier.set_table_id(index.table_id()); |
5111 | 294 | RETURN_NOT_OK(DeleteTableInMemory(index_identifier, true /* is_index_table */, |
5112 | 294 | false /* update_indexed_table */, schedules_to_tables_map, |
5113 | 294 | tables, resp, rpc)); |
5114 | 294 | } |
5115 | 574 | } else if (update_indexed_table) { |
5116 | 268 | s = MarkIndexInfoFromTableForDeletion( |
5117 | 268 | GetIndexedTableId(l->pb), table->id(), /* multi_stage */ false, resp); |
5118 | 268 | if (!s.ok()) { |
5119 | | // Prepend only a prefix, as the sys-tables error path above does; embedding s.ToString() here would repeat the status text. |
5120 | 0 | s = s.CloneAndPrepend("An error occurred while deleting index info"); |
5121 | 0 | LOG(WARNING) << s.ToString(); |
5122 | 0 | return CheckIfNoLongerLeaderAndSetupError(s, resp); |
5123 | 0 | } |
5124 | 2.76k | } |
5125 | |
5126 | 2.76k | if (!hide_only) { |
5127 | | // If table is being hidden we should not abort snapshot related tasks. |
5128 | 2.75k | table->AbortTasks(); |
5129 | 2.75k | } |
5130 | |
5131 | | // For regular (indexed) table, insert table info and lock in the front of the list. Else for |
5132 | | // index table, append them to the end. We do so so that we will commit and delete the indexed |
5133 | | // table first before its indexes. |
5134 | 2.20k | tables->insert(is_index_table ? tables->end() : tables->begin(), std::move(data)); |
5135 | |
5136 | 2.76k | return Status::OK(); |
5137 | 2.76k | } |
5138 | | // Decides whether a table can move from HIDING to HIDDEN or from DELETING to DELETED. If so, returns a held write lock whose mutable data already carries the new state and clears the table's tablet maps; the returned lock is not committed here. Otherwise returns an empty TableInfo::WriteLock: when the table is null, still has tasks, is already deleted, is neither deleting nor hiding, or still has tablets that are not done. System tables and colocated user tables skip the all-tablets-done requirement. |
5139 | 3.41M | TableInfo::WriteLock CatalogManager::MaybeTransitionTableToDeleted(const TableInfoPtr& table) { |
5140 | 3.41M | if (!table) { |
5141 | 0 | LOG_WITH_PREFIX(INFO) << "Finished deleting an Orphaned tablet. " |
5142 | 0 | << "Table Information is null. Skipping updating its state to DELETED."; |
5143 | 0 | return TableInfo::WriteLock(); |
5144 | 0 | } |
5145 | 3.41M | if (table->HasTasks()) { |
5146 | 0 | VLOG_WITH_PREFIX_AND_FUNC(2) << table->ToString() << " has tasks"; |
5147 | 54.3k | return TableInfo::WriteLock(); |
5148 | 54.3k | } |
5149 | 3.35M | bool hide_only; |
5150 | 3.35M | { |
5151 | 3.35M | auto lock = table->LockForRead(); |
5152 | |
5153 | | // For any table in DELETING state, we will want to mark it as DELETED once all its respective |
5154 | | // tablets have been successfully removed from tservers. |
5155 | | // For any hiding table we will want to mark it as HIDDEN once all its respective |
5156 | | // tablets have been successfully hidden on tservers. |
5157 | 3.35M | if (lock->is_deleted()) { |
5158 | | // Clear the tablets_ and partitions_ maps if table has already been DELETED. |
5159 | | // Usually this would have been done except for tables that were hidden and are now deleted. |
5160 | | // Also, this is a catch all in case any other path misses clearing the maps. |
5161 | 183k | table->ClearTabletMaps(); |
5162 | 183k | return TableInfo::WriteLock(); |
5163 | 183k | } |
5164 | 3.17M | hide_only = !lock->is_deleting(); |
5165 | 3.17M | if (hide_only && !lock->is_hiding()) { |
5166 | 3.16M | return TableInfo::WriteLock(); |
5167 | 3.16M | } |
5168 | 6.12k | } |
5169 | | // The current relevant order of operations during a DeleteTable is: |
5170 | | // 1) Mark the table as DELETING |
5171 | | // 2) Abort the current table tasks |
5172 | | // 3) Per tablet, send DeleteTable requests to all TS, then mark that tablet as DELETED |
5173 | | // |
5174 | | // This creates a race, wherein, after 2, HasTasks can be false, but we still have not |
5175 | | // gotten to point 3, which would add further tasks for the deletes. |
5176 | | // |
5177 | | // However, HasTasks is cheaper than AreAllTabletsDeletedOrHidden... |
5178 | 6.12k | auto all_tablets_done = hide_only ? table->AreAllTabletsHidden() : table->AreAllTabletsDeleted(); |
5179 | 18.4E | VLOG_WITH_PREFIX_AND_FUNC(2) |
5180 | 18.4E | << table->ToString() << " hide only: " << hide_only << ", all tablets done: " |
5181 | 18.4E | << all_tablets_done; |
5182 | 6.12k | if (!all_tablets_done && !IsSystemTable(*table) && !table->IsColocatedUserTable()) { |
5183 | 104 | return TableInfo::WriteLock(); |
5184 | 104 | } |
5185 | | // The state is re-checked under the write lock because the read lock above was released in between. |
5186 | 6.01k | auto lock = table->LockForWrite(); |
5187 | 6.01k | if (lock->is_hiding()) { |
5188 | 0 | LOG(INFO) << "Marking table as HIDDEN: " << table->ToString(); |
5189 | 0 | lock.mutable_data()->pb.set_hide_state(SysTablesEntryPB::HIDDEN); |
5190 | | // Erase all the tablets from partitions_ structure. |
5191 | 0 | table->ClearTabletMaps(DeactivateOnly::kTrue); |
5192 | 0 | return lock; |
5193 | 0 | } |
5194 | 6.01k | if (lock->is_deleting()) { |
5195 | | // Update the metadata for the on-disk state. |
5196 | 5.05k | LOG(INFO) << "Marking table as DELETED: " << table->ToString(); |
5197 | 5.05k | lock.mutable_data()->set_state(SysTablesEntryPB::DELETED, |
5198 | 5.05k | Substitute("Deleted with tablets at $0", LocalTimeAsString())); |
5199 | | // Erase all the tablets from tablets_ and partitions_ structures. |
5200 | 5.05k | table->ClearTabletMaps(); |
5201 | 5.05k | return lock; |
5202 | 5.05k | } |
5203 | 965 | return TableInfo::WriteLock(); |
5204 | 965 | } |
5205 | | |
5206 | 17.2k | void CatalogManager::CleanUpDeletedTables() { |
5207 | | // TODO(bogdan): Cache tables being deleted to make this iterate only over those? |
5208 | 17.2k | vector<scoped_refptr<TableInfo>> tables_to_delete; |
5209 | | // Garbage collecting. |
5210 | | // Going through all tables under the global lock, copying them to not hold lock for too long. |
5211 | 17.2k | TableInfoMap copy_of_table_by_id_map; |
5212 | 17.2k | { |
5213 | 17.2k | LockGuard lock(mutex_); |
5214 | 17.2k | copy_of_table_by_id_map = *table_ids_map_; |
5215 | 17.2k | } |
5216 | | // Mark the tables as DELETED and remove them from the in-memory maps. |
5217 | 17.2k | vector<TableInfo*> tables_to_update_on_disk; |
5218 | 17.2k | vector<TableInfo::WriteLock> table_locks; |
5219 | 3.33M | for (const auto& it : copy_of_table_by_id_map) { |
5220 | 3.33M | const auto& table = it.second; |
5221 | 3.33M | auto lock = MaybeTransitionTableToDeleted(table); |
5222 | 3.33M | if (lock.locked()) { |
5223 | 2.29k | table_locks.push_back(std::move(lock)); |
5224 | 2.29k | tables_to_update_on_disk.push_back(table.get()); |
5225 | 2.29k | } |
5226 | 3.33M | } |
5227 | 17.2k | if (tables_to_update_on_disk.size() > 0) { |
5228 | 32 | Status s = sys_catalog_->Upsert(leader_ready_term(), tables_to_update_on_disk); |
5229 | 32 | if (!s.ok()) { |
5230 | 1 | LOG(WARNING) << "Error marking tables as DELETED: " << s.ToString(); |
5231 | 1 | return; |
5232 | 1 | } |
5233 | | // Update the table in-memory info as DELETED after we've removed them from the maps. |
5234 | 2.28k | for (auto& lock : table_locks) { |
5235 | 2.28k | lock.Commit(); |
5236 | 2.28k | } |
5237 | | // TODO: Check if we want to delete the totally deleted table from the sys_catalog here. |
5238 | | // TODO: SysCatalog::DeleteItem() if we've DELETED all user tables in a DELETING namespace. |
5239 | | // TODO: Also properly handle namespace_ids_map_.erase(table->namespace_id()) |
5240 | 31 | } |
5241 | 17.2k | } |
5242 | | |
5243 | | Status CatalogManager::IsDeleteTableDone(const IsDeleteTableDoneRequestPB* req, |
5244 | 5.58k | IsDeleteTableDoneResponsePB* resp) { |
5245 | | // Lookup the deleted table. |
5246 | 5.58k | TRACE("Looking up table $0", req->table_id()); |
5247 | 5.58k | scoped_refptr<TableInfo> table; |
5248 | 5.58k | { |
5249 | 5.58k | SharedLock lock(mutex_); |
5250 | 5.58k | table = FindPtrOrNull(*table_ids_map_, req->table_id()); |
5251 | 5.58k | } |
5252 | | |
5253 | 5.58k | if (table == nullptr) { |
5254 | 2 | LOG(INFO) << "Servicing IsDeleteTableDone request for table id " |
5255 | 2 | << req->table_id() << ": deleted (not found)"; |
5256 | 2 | resp->set_done(true); |
5257 | 2 | return Status::OK(); |
5258 | 2 | } |
5259 | | |
5260 | 5.58k | TRACE("Locking table"); |
5261 | 5.58k | auto l = table->LockForRead(); |
5262 | | |
5263 | 5.58k | if (!l->started_deleting() && !l->started_hiding()) { |
5264 | 104 | LOG(WARNING) << "Servicing IsDeleteTableDone request for table id " |
5265 | 104 | << req->table_id() << ": NOT deleted"; |
5266 | 104 | Status s = STATUS(IllegalState, "The object was NOT deleted", l->pb.state_msg()); |
5267 | 104 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); |
5268 | 104 | } |
5269 | | |
5270 | | // Temporary fix for github issue #5290. |
5271 | | // TODO: Wait till deletion completed for tablegroup parent table. |
5272 | 5.48k | if (table->IsTablegroupParentTable()) { |
5273 | 0 | LOG(INFO) << "Servicing IsDeleteTableDone request for tablegroup parent table id " |
5274 | 0 | << req->table_id() << ": deleting. Skipping wait for DELETED state."; |
5275 | 0 | resp->set_done(true); |
5276 | 0 | return Status::OK(); |
5277 | 0 | } |
5278 | | |
5279 | 5.48k | if (l->is_deleted() || l->is_hidden()) { |
5280 | 2.61k | LOG(INFO) << "Servicing IsDeleteTableDone request for table id " |
5281 | 2.61k | << req->table_id() << ": totally " << (l->is_hidden() ? "hidden" : "deleted"); |
5282 | 2.61k | resp->set_done(true); |
5283 | 2.86k | } else { |
5284 | 2.86k | LOG(INFO) << "Servicing IsDeleteTableDone request for table id " << req->table_id() |
5285 | 2.84k | << ((!table->IsColocatedUserTable()) ? ": deleting tablets" : ""); |
5286 | | |
5287 | 2.86k | std::vector<std::shared_ptr<TSDescriptor>> descs; |
5288 | 2.86k | master_->ts_manager()->GetAllDescriptors(&descs); |
5289 | 8.52k | for (auto& ts_desc : descs) { |
5290 | 8.52k | LOG(INFO) << "Deleting on " << ts_desc->permanent_uuid() << ": " |
5291 | 8.52k | << ts_desc->PendingTabletDeleteToString(); |
5292 | 8.52k | } |
5293 | | |
5294 | 2.86k | resp->set_done(false); |
5295 | 2.86k | } |
5296 | | |
5297 | 5.48k | return Status::OK(); |
5298 | 5.48k | } |
5299 | | |
5300 | | namespace { |
5301 | | |
5302 | | CHECKED_STATUS ApplyAlterSteps(server::Clock* clock, |
5303 | | const TableId& table_id, |
5304 | | const SysTablesEntryPB& current_pb, |
5305 | | const AlterTableRequestPB* req, |
5306 | | Schema* new_schema, |
5307 | | ColumnId* next_col_id, |
5308 | 276 | std::vector<DdlLogEntry>* ddl_log_entries) { |
5309 | 276 | const SchemaPB& current_schema_pb = current_pb.schema(); |
5310 | 276 | Schema cur_schema; |
5311 | 276 | RETURN_NOT_OK(SchemaFromPB(current_schema_pb, &cur_schema)); |
5312 | | |
5313 | 276 | SchemaBuilder builder(cur_schema); |
5314 | 276 | if (current_pb.has_next_column_id()) { |
5315 | 276 | builder.set_next_column_id(ColumnId(current_pb.next_column_id())); |
5316 | 276 | } |
5317 | 276 | if (current_pb.has_colocated() && current_pb.colocated()) { |
5318 | 2 | if (current_schema_pb.table_properties().is_ysql_catalog_table()) { |
5319 | 0 | Uuid cotable_id; |
5320 | 0 | RETURN_NOT_OK(cotable_id.FromHexString(req->table().table_id())); |
5321 | 0 | builder.set_cotable_id(cotable_id); |
5322 | 2 | } else { |
5323 | 2 | uint32_t pgtable_id = VERIFY_RESULT(GetPgsqlTableOid(req->table().table_id())); |
5324 | 2 | builder.set_pgtable_id(pgtable_id); |
5325 | 2 | } |
5326 | 2 | } |
5327 | | |
5328 | 284 | for (const AlterTableRequestPB::Step& step : req->alter_schema_steps()) { |
5329 | 284 | auto time = clock->Now(); |
5330 | 284 | switch (step.type()) { |
5331 | 173 | case AlterTableRequestPB::ADD_COLUMN: { |
5332 | 173 | if (!step.has_add_column()) { |
5333 | 0 | return STATUS(InvalidArgument, "ADD_COLUMN missing column info"); |
5334 | 0 | } |
5335 | | |
5336 | | // Verify that encoding is appropriate for the new column's type. |
5337 | 173 | ColumnSchemaPB new_col_pb = step.add_column().schema(); |
5338 | 173 | if (new_col_pb.has_id()) { |
5339 | 0 | return STATUS_SUBSTITUTE(InvalidArgument, |
5340 | 0 | "column $0: client should not specify column id", new_col_pb.ShortDebugString()); |
5341 | 0 | } |
5342 | 173 | ColumnSchema new_col = ColumnSchemaFromPB(new_col_pb); |
5343 | | |
5344 | 173 | RETURN_NOT_OK(builder.AddColumn(new_col, false)); |
5345 | 173 | ddl_log_entries->emplace_back(time, table_id, current_pb, Format("Add column $0", new_col)); |
5346 | 173 | break; |
5347 | 173 | } |
5348 | | |
5349 | 96 | case AlterTableRequestPB::DROP_COLUMN: { |
5350 | 96 | if (!step.has_drop_column()) { |
5351 | 0 | return STATUS(InvalidArgument, "DROP_COLUMN missing column info"); |
5352 | 0 | } |
5353 | | |
5354 | 96 | if (cur_schema.is_key_column(step.drop_column().name())) { |
5355 | 0 | return STATUS(InvalidArgument, "cannot remove a key column"); |
5356 | 0 | } |
5357 | | |
5358 | 96 | RETURN_NOT_OK(builder.RemoveColumn(step.drop_column().name())); |
5359 | 96 | ddl_log_entries->emplace_back( |
5360 | 96 | time, table_id, current_pb, Format("Drop column $0", step.drop_column().name())); |
5361 | 96 | break; |
5362 | 96 | } |
5363 | | |
5364 | 15 | case AlterTableRequestPB::RENAME_COLUMN: { |
5365 | 15 | if (!step.has_rename_column()) { |
5366 | 0 | return STATUS(InvalidArgument, "RENAME_COLUMN missing column info"); |
5367 | 0 | } |
5368 | | |
5369 | 15 | RETURN_NOT_OK(builder.RenameColumn( |
5370 | 15 | step.rename_column().old_name(), |
5371 | 15 | step.rename_column().new_name())); |
5372 | 15 | ddl_log_entries->emplace_back( |
5373 | 15 | time, table_id, current_pb, |
5374 | 15 | Format("Rename column $0 => $1", step.rename_column().old_name(), |
5375 | 15 | step.rename_column().new_name())); |
5376 | 15 | break; |
5377 | 15 | } |
5378 | | |
5379 | | // TODO: EDIT_COLUMN. |
5380 | | |
5381 | 0 | default: { |
5382 | 0 | return STATUS_SUBSTITUTE(InvalidArgument, "Invalid alter step type: $0", step.type()); |
5383 | 15 | } |
5384 | 284 | } |
5385 | 284 | } |
5386 | | |
5387 | 276 | if (req->has_alter_properties()) { |
5388 | 7 | RETURN_NOT_OK(builder.AlterProperties(req->alter_properties())); |
5389 | 7 | } |
5390 | | |
5391 | 276 | *new_schema = builder.Build(); |
5392 | 276 | *next_col_id = builder.next_column_id(); |
5393 | 276 | return Status::OK(); |
5394 | 276 | } |
5395 | | |
5396 | | } // namespace |
5397 | | |
5398 | | Status CatalogManager::AlterTable(const AlterTableRequestPB* req, |
5399 | | AlterTableResponsePB* resp, |
5400 | 2.86k | rpc::RpcContext* rpc) { |
5401 | 2.86k | LOG_WITH_PREFIX(INFO) << "Servicing " << __func__ << " request from " << RequestorString(rpc) |
5402 | 2.86k | << ": " << req->ShortDebugString(); |
5403 | | |
5404 | 2.86k | std::vector<DdlLogEntry> ddl_log_entries; |
5405 | | |
5406 | | // Lookup the table and verify if it exists. |
5407 | 2.86k | TRACE("Looking up table"); |
5408 | 2.86k | scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table())); |
5409 | | |
5410 | 2.86k | NamespaceId new_namespace_id; |
5411 | | |
5412 | 2.86k | if (req->has_new_namespace()) { |
5413 | | // Lookup the new namespace and verify if it exists. |
5414 | 46 | TRACE("Looking up new namespace"); |
5415 | 46 | scoped_refptr<NamespaceInfo> ns; |
5416 | 46 | NamespaceIdentifierPB namespace_identifier = req->new_namespace(); |
5417 | | // Use original namespace_id as new_namespace_id for YSQL tables. |
5418 | 46 | if (table->GetTableType() == PGSQL_TABLE_TYPE && !namespace_identifier.has_id()) { |
5419 | 42 | namespace_identifier.set_id(table->namespace_id()); |
5420 | 42 | } |
5421 | 44 | ns = VERIFY_NAMESPACE_FOUND(FindNamespace(namespace_identifier), resp); |
5422 | | |
5423 | 44 | auto ns_lock = ns->LockForRead(); |
5424 | 44 | new_namespace_id = ns->id(); |
5425 | | // Don't use Namespaces that aren't running. |
5426 | 44 | if (ns->state() != SysNamespaceEntryPB::RUNNING) { |
5427 | 0 | Status s = STATUS_SUBSTITUTE(TryAgain, |
5428 | 0 | "Namespace not running (State=$0). Cannot create $1.$2", |
5429 | 0 | SysNamespaceEntryPB::State_Name(ns->state()), ns->name(), table->name() ); |
5430 | 0 | return SetupError(resp->mutable_error(), NamespaceMasterError(ns->state()), s); |
5431 | 0 | } |
5432 | 2.86k | } |
5433 | 2.86k | if (req->has_new_namespace() || req->has_new_table_name()) { |
5434 | 44 | if (new_namespace_id.empty()) { |
5435 | 0 | const Status s = STATUS(InvalidArgument, "No namespace used"); |
5436 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::NO_NAMESPACE_USED, s); |
5437 | 0 | } |
5438 | 2.86k | } |
5439 | | |
5440 | 2.86k | TRACE("Locking table"); |
5441 | 2.86k | auto l = table->LockForWrite(); |
5442 | 2.86k | RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp)); |
5443 | | |
5444 | 2.86k | bool has_changes = false; |
5445 | 2.86k | auto& table_pb = l.mutable_data()->pb; |
5446 | 2.86k | const TableName table_name = l->name(); |
5447 | 2.86k | const NamespaceId namespace_id = l->namespace_id(); |
5448 | 2.82k | const TableName new_table_name = req->has_new_table_name() ? req->new_table_name() : table_name; |
5449 | | |
5450 | | // Calculate new schema for the on-disk state, not persisted yet. |
5451 | 2.86k | Schema new_schema; |
5452 | 2.86k | ColumnId next_col_id = ColumnId(l->pb.next_column_id()); |
5453 | 2.86k | if (req->alter_schema_steps_size() || req->has_alter_properties()) { |
5454 | 276 | TRACE("Apply alter schema"); |
5455 | 276 | Status s = ApplyAlterSteps( |
5456 | 276 | master_->clock(), table->id(), l->pb, req, &new_schema, &next_col_id, &ddl_log_entries); |
5457 | 276 | if (!s.ok()) { |
5458 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
5459 | 0 | } |
5460 | 276 | DCHECK_NE(next_col_id, 0); |
5461 | 276 | DCHECK_EQ(new_schema.find_column_by_id(next_col_id), |
5462 | 276 | static_cast<int>(Schema::kColumnNotFound)); |
5463 | 276 | has_changes = true; |
5464 | 276 | } |
5465 | | |
5466 | | // Try to acquire the new table name. |
5467 | 2.86k | if (req->has_new_namespace() || req->has_new_table_name()) { |
5468 | | |
5469 | | // Postgres handles name uniqueness constraints in it's own layer. |
5470 | 44 | if (l->table_type() != PGSQL_TABLE_TYPE) { |
5471 | 2 | LockGuard lock(mutex_); |
5472 | 0 | VLOG_WITH_FUNC(3) << "Acquired the catalog manager lock"; |
5473 | | |
5474 | 2 | TRACE("Acquired catalog manager lock"); |
5475 | | |
5476 | | // Verify that the table does not exist. |
5477 | 2 | scoped_refptr<TableInfo> other_table = FindPtrOrNull( |
5478 | 2 | table_names_map_, {new_namespace_id, new_table_name}); |
5479 | 2 | if (other_table != nullptr) { |
5480 | 1 | Status s = STATUS_SUBSTITUTE(AlreadyPresent, |
5481 | 1 | "Object '$0.$1' already exists", |
5482 | 1 | GetNamespaceNameUnlocked(new_namespace_id), other_table->name()); |
5483 | 1 | LOG(WARNING) << "Found table: " << other_table->ToStringWithState() |
5484 | 1 | << ". Failed alterring table with error: " |
5485 | 1 | << s.ToString() << " Request:\n" << req->DebugString(); |
5486 | 1 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s); |
5487 | 1 | } |
5488 | | |
5489 | | // Acquire the new table name (now we have 2 name for the same table). |
5490 | 1 | table_names_map_[{new_namespace_id, new_table_name}] = table; |
5491 | 1 | } |
5492 | | |
5493 | 43 | table_pb.set_namespace_id(new_namespace_id); |
5494 | 43 | table_pb.set_name(new_table_name); |
5495 | | |
5496 | 43 | has_changes = true; |
5497 | 43 | } |
5498 | | |
5499 | | // Check if there has been any changes to the placement policies for this table. |
5500 | 2.86k | if (req->has_replication_info()) { |
5501 | | // If this is a colocated table, it does not make sense to set placement |
5502 | | // policy for this table, as the tablet associated with it is shared by |
5503 | | // multiple tables. |
5504 | 4 | if (table->colocated()) { |
5505 | 0 | const Status s = STATUS(InvalidArgument, |
5506 | 0 | "Placement policy cannot be altered for a colocated table"); |
5507 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s); |
5508 | 0 | } |
5509 | 4 | if (table->GetTableType() == PGSQL_TABLE_TYPE) { |
5510 | 0 | const Status s = STATUS(InvalidArgument, |
5511 | 0 | "Placement policy cannot be altered for YSQL tables, use Tablespaces"); |
5512 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s); |
5513 | 0 | } |
5514 | | // Validate table replication info. |
5515 | 4 | RETURN_NOT_OK(ValidateTableReplicationInfo(req->replication_info())); |
5516 | 4 | table_pb.mutable_replication_info()->CopyFrom(req->replication_info()); |
5517 | 4 | has_changes = true; |
5518 | 4 | } |
5519 | | |
5520 | | // TODO(hector): Simplify the AlterSchema workflow to avoid doing the same checks on every layer |
5521 | | // this request goes through: https://github.com/YugaByte/yugabyte-db/issues/1882. |
5522 | 2.86k | if (req->has_wal_retention_secs()) { |
5523 | 2.54k | if (has_changes) { |
5524 | 0 | const Status s = STATUS(InvalidArgument, |
5525 | 0 | "wal_retention_secs cannot be altered concurrently with other properties"); |
5526 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s); |
5527 | 0 | } |
5528 | | // TODO(hector): Handle co-partitioned tables: |
5529 | | // https://github.com/YugaByte/yugabyte-db/issues/1905. |
5530 | 2.54k | table_pb.set_wal_retention_secs(req->wal_retention_secs()); |
5531 | 2.54k | has_changes = true; |
5532 | 2.54k | } |
5533 | | |
5534 | 2.86k | if (!has_changes) { |
5535 | 0 | if (req->has_force_send_alter_request() && req->force_send_alter_request()) { |
5536 | 0 | RETURN_NOT_OK(SendAlterTableRequest(table, req)); |
5537 | 0 | } |
5538 | | // Skip empty requests... |
5539 | 0 | return Status::OK(); |
5540 | 2.86k | } |
5541 | | |
5542 | | // Serialize the schema Increment the version number. |
5543 | 2.86k | if (new_schema.initialized()) { |
5544 | 276 | if (!l->pb.has_fully_applied_schema()) { |
5545 | | // The idea here is that if we are in the middle of updating the schema |
5546 | | // from one state to another, then YBClients will be given the older |
5547 | | // version until the schema is updated on all the tablets. |
5548 | | // As of Dec 2019, this may lead to some rejected operations/retries during |
5549 | | // the index backfill. See #3284 for possible optimizations. |
5550 | 276 | MultiStageAlterTable::CopySchemaDetailsToFullyApplied(&table_pb); |
5551 | 276 | } |
5552 | 276 | SchemaToPB(new_schema, table_pb.mutable_schema()); |
5553 | 276 | } |
5554 | | |
5555 | | // Only increment the version number if it is a schema change (AddTable change goes through a |
5556 | | // different path and it's not processed here). |
5557 | 2.86k | if (!req->has_wal_retention_secs()) { |
5558 | 323 | table_pb.set_version(table_pb.version() + 1); |
5559 | 323 | table_pb.set_updates_only_index_permissions(false); |
5560 | 323 | } |
5561 | 2.86k | table_pb.set_next_column_id(next_col_id); |
5562 | 2.86k | l.mutable_data()->set_state( |
5563 | 2.86k | SysTablesEntryPB::ALTERING, |
5564 | 2.86k | Substitute("Alter table version=$0 ts=$1", table_pb.version(), LocalTimeAsString())); |
5565 | | |
5566 | | // Update sys-catalog with the new table schema. |
5567 | 2.86k | TRACE("Updating metadata on disk"); |
5568 | 2.86k | std::vector<const DdlLogEntry*> ddl_log_entry_pointers; |
5569 | 2.86k | ddl_log_entry_pointers.reserve(ddl_log_entries.size()); |
5570 | 284 | for (const auto& entry : ddl_log_entries) { |
5571 | 284 | ddl_log_entry_pointers.push_back(&entry); |
5572 | 284 | } |
5573 | 2.86k | Status s = sys_catalog_->Upsert(leader_ready_term(), ddl_log_entry_pointers, table); |
5574 | 2.86k | if (!s.ok()) { |
5575 | 1 | s = s.CloneAndPrepend( |
5576 | 1 | Substitute("An error occurred while updating sys-catalog tables entry: $0", |
5577 | 1 | s.ToString())); |
5578 | 1 | LOG(WARNING) << s.ToString(); |
5579 | 1 | if (table->GetTableType() != PGSQL_TABLE_TYPE && |
5580 | 0 | (req->has_new_namespace() || req->has_new_table_name())) { |
5581 | 0 | LockGuard lock(mutex_); |
5582 | 0 | VLOG_WITH_FUNC(3) << "Acquired the catalog manager lock"; |
5583 | 0 | CHECK_EQ(table_names_map_.erase({new_namespace_id, new_table_name}), 1); |
5584 | 0 | } |
5585 | | // TableMetadaLock follows RAII paradigm: when it leaves scope, |
5586 | | // 'l' will be unlocked, and the mutation will be aborted. |
5587 | 1 | return CheckIfNoLongerLeaderAndSetupError(s, resp); |
5588 | 1 | } |
5589 | | |
5590 | | // Remove the old name. Not present if PGSQL. |
5591 | 2.86k | if (table->GetTableType() != PGSQL_TABLE_TYPE && |
5592 | 172 | (req->has_new_namespace() || req->has_new_table_name())) { |
5593 | 1 | TRACE("Removing (namespace, table) combination ($0, $1) from by-name map", |
5594 | 1 | namespace_id, table_name); |
5595 | 1 | LockGuard lock(mutex_); |
5596 | 1 | table_names_map_.erase({namespace_id, table_name}); |
5597 | 1 | } |
5598 | | |
5599 | | // Update the in-memory state. |
5600 | 2.86k | TRACE("Committing in-memory state"); |
5601 | 2.86k | l.Commit(); |
5602 | | |
5603 | 2.86k | RETURN_NOT_OK(SendAlterTableRequest(table, req)); |
5604 | | |
5605 | | // Increment transaction status version if needed. |
5606 | 2.86k | if (table->GetTableType() == TableType::TRANSACTION_STATUS_TABLE_TYPE) { |
5607 | 0 | RETURN_NOT_OK(IncrementTransactionTablesVersion()); |
5608 | 0 | } |
5609 | | |
5610 | 2.86k | LOG(INFO) << "Successfully initiated ALTER TABLE (pending tablet schema updates) for " |
5611 | 2.86k | << table->ToString() << " per request from " << RequestorString(rpc); |
5612 | 2.86k | return Status::OK(); |
5613 | 2.86k | } |
5614 | | |
5615 | | Status CatalogManager::IsAlterTableDone(const IsAlterTableDoneRequestPB* req, |
5616 | 649 | IsAlterTableDoneResponsePB* resp) { |
5617 | | // 1. Lookup the table and verify if it exists. |
5618 | 649 | TRACE("Looking up table"); |
5619 | 649 | scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table())); |
5620 | | |
5621 | 649 | TRACE("Locking table"); |
5622 | 649 | auto l = table->LockForRead(); |
5623 | 649 | RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp)); |
5624 | | |
5625 | | // 2. Verify if the alter is in-progress. |
5626 | 649 | TRACE("Verify if there is an alter operation in progress for $0", table->ToString()); |
5627 | 649 | resp->set_schema_version(l->pb.version()); |
5628 | 649 | resp->set_done(l->pb.state() != SysTablesEntryPB::ALTERING); |
5629 | | |
5630 | 649 | return Status::OK(); |
5631 | 649 | } |
5632 | | |
5633 | | Result<TabletInfoPtr> CatalogManager::RegisterNewTabletForSplit( |
5634 | | TabletInfo* source_tablet_info, const PartitionPB& partition, |
5635 | 36 | TableInfo::WriteLock* table_write_lock, TabletInfo::WriteLock* tablet_write_lock) { |
5636 | 36 | const auto tablet_lock = source_tablet_info->LockForRead(); |
5637 | | |
5638 | 36 | auto table = source_tablet_info->table(); |
5639 | 36 | TabletInfoPtr new_tablet; |
5640 | 36 | { |
5641 | 36 | LockGuard lock(mutex_); |
5642 | 36 | new_tablet = CreateTabletInfo(table.get(), partition); |
5643 | 36 | } |
5644 | 36 | const auto& source_tablet_meta = tablet_lock->pb; |
5645 | | |
5646 | 36 | auto& new_tablet_meta = new_tablet->mutable_metadata()->mutable_dirty()->pb; |
5647 | 36 | new_tablet_meta.set_state(SysTabletsEntryPB::CREATING); |
5648 | 36 | new_tablet_meta.mutable_committed_consensus_state()->CopyFrom( |
5649 | 36 | source_tablet_meta.committed_consensus_state()); |
5650 | 36 | new_tablet_meta.set_split_depth(source_tablet_meta.split_depth() + 1); |
5651 | 36 | new_tablet_meta.set_split_parent_tablet_id(source_tablet_info->tablet_id()); |
5652 | | // TODO(tsplit): consider and handle failure scenarios, for example: |
5653 | | // - Crash or leader failover before sending out the split tasks. |
5654 | | // - Long enough partition while trying to send out the splits so that they timeout and |
5655 | | // not get executed. |
5656 | 36 | int new_partition_list_version; |
5657 | 36 | { |
5658 | 36 | LockGuard lock(mutex_); |
5659 | | |
5660 | 36 | auto& table_pb = table_write_lock->mutable_data()->pb; |
5661 | 36 | new_partition_list_version = table_pb.partition_list_version() + 1; |
5662 | 36 | table_pb.set_partition_list_version(new_partition_list_version); |
5663 | | |
5664 | 36 | tablet_write_lock->mutable_data()->pb.add_split_tablet_ids(new_tablet->id()); |
5665 | 36 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), table, new_tablet, source_tablet_info)); |
5666 | | |
5667 | 36 | MAYBE_FAULT(FLAGS_TEST_crash_after_creating_single_split_tablet); |
5668 | | |
5669 | 36 | table->AddTablet(new_tablet); |
5670 | | // TODO: We use this pattern in other places, but what if concurrent thread accesses not yet |
5671 | | // committed TabletInfo from the `table` ? |
5672 | 36 | new_tablet->mutable_metadata()->CommitMutation(); |
5673 | | |
5674 | 36 | auto tablet_map_checkout = tablet_map_.CheckOut(); |
5675 | 36 | (*tablet_map_checkout)[new_tablet->id()] = new_tablet; |
5676 | 36 | } |
5677 | 36 | LOG(INFO) << "Registered new tablet " << new_tablet->tablet_id() |
5678 | 36 | << " (" << AsString(partition) << ") to split the tablet " |
5679 | 36 | << source_tablet_info->tablet_id() |
5680 | 36 | << " (" << AsString(source_tablet_meta.partition()) |
5681 | 36 | << ") for table " << table->ToString() |
5682 | 36 | << ", new partition_list_version: " << new_partition_list_version; |
5683 | | |
5684 | 36 | return new_tablet; |
5685 | 36 | } |
5686 | | |
5687 | | Status CatalogManager::GetTableSchema(const GetTableSchemaRequestPB* req, |
5688 | 117k | GetTableSchemaResponsePB* resp) { |
5689 | 0 | VLOG(1) << "Servicing GetTableSchema request for " << req->ShortDebugString(); |
5690 | | |
5691 | | // Lookup the table and verify if it exists. |
5692 | 117k | TRACE("Looking up table"); |
5693 | 114k | scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table())); |
5694 | | |
5695 | | // Due to differences in the way proxies handle version mismatch (pull for yql vs push for sql). |
5696 | | // For YQL tables, we will return the "set of indexes" being applied instead of the ones |
5697 | | // that are fully completed. |
5698 | | // For PGSQL (and other) tables we want to return the fully applied schema. |
5699 | 114k | const bool get_fully_applied_indexes = table->GetTableType() != TableType::YQL_TABLE_TYPE; |
5700 | 114k | return GetTableSchemaInternal(req, resp, get_fully_applied_indexes); |
5701 | 117k | } |
5702 | | |
5703 | | Status CatalogManager::GetTableSchemaInternal(const GetTableSchemaRequestPB* req, |
5704 | | GetTableSchemaResponsePB* resp, |
5705 | 115k | bool get_fully_applied_indexes) { |
5706 | 12 | VLOG(1) << "Servicing GetTableSchema request for " << req->ShortDebugString(); |
5707 | | |
5708 | | // Lookup the table and verify if it exists. |
5709 | 115k | TRACE("Looking up table"); |
5710 | 115k | scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table())); |
5711 | | |
5712 | 115k | TRACE("Locking table"); |
5713 | 115k | auto l = table->LockForRead(); |
5714 | 115k | RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp)); |
5715 | | |
5716 | 115k | if (l->pb.has_fully_applied_schema()) { |
5717 | | // An AlterTable is in progress; fully_applied_schema is the last |
5718 | | // schema that has reached every TS. |
5719 | 1.16k | DCHECK(l->pb.state() == SysTablesEntryPB::ALTERING); |
5720 | 1.16k | resp->mutable_schema()->CopyFrom(l->pb.fully_applied_schema()); |
5721 | 114k | } else { |
5722 | | // There's no AlterTable, the regular schema is "fully applied". |
5723 | 114k | resp->mutable_schema()->CopyFrom(l->pb.schema()); |
5724 | 114k | } |
5725 | | |
5726 | 115k | if (get_fully_applied_indexes && l->pb.has_fully_applied_schema()) { |
5727 | 123 | resp->set_version(l->pb.fully_applied_schema_version()); |
5728 | 123 | resp->mutable_indexes()->CopyFrom(l->pb.fully_applied_indexes()); |
5729 | 123 | if (l->pb.has_fully_applied_index_info()) { |
5730 | 0 | resp->set_obsolete_indexed_table_id(GetIndexedTableId(l->pb)); |
5731 | 0 | *resp->mutable_index_info() = l->pb.fully_applied_index_info(); |
5732 | 0 | } |
5733 | 0 | VLOG(1) << "Returning" |
5734 | 0 | << "\nfully_applied_schema with version " |
5735 | 0 | << l->pb.fully_applied_schema_version() |
5736 | 0 | << ":\n" |
5737 | 0 | << yb::ToString(l->pb.fully_applied_indexes()) |
5738 | 0 | << "\ninstead of schema with version " |
5739 | 0 | << l->pb.version() |
5740 | 0 | << ":\n" |
5741 | 0 | << yb::ToString(l->pb.indexes()); |
5742 | 115k | } else { |
5743 | 115k | resp->set_version(l->pb.version()); |
5744 | 115k | resp->mutable_indexes()->CopyFrom(l->pb.indexes()); |
5745 | 115k | if (l->pb.has_index_info()) { |
5746 | 22.2k | resp->set_obsolete_indexed_table_id(GetIndexedTableId(l->pb)); |
5747 | 22.2k | *resp->mutable_index_info() = l->pb.index_info(); |
5748 | 22.2k | } |
5749 | 17 | VLOG(3) << "Returning" |
5750 | 17 | << "\nschema with version " |
5751 | 17 | << l->pb.version() |
5752 | 17 | << ":\n" |
5753 | 17 | << yb::ToString(l->pb.indexes()); |
5754 | 115k | } |
5755 | 115k | resp->set_is_compatible_with_previous_version(l->pb.updates_only_index_permissions()); |
5756 | 115k | resp->mutable_partition_schema()->CopyFrom(l->pb.partition_schema()); |
5757 | 115k | if (IsReplicationInfoSet(l->pb.replication_info())) { |
5758 | 2 | resp->mutable_replication_info()->CopyFrom(l->pb.replication_info()); |
5759 | 2 | } |
5760 | 115k | resp->set_create_table_done(!table->IsCreateInProgress()); |
5761 | 115k | resp->set_table_type(table->metadata().state().pb.table_type()); |
5762 | 115k | resp->mutable_identifier()->set_table_name(l->pb.name()); |
5763 | 115k | resp->mutable_identifier()->set_table_id(table->id()); |
5764 | 115k | resp->mutable_identifier()->mutable_namespace_()->set_id(table->namespace_id()); |
5765 | 115k | auto nsinfo = FindNamespaceById(table->namespace_id()); |
5766 | 115k | if (nsinfo.ok()) { |
5767 | 115k | resp->mutable_identifier()->mutable_namespace_()->set_name((**nsinfo).name()); |
5768 | 115k | } |
5769 | | |
5770 | 115k | if (l->pb.has_wal_retention_secs()) { |
5771 | 2.46k | resp->set_wal_retention_secs(l->pb.wal_retention_secs()); |
5772 | 2.46k | } |
5773 | | |
5774 | | // Get namespace name by id. |
5775 | 115k | SharedLock lock(mutex_); |
5776 | 115k | TRACE("Looking up namespace"); |
5777 | 115k | const scoped_refptr<NamespaceInfo> ns = FindPtrOrNull(namespace_ids_map_, table->namespace_id()); |
5778 | | |
5779 | 115k | if (ns == nullptr) { |
5780 | 0 | Status s = STATUS_SUBSTITUTE( |
5781 | 0 | NotFound, "Could not find namespace by namespace id $0 for request $1.", |
5782 | 0 | table->namespace_id(), req->DebugString()); |
5783 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s); |
5784 | 0 | } |
5785 | | |
5786 | 115k | resp->mutable_identifier()->mutable_namespace_()->set_name(ns->name()); |
5787 | | |
5788 | 115k | resp->set_colocated(table->colocated()); |
5789 | | |
5790 | 18.4E | VLOG(1) << "Serviced GetTableSchema request for " << req->ShortDebugString() << " with " |
5791 | 18.4E | << yb::ToString(*resp); |
5792 | 115k | return Status::OK(); |
5793 | 115k | } |
5794 | | |
5795 | | Status CatalogManager::GetTablegroupSchema(const GetTablegroupSchemaRequestPB* req, |
5796 | 0 | GetTablegroupSchemaResponsePB* resp) { |
5797 | 0 | VLOG(1) << "Servicing GetTablegroupSchema request for " << req->ShortDebugString(); |
5798 | 0 | if (!req->parent_tablegroup().has_id()) { |
5799 | 0 | Status s = STATUS(InvalidArgument, "Invalid get tablegroup request (missing fields)"); |
5800 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
5801 | 0 | } |
5802 | | |
5803 | 0 | const std::string& tablegroupId = req->parent_tablegroup().id(); |
5804 | 0 | if (!IsTablegroupParentTableId(tablegroupId)) { |
5805 | 0 | Status s = STATUS(InvalidArgument, "Received a non tablegroup ID"); |
5806 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
5807 | 0 | } |
5808 | | |
5809 | | // Strip the suffix from the tablegroup ID request (since tablegroup_ids_map_ |
5810 | | // only accepts the plain ID). |
5811 | 0 | DCHECK(boost::algorithm::ends_with(tablegroupId, master::kTablegroupParentTableIdSuffix)); |
5812 | 0 | size_t tgid_len = tablegroupId.size() - strlen(master::kTablegroupParentTableIdSuffix); |
5813 | 0 | TablegroupId tgid = tablegroupId.substr(0, tgid_len); |
5814 | | |
5815 | | // Lookup the tablegroup. |
5816 | 0 | std::unordered_set<TableId> tablesInTablegroup; |
5817 | 0 | { |
5818 | 0 | SharedLock lock(mutex_); |
5819 | |
|
5820 | 0 | if (tablegroup_ids_map_.find(tgid) == tablegroup_ids_map_.end()) { |
5821 | 0 | return STATUS(NotFound, Substitute("Tablegroup not found for tablegroup id: $0", |
5822 | 0 | req->parent_tablegroup().id())); |
5823 | 0 | } |
5824 | 0 | scoped_refptr<TablegroupInfo> tginfo = tablegroup_ids_map_[tgid]; |
5825 | 0 | tablesInTablegroup = tginfo->ChildTables(); |
5826 | 0 | } |
5827 | |
|
5828 | 0 | for (const auto& t : tablesInTablegroup) { |
5829 | 0 | TRACE("Looking up table"); |
5830 | 0 | GetTableSchemaRequestPB schemaReq; |
5831 | 0 | GetTableSchemaResponsePB schemaResp; |
5832 | 0 | schemaReq.mutable_table()->set_table_id(t); |
5833 | 0 | Status s = GetTableSchema(&schemaReq, &schemaResp); |
5834 | 0 | if (!s.ok() || schemaResp.has_error()) { |
5835 | 0 | LOG(ERROR) << "Error while getting table schema: " << s; |
5836 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s); |
5837 | 0 | } |
5838 | 0 | resp->add_get_table_schema_response_pbs()->Swap(&schemaResp); |
5839 | 0 | } |
5840 | |
|
5841 | 0 | return Status::OK(); |
5842 | 0 | } |
5843 | | |
5844 | | Status CatalogManager::GetColocatedTabletSchema(const GetColocatedTabletSchemaRequestPB* req, |
5845 | 0 | GetColocatedTabletSchemaResponsePB* resp) { |
5846 | 0 | VLOG(1) << "Servicing GetColocatedTabletSchema request for " << req->ShortDebugString(); |
5847 | | |
5848 | | // Lookup the given parent colocated table and verify if it exists. |
5849 | 0 | TRACE("Looking up table"); |
5850 | 0 | auto parent_colocated_table = VERIFY_RESULT(FindTable(req->parent_colocated_table())); |
5851 | 0 | { |
5852 | 0 | TRACE("Locking table"); |
5853 | 0 | auto l = parent_colocated_table->LockForRead(); |
5854 | 0 | RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp)); |
5855 | 0 | } |
5856 | |
|
5857 | 0 | if (!parent_colocated_table->colocated() || !parent_colocated_table->IsColocatedParentTable()) { |
5858 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_TABLE_TYPE, |
5859 | 0 | STATUS(InvalidArgument, "Table provided is not a parent colocated table")); |
5860 | 0 | } |
5861 | | |
5862 | | // Next get all the user tables that are in the database. |
5863 | 0 | ListTablesRequestPB listTablesReq; |
5864 | 0 | ListTablesResponsePB ListTablesResp; |
5865 | |
|
5866 | 0 | listTablesReq.mutable_namespace_()->set_id(parent_colocated_table->namespace_id()); |
5867 | 0 | listTablesReq.mutable_namespace_()->set_database_type(YQL_DATABASE_PGSQL); |
5868 | 0 | listTablesReq.set_exclude_system_tables(true); |
5869 | 0 | Status status = ListTables(&listTablesReq, &ListTablesResp); |
5870 | 0 | if (!status.ok() || ListTablesResp.has_error()) { |
5871 | 0 | LOG(ERROR) << "Error while listing tables: " << status; |
5872 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, status); |
5873 | 0 | } |
5874 | | |
5875 | | // Get the table schema for each colocated table. |
5876 | 0 | for (const auto& t : ListTablesResp.tables()) { |
5877 | | // Need to check if this table is colocated first. |
5878 | 0 | TRACE("Looking up table"); |
5879 | 0 | scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTableById(t.id())); |
5880 | |
|
5881 | 0 | if (table->colocated()) { |
5882 | | // Now we can get the schema for this table. |
5883 | 0 | GetTableSchemaRequestPB schemaReq; |
5884 | 0 | GetTableSchemaResponsePB schemaResp; |
5885 | 0 | schemaReq.mutable_table()->set_table_id(t.id()); |
5886 | 0 | status = GetTableSchema(&schemaReq, &schemaResp); |
5887 | 0 | if (!status.ok() || schemaResp.has_error()) { |
5888 | 0 | LOG(ERROR) << "Error while getting table schema: " << status; |
5889 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, status); |
5890 | 0 | } |
5891 | 0 | resp->add_get_table_schema_response_pbs()->Swap(&schemaResp); |
5892 | 0 | } |
5893 | 0 | } |
5894 | |
|
5895 | 0 | return Status::OK(); |
5896 | 0 | } |
5897 | | |
5898 | | Status CatalogManager::ListTables(const ListTablesRequestPB* req, |
5899 | 1.83k | ListTablesResponsePB* resp) { |
5900 | 1.83k | NamespaceId namespace_id; |
5901 | | |
5902 | | // Validate namespace. |
5903 | 1.83k | if (req->has_namespace_()) { |
5904 | | // Lookup the namespace and verify if it exists. |
5905 | 349 | auto ns = VERIFY_NAMESPACE_FOUND(FindNamespace(req->namespace_()), resp); |
5906 | | |
5907 | 176 | auto ns_lock = ns->LockForRead(); |
5908 | 176 | namespace_id = ns->id(); |
5909 | | |
5910 | | // Don't list tables with a namespace that isn't running. |
5911 | 176 | if (ns->state() != SysNamespaceEntryPB::RUNNING) { |
5912 | 0 | LOG(INFO) << "ListTables request for a Namespace not running (State=" |
5913 | 0 | << SysNamespaceEntryPB::State_Name(ns->state()) << ")"; |
5914 | 0 | return Status::OK(); |
5915 | 0 | } |
5916 | 1.66k | } |
5917 | | |
5918 | 1.66k | bool has_rel_filter = req->relation_type_filter_size() > 0; |
5919 | 1.50k | bool include_user_table = has_rel_filter ? false : true; |
5920 | 1.50k | bool include_user_index = has_rel_filter ? false : true; |
5921 | 198 | bool include_system_table = req->exclude_system_tables() ? false |
5922 | 1.46k | : (has_rel_filter ? false : true); |
5923 | | |
5924 | 158 | for (const auto &relation : req->relation_type_filter()) { |
5925 | 158 | if (relation == SYSTEM_TABLE_RELATION) { |
5926 | 2 | include_system_table = true; |
5927 | 156 | } else if (relation == USER_TABLE_RELATION) { |
5928 | 155 | include_user_table = true; |
5929 | 1 | } else if (relation == INDEX_TABLE_RELATION) { |
5930 | 1 | include_user_index = true; |
5931 | 1 | } |
5932 | 158 | } |
5933 | | |
5934 | 1.66k | SharedLock lock(mutex_); |
5935 | 1.66k | RelationType relation_type; |
5936 | | |
5937 | 213k | for (const auto& entry : *table_ids_map_) { |
5938 | 213k | auto& table_info = *entry.second; |
5939 | 213k | auto ltm = table_info.LockForRead(); |
5940 | | |
5941 | 213k | if (!ltm->visible_to_client() && !req->include_not_running()) { |
5942 | 36 | continue; |
5943 | 36 | } |
5944 | | |
5945 | 213k | if (!namespace_id.empty() && namespace_id != table_info.namespace_id()) { |
5946 | 72.4k | continue; // Skip tables from other namespaces. |
5947 | 72.4k | } |
5948 | | |
5949 | 141k | if (req->has_name_filter()) { |
5950 | 2.70k | size_t found = ltm->name().find(req->name_filter()); |
5951 | 2.70k | if (found == string::npos) { |
5952 | 2.69k | continue; |
5953 | 2.69k | } |
5954 | 138k | } |
5955 | | |
5956 | 138k | if (IsUserIndexUnlocked(table_info)) { |
5957 | 2 | if (!include_user_index) { |
5958 | 0 | continue; |
5959 | 0 | } |
5960 | 2 | relation_type = INDEX_TABLE_RELATION; |
5961 | 138k | } else if (IsUserTableUnlocked(table_info)) { |
5962 | 6.38k | if (!include_user_table) { |
5963 | 4 | continue; |
5964 | 4 | } |
5965 | 6.37k | relation_type = USER_TABLE_RELATION; |
5966 | 132k | } else { |
5967 | 132k | if (!include_system_table) { |
5968 | 110k | continue; |
5969 | 110k | } |
5970 | 21.7k | relation_type = SYSTEM_TABLE_RELATION; |
5971 | 21.7k | } |
5972 | | |
5973 | 28.1k | NamespaceIdentifierPB ns_identifier; |
5974 | 28.1k | ns_identifier.set_id(ltm->namespace_id()); |
5975 | 28.1k | auto ns = FindNamespaceUnlocked(ns_identifier); |
5976 | 28.1k | if (!ns.ok() || (**ns).state() != SysNamespaceEntryPB::RUNNING) { |
5977 | 2 | if (PREDICT_FALSE(FLAGS_TEST_return_error_if_namespace_not_found)) { |
5978 | 0 | VERIFY_NAMESPACE_FOUND(std::move(ns), resp); |
5979 | 0 | } |
5980 | 1 | LOG(ERROR) << "Unable to find namespace with id " << ltm->namespace_id() |
5981 | 1 | << " for table " << ltm->name(); |
5982 | 1 | continue; |
5983 | 28.1k | } |
5984 | | |
5985 | 28.1k | ListTablesResponsePB::TableInfo *table = resp->add_tables(); |
5986 | 28.1k | { |
5987 | 28.1k | auto namespace_lock = (**ns).LockForRead(); |
5988 | 28.1k | table->mutable_namespace_()->set_id((**ns).id()); |
5989 | 28.1k | table->mutable_namespace_()->set_name(namespace_lock->name()); |
5990 | 28.1k | table->mutable_namespace_()->set_database_type(namespace_lock->pb.database_type()); |
5991 | 28.1k | } |
5992 | 28.1k | table->set_id(entry.second->id()); |
5993 | 28.1k | table->set_name(ltm->name()); |
5994 | 28.1k | table->set_table_type(ltm->table_type()); |
5995 | 28.1k | table->set_relation_type(relation_type); |
5996 | 28.1k | table->set_state(ltm->pb.state()); |
5997 | 28.1k | table->set_pgschema_name(ltm->schema().pgschema_name()); |
5998 | 28.1k | } |
5999 | 1.66k | return Status::OK(); |
6000 | 1.66k | } |
6001 | | |
6002 | 0 | boost::optional<TablegroupId> CatalogManager::FindTablegroupByTableId(const TableId& table_id) { |
6003 | 0 | SharedLock lock(mutex_); |
6004 | |
|
6005 | 0 | for (const auto& tablegroup : tablegroup_ids_map_) { |
6006 | 0 | const auto& tgid = tablegroup.first; |
6007 | 0 | const auto& tginfo = tablegroup.second; |
6008 | 0 | for (const auto& t : tginfo->ChildTables()) { |
6009 | 0 | if (table_id == t) { |
6010 | 0 | return boost::optional<TablegroupId>(tgid + kTablegroupParentTableIdSuffix); |
6011 | 0 | } |
6012 | 0 | } |
6013 | 0 | } |
6014 | |
|
6015 | 0 | return boost::none; |
6016 | 0 | } |
6017 | | |
6018 | 410k | scoped_refptr<TableInfo> CatalogManager::GetTableInfo(const TableId& table_id) { |
6019 | 410k | SharedLock lock(mutex_); |
6020 | 410k | return FindPtrOrNull(*table_ids_map_, table_id); |
6021 | 410k | } |
6022 | | |
6023 | | scoped_refptr<TableInfo> CatalogManager::GetTableInfoFromNamespaceNameAndTableName( |
6024 | 0 | YQLDatabase db_type, const NamespaceName& namespace_name, const TableName& table_name) { |
6025 | 0 | if (db_type == YQL_DATABASE_PGSQL) |
6026 | 0 | return nullptr; |
6027 | 0 | SharedLock lock(mutex_); |
6028 | 0 | const auto ns = FindPtrOrNull(namespace_names_mapper_[db_type], namespace_name); |
6029 | 0 | return ns |
6030 | 0 | ? FindPtrOrNull(table_names_map_, {ns->id(), table_name}) |
6031 | 0 | : nullptr; |
6032 | 0 | } |
6033 | | |
6034 | 243k | scoped_refptr<TableInfo> CatalogManager::GetTableInfoUnlocked(const TableId& table_id) { |
6035 | 243k | return FindPtrOrNull(*table_ids_map_, table_id); |
6036 | 243k | } |
6037 | | |
6038 | 46.5k | std::vector<TableInfoPtr> CatalogManager::GetTables(GetTablesMode mode) { |
6039 | 46.5k | std::vector<TableInfoPtr> result; |
6040 | 46.5k | { |
6041 | 46.5k | SharedLock lock(mutex_); |
6042 | 46.5k | result.reserve(table_ids_map_->size()); |
6043 | 1.11M | for (const auto& e : *table_ids_map_) { |
6044 | 1.11M | result.push_back(e.second); |
6045 | 1.11M | } |
6046 | 46.5k | } |
6047 | 46.5k | switch (mode) { |
6048 | 1 | case GetTablesMode::kAll: |
6049 | 1 | return result; |
6050 | 165 | case GetTablesMode::kRunning: { |
6051 | 3.53k | auto filter = [](const TableInfoPtr& table_info) { return !table_info->is_running(); }; |
6052 | 165 | EraseIf(filter, &result); |
6053 | 165 | return result; |
6054 | 0 | } |
6055 | 46.3k | case GetTablesMode::kVisibleToClient: { |
6056 | 1.10M | auto filter = [](const TableInfoPtr& table_info) { |
6057 | 1.10M | return !table_info->LockForRead()->visible_to_client(); |
6058 | 1.10M | }; |
6059 | 46.3k | EraseIf(filter, &result); |
6060 | 46.3k | return result; |
6061 | 0 | } |
6062 | 0 | } |
6063 | 0 | FATAL_INVALID_ENUM_VALUE(GetTablesMode, mode); |
6064 | 0 | } |
6065 | | |
6066 | | void CatalogManager::GetAllNamespaces(std::vector<scoped_refptr<NamespaceInfo>>* namespaces, |
6067 | 13.6k | bool includeOnlyRunningNamespaces) { |
6068 | 13.6k | namespaces->clear(); |
6069 | 13.6k | SharedLock lock(mutex_); |
6070 | 56.3k | for (const NamespaceInfoMap::value_type& e : namespace_ids_map_) { |
6071 | 56.3k | if (includeOnlyRunningNamespaces && e.second->state() != SysNamespaceEntryPB::RUNNING) { |
6072 | 3 | continue; |
6073 | 3 | } |
6074 | 56.3k | namespaces->push_back(e.second); |
6075 | 56.3k | } |
6076 | 13.6k | } |
6077 | | |
6078 | 13.9k | void CatalogManager::GetAllUDTypes(std::vector<scoped_refptr<UDTypeInfo>>* types) { |
6079 | 13.9k | types->clear(); |
6080 | 13.9k | SharedLock lock(mutex_); |
6081 | 204 | for (const UDTypeInfoMap::value_type& e : udtype_ids_map_) { |
6082 | 204 | types->push_back(e.second); |
6083 | 204 | } |
6084 | 13.9k | } |
6085 | | |
6086 | 3 | std::vector<std::shared_ptr<MonitoredTask>> CatalogManager::GetRecentTasks() { |
6087 | 3 | return tasks_tracker_->GetTasks(); |
6088 | 3 | } |
6089 | | |
6090 | 0 | std::vector<std::shared_ptr<MonitoredTask>> CatalogManager::GetRecentJobs() { |
6091 | 0 | return jobs_tracker_->GetTasks(); |
6092 | 0 | } |
6093 | | |
6094 | 13.1k | NamespaceName CatalogManager::GetNamespaceNameUnlocked(const NamespaceId& id) const { |
6095 | 13.1k | const scoped_refptr<NamespaceInfo> ns = FindPtrOrNull(namespace_ids_map_, id); |
6096 | 13.1k | return ns == nullptr ? NamespaceName() : ns->name(); |
6097 | 13.1k | } |
6098 | | |
6099 | 19 | NamespaceName CatalogManager::GetNamespaceName(const NamespaceId& id) const { |
6100 | 19 | TRACE("Acquired catalog manager lock"); |
6101 | 19 | SharedLock lock(mutex_); |
6102 | 19 | return GetNamespaceNameUnlocked(id); |
6103 | 19 | } |
6104 | | |
6105 | | NamespaceName CatalogManager::GetNamespaceNameUnlocked( |
6106 | 0 | const scoped_refptr<TableInfo>& table) const { |
6107 | 0 | return GetNamespaceNameUnlocked(table->namespace_id()); |
6108 | 0 | } |
6109 | | |
6110 | 0 | NamespaceName CatalogManager::GetNamespaceName(const scoped_refptr<TableInfo>& table) const { |
6111 | 0 | return GetNamespaceName(table->namespace_id()); |
6112 | 0 | } |
6113 | | |
6114 | 28.9M | bool CatalogManager::IsSystemTable(const TableInfo& table) const { |
6115 | 28.9M | return table.is_system(); |
6116 | 28.9M | } |
6117 | | |
6118 | | // True if table is created by user. |
6119 | | // Table can be regular table or index in this case. |
6120 | 208 | bool CatalogManager::IsUserCreatedTable(const TableInfo& table) const { |
6121 | 208 | SharedLock lock(mutex_); |
6122 | 208 | return IsUserCreatedTableUnlocked(table); |
6123 | 208 | } |
6124 | | |
6125 | 277k | bool CatalogManager::IsUserCreatedTableUnlocked(const TableInfo& table) const { |
6126 | 277k | if (table.GetTableType() == PGSQL_TABLE_TYPE || table.GetTableType() == YQL_TABLE_TYPE) { |
6127 | 276k | if (!IsSystemTable(table) && !IsSequencesSystemTable(table) && |
6128 | 13.0k | GetNamespaceNameUnlocked(table.namespace_id()) != kSystemNamespaceName && |
6129 | 12.9k | !table.IsColocatedParentTable() && |
6130 | 12.9k | !table.IsTablegroupParentTable()) { |
6131 | 12.9k | return true; |
6132 | 12.9k | } |
6133 | 264k | } |
6134 | 264k | return false; |
6135 | 264k | } |
6136 | | |
6137 | 198 | bool CatalogManager::IsUserTable(const TableInfo& table) const { |
6138 | 198 | SharedLock lock(mutex_); |
6139 | 198 | return IsUserTableUnlocked(table); |
6140 | 198 | } |
6141 | | |
6142 | 138k | bool CatalogManager::IsUserTableUnlocked(const TableInfo& table) const { |
6143 | 138k | return IsUserCreatedTableUnlocked(table) && table.indexed_table_id().empty(); |
6144 | 138k | } |
6145 | | |
6146 | 18 | bool CatalogManager::IsUserIndex(const TableInfo& table) const { |
6147 | 18 | SharedLock lock(mutex_); |
6148 | 18 | return IsUserIndexUnlocked(table); |
6149 | 18 | } |
6150 | | |
6151 | 138k | bool CatalogManager::IsUserIndexUnlocked(const TableInfo& table) const { |
6152 | 138k | return IsUserCreatedTableUnlocked(table) && !table.indexed_table_id().empty(); |
6153 | 138k | } |
6154 | | |
6155 | 0 | bool CatalogManager::IsTablegroupParentTableId(const TableId& table_id) const { |
6156 | 0 | return table_id.find(kTablegroupParentTableIdSuffix) != std::string::npos; |
6157 | 0 | } |
6158 | | |
6159 | 0 | bool CatalogManager::IsColocatedParentTableId(const TableId& table_id) const { |
6160 | 0 | return table_id.find(kColocatedParentTableIdSuffix) != std::string::npos; |
6161 | 0 | } |
6162 | | |
6163 | 13.0k | bool CatalogManager::IsSequencesSystemTable(const TableInfo& table) const { |
6164 | 13.0k | if (table.GetTableType() == PGSQL_TABLE_TYPE && !table.IsColocatedParentTable() |
6165 | 10.3k | && !table.IsTablegroupParentTable()) { |
6166 | | // This case commonly occurs during unit testing. Avoid unnecessary assert within Get(). |
6167 | 10.3k | if (!IsPgsqlId(table.namespace_id()) || !IsPgsqlId(table.id())) { |
6168 | 4 | LOG(WARNING) << "Not PGSQL IDs " << table.namespace_id() << ", " << table.id(); |
6169 | 4 | return false; |
6170 | 4 | } |
6171 | 10.3k | Result<uint32_t> database_oid = GetPgsqlDatabaseOid(table.namespace_id()); |
6172 | 10.3k | if (!database_oid.ok()) { |
6173 | 0 | LOG(WARNING) << "Invalid Namespace ID " << table.namespace_id(); |
6174 | 0 | return false; |
6175 | 0 | } |
6176 | 10.3k | Result<uint32_t> table_oid = GetPgsqlTableOid(table.id()); |
6177 | 10.3k | if (!table_oid.ok()) { |
6178 | 0 | LOG(WARNING) << "Invalid Table ID " << table.id(); |
6179 | 0 | return false; |
6180 | 0 | } |
6181 | 10.3k | if (*database_oid == kPgSequencesDataDatabaseOid && *table_oid == kPgSequencesDataTableOid) { |
6182 | 0 | return true; |
6183 | 0 | } |
6184 | 13.0k | } |
6185 | 13.0k | return false; |
6186 | 13.0k | } |
6187 | | |
6188 | | void CatalogManager::NotifyTabletDeleteFinished(const TabletServerId& tserver_uuid, |
6189 | | const TabletId& tablet_id, |
6190 | 47.8k | const TableInfoPtr& table) { |
6191 | 47.8k | shared_ptr<TSDescriptor> ts_desc; |
6192 | 47.8k | if (!master_->ts_manager()->LookupTSByUUID(tserver_uuid, &ts_desc)) { |
6193 | 0 | LOG(WARNING) << "Unable to find tablet server " << tserver_uuid; |
6194 | 47.8k | } else if (!ts_desc->IsTabletDeletePending(tablet_id)) { |
6195 | 952 | LOG(WARNING) << "Pending delete for tablet " << tablet_id << " in ts " |
6196 | 952 | << tserver_uuid << " doesn't exist"; |
6197 | 46.9k | } else { |
6198 | 46.9k | LOG(INFO) << "Clearing pending delete for tablet " << tablet_id << " in ts " << tserver_uuid; |
6199 | 46.9k | ts_desc->ClearPendingTabletDelete(tablet_id); |
6200 | 46.9k | } |
6201 | 47.8k | CheckTableDeleted(table); |
6202 | 47.8k | } |
6203 | | |
6204 | | bool CatalogManager::ReplicaMapDiffersFromConsensusState(const scoped_refptr<TabletInfo>& tablet, |
6205 | 214k | const ConsensusStatePB& cstate) { |
6206 | 214k | auto locs = tablet->GetReplicaLocations(); |
6207 | 214k | if (locs->size() != implicit_cast<size_t>(cstate.config().peers_size())) { |
6208 | 28.5k | return true; |
6209 | 28.5k | } |
6210 | 760k | for (auto iter = cstate.config().peers().begin(); iter != cstate.config().peers().end(); iter++) { |
6211 | 575k | if (locs->find(iter->permanent_uuid()) == locs->end()) { |
6212 | 0 | return true; |
6213 | 0 | } |
6214 | 575k | } |
6215 | 185k | return false; |
6216 | 185k | } |
6217 | | |
6218 | | namespace { |
6219 | | |
6220 | 513k | int64_t GetCommittedConsensusStateOpIdIndex(const ReportedTabletPB& report) { |
6221 | 513k | if (!report.has_committed_consensus_state() || |
6222 | 510k | !report.committed_consensus_state().config().has_opid_index()) { |
6223 | 2.39k | return consensus::kInvalidOpIdIndex; |
6224 | 2.39k | } |
6225 | | |
6226 | 510k | return report.committed_consensus_state().config().opid_index(); |
6227 | 510k | } |
6228 | | |
6229 | | } // namespace |
6230 | | |
6231 | | bool CatalogManager::ProcessCommittedConsensusState( |
6232 | | TSDescriptor* ts_desc, |
6233 | | bool is_incremental, |
6234 | | const ReportedTabletPB& report, |
6235 | | const TableInfo::WriteLock& table_lock, |
6236 | | const TabletInfoPtr& tablet, |
6237 | | const TabletInfo::WriteLock& tablet_lock, |
6238 | 257k | std::vector<RetryingTSRpcTaskPtr>* rpcs) { |
6239 | 257k | const ConsensusStatePB& prev_cstate = tablet_lock->pb.committed_consensus_state(); |
6240 | 257k | ConsensusStatePB cstate = report.committed_consensus_state(); |
6241 | 257k | bool tablet_was_mutated = false; |
6242 | | |
6243 | | // 6a. The master only processes reports for replicas with committed |
6244 | | // consensus configurations since it needs the committed index to only |
6245 | | // cache the most up-to-date config. Since it's possible for TOMBSTONED |
6246 | | // replicas with no ConsensusMetadata on disk to be reported as having no |
6247 | | // committed config opid_index, we skip over those replicas. |
6248 | 257k | if (!cstate.config().has_opid_index()) { |
6249 | 0 | LOG(WARNING) << "Missing opid_index in reported config: " << report.ShortDebugString(); |
6250 | 0 | return false; |
6251 | 0 | } |
6252 | 257k | if (PREDICT_TRUE(FLAGS_master_ignore_stale_cstate) && |
6253 | 257k | (cstate.current_term() < prev_cstate.current_term() || |
6254 | 253k | GetCommittedConsensusStateOpIdIndex(report) < prev_cstate.config().opid_index())) { |
6255 | 6.93k | LOG(WARNING) << "Stale heartbeat for Tablet " << tablet->ToString() |
6256 | 6.93k | << " on TS " << ts_desc->permanent_uuid() |
6257 | 6.93k | << "cstate=" << cstate.ShortDebugString() |
6258 | 6.93k | << ", prev_cstate=" << prev_cstate.ShortDebugString(); |
6259 | 6.93k | return false; |
6260 | 6.93k | } |
6261 | | |
6262 | | // 6b. Disregard the leader state if the reported leader is not a member |
6263 | | // of the committed config. |
6264 | 250k | if (cstate.leader_uuid().empty() || |
6265 | 154k | !IsRaftConfigMember(cstate.leader_uuid(), cstate.config())) { |
6266 | 95.6k | cstate.clear_leader_uuid(); |
6267 | 95.6k | tablet_was_mutated = true; |
6268 | 95.6k | } |
6269 | | |
6270 | | // 6c. Mark the tablet as RUNNING if it makes sense to do so. |
6271 | | // |
6272 | | // We need to wait for a leader before marking a tablet as RUNNING, or |
6273 | | // else we could incorrectly consider a tablet created when only a |
6274 | | // minority of its replicas were successful. In that case, the tablet |
6275 | | // would be stuck in this bad state forever. |
6276 | | // - FLAG added to avoid waiting during mock tests. |
6277 | 250k | if (!tablet_lock->is_running() && |
6278 | 121k | report.state() == tablet::RUNNING && |
6279 | 121k | (cstate.has_leader_uuid() || |
6280 | 93.6k | !FLAGS_catalog_manager_wait_for_new_tablets_to_elect_leader)) { |
6281 | 0 | DCHECK_EQ(SysTabletsEntryPB::CREATING, tablet_lock->pb.state()) |
6282 | 0 | << "Tablet in unexpected state: " << tablet->ToString() |
6283 | 0 | << ": " << tablet_lock->pb.ShortDebugString(); |
6284 | 0 | VLOG(1) << "Tablet " << tablet->ToString() << " is now online"; |
6285 | 28.1k | tablet_lock.mutable_data()->set_state(SysTabletsEntryPB::RUNNING, |
6286 | 28.1k | "Tablet reported with an active leader"); |
6287 | 28.1k | tablet_was_mutated = true; |
6288 | 28.1k | } |
6289 | | |
6290 | | // 6d. Update the consensus state if: |
6291 | | // - A config change operation was committed (reflected by a change to |
6292 | | // the committed config's opid_index). |
6293 | | // - The new cstate has a leader, and either the old cstate didn't, or |
6294 | | // there was a term change. |
6295 | 250k | if (cstate.config().opid_index() > prev_cstate.config().opid_index() || |
6296 | 247k | (cstate.has_leader_uuid() && |
6297 | 151k | (!prev_cstate.has_leader_uuid() || |
6298 | 123k | cstate.current_term() > prev_cstate.current_term()))) { |
6299 | | |
6300 | | // 6d(i). Retain knowledge of the leader even if it wasn't reported in |
6301 | | // the latest config. |
6302 | | // |
6303 | | // When a config change is reported to the master, it may not include the |
6304 | | // leader because the follower doing the reporting may not know who the |
6305 | | // leader is yet (it may have just started up). It is safe to reuse |
6306 | | // the previous leader if the reported cstate has the same term as the |
6307 | | // previous cstate, and the leader was known for that term. |
6308 | 35.7k | if (cstate.current_term() == prev_cstate.current_term()) { |
6309 | 2.72k | if (!cstate.has_leader_uuid() && prev_cstate.has_leader_uuid()) { |
6310 | 1 | cstate.set_leader_uuid(prev_cstate.leader_uuid()); |
6311 | | // Sanity check to detect consensus divergence bugs. |
6312 | 2.72k | } else if (cstate.has_leader_uuid() && prev_cstate.has_leader_uuid() && |
6313 | 2.72k | cstate.leader_uuid() != prev_cstate.leader_uuid()) { |
6314 | 0 | string msg = Substitute("Previously reported cstate for tablet $0 gave " |
6315 | 0 | "a different leader for term $1 than the current cstate. " |
6316 | 0 | "Previous cstate: $2. Current cstate: $3.", |
6317 | 0 | tablet->ToString(), cstate.current_term(), |
6318 | 0 | prev_cstate.ShortDebugString(), cstate.ShortDebugString()); |
6319 | 0 | LOG(DFATAL) << msg; |
6320 | 0 | return false; |
6321 | 0 | } |
6322 | 35.7k | } |
6323 | | |
6324 | | // 6d(ii). Delete any replicas from the previous config that are not in the new one. |
6325 | 35.7k | if (FLAGS_master_tombstone_evicted_tablet_replicas) { |
6326 | 35.7k | std::unordered_set<string> current_member_uuids; |
6327 | 107k | for (const consensus::RaftPeerPB &peer : cstate.config().peers()) { |
6328 | 107k | InsertOrDie(¤t_member_uuids, peer.permanent_uuid()); |
6329 | 107k | } |
6330 | 107k | for (const consensus::RaftPeerPB &prev_peer : prev_cstate.config().peers()) { |
6331 | 107k | const string& peer_uuid = prev_peer.permanent_uuid(); |
6332 | 107k | if (!ContainsKey(current_member_uuids, peer_uuid)) { |
6333 | | // Don't delete a tablet server that hasn't reported in yet (Bootstrapping). |
6334 | 829 | shared_ptr<TSDescriptor> dummy_ts_desc; |
6335 | 829 | if (!master_->ts_manager()->LookupTSByUUID(peer_uuid, &dummy_ts_desc)) { |
6336 | 9 | continue; |
6337 | 9 | } |
6338 | | // Otherwise, the TabletServer needs to remove this peer. |
6339 | 820 | rpcs->push_back(std::make_shared<AsyncDeleteReplica>( |
6340 | 820 | master_, AsyncTaskPool(), peer_uuid, tablet->table(), tablet->tablet_id(), |
6341 | 820 | TABLET_DATA_TOMBSTONED, prev_cstate.config().opid_index(), |
6342 | 820 | Substitute("TS $0 not found in new config with opid_index $1", |
6343 | 820 | peer_uuid, cstate.config().opid_index()))); |
6344 | 820 | } |
6345 | 107k | } |
6346 | 35.7k | } |
6347 | | // 6d(iii). Update the in-memory ReplicaLocations for this tablet using the new config. |
6348 | 3 | VLOG(2) << "Updating replicas for tablet " << tablet->tablet_id() |
6349 | 3 | << " using config reported by " << ts_desc->permanent_uuid() |
6350 | 3 | << " to that committed in log index " << cstate.config().opid_index() |
6351 | 3 | << " with leader state from term " << cstate.current_term(); |
6352 | 35.7k | ReconcileTabletReplicasInLocalMemoryWithReport( |
6353 | 35.7k | tablet, ts_desc->permanent_uuid(), cstate, report); |
6354 | | |
6355 | | // 6d(iv). Update the consensus state. Don't use 'prev_cstate' after this. |
6356 | 35.7k | LOG(INFO) << "Tablet: " << tablet->tablet_id() << " reported consensus state change." |
6357 | 35.7k | << " New consensus state: " << cstate.ShortDebugString() |
6358 | 35.7k | << " from " << ts_desc->permanent_uuid(); |
6359 | 35.7k | *tablet_lock.mutable_data()->pb.mutable_committed_consensus_state() = cstate; |
6360 | 35.7k | tablet_was_mutated = true; |
6361 | 214k | } else { |
6362 | | // Report opid_index is equal to the previous opid_index. If some |
6363 | | // replica is reporting the same consensus configuration we already know about, but we |
6364 | | // haven't yet heard from all the tservers in the config, update the in-memory |
6365 | | // ReplicaLocations. |
6366 | 214k | LOG(INFO) << "Peer " << ts_desc->permanent_uuid() << " sent " |
6367 | 213k | << (is_incremental ? "incremental" : "full tablet") |
6368 | 214k | << " report for " << tablet->tablet_id() |
6369 | 214k | << ", prev state op id: " << prev_cstate.config().opid_index() |
6370 | 214k | << ", prev state term: " << prev_cstate.current_term() |
6371 | 214k | << ", prev state has_leader_uuid: " << prev_cstate.has_leader_uuid() |
6372 | 214k | << ". Consensus state: " << cstate.ShortDebugString(); |
6373 | 214k | if (GetAtomicFlag(&FLAGS_enable_register_ts_from_raft) && |
6374 | 214k | ReplicaMapDiffersFromConsensusState(tablet, cstate)) { |
6375 | 28.5k | ReconcileTabletReplicasInLocalMemoryWithReport( |
6376 | 28.5k | tablet, ts_desc->permanent_uuid(), cstate, report); |
6377 | 185k | } else { |
6378 | 185k | UpdateTabletReplicaInLocalMemory(ts_desc, &cstate, report, tablet); |
6379 | 185k | } |
6380 | 214k | } |
6381 | | |
6382 | 250k | if (FLAGS_use_create_table_leader_hint && |
6383 | 249k | !cstate.has_leader_uuid() && cstate.current_term() == 0) { |
6384 | 91.8k | StartElectionIfReady(cstate, tablet.get()); |
6385 | 91.8k | } |
6386 | | |
6387 | | // 7. Send an AlterSchema RPC if the tablet has an old schema version. |
6388 | 250k | if (report.has_schema_version() && |
6389 | 250k | report.schema_version() != table_lock->pb.version()) { |
6390 | 45 | if (report.schema_version() > table_lock->pb.version()) { |
6391 | 0 | LOG(ERROR) << "TS " << ts_desc->permanent_uuid() |
6392 | 0 | << " has reported a schema version greater than the current one " |
6393 | 0 | << " for tablet " << tablet->ToString() |
6394 | 0 | << ". Expected version " << table_lock->pb.version() |
6395 | 0 | << " got " << report.schema_version() |
6396 | 0 | << " (corruption)"; |
6397 | 45 | } else { |
6398 | | // TODO: For Alter (rolling apply to tablets), this is an expected transitory state. |
6399 | 45 | LOG(INFO) << "TS " << ts_desc->permanent_uuid() |
6400 | 45 | << " does not have the latest schema for tablet " << tablet->ToString() |
6401 | 45 | << ". Expected version " << table_lock->pb.version() |
6402 | 45 | << " got " << report.schema_version(); |
6403 | 45 | } |
6404 | | // It's possible that the tablet being reported is a laggy replica, and in fact |
6405 | | // the leader has already received an AlterTable RPC. That's OK, though -- |
6406 | | // it'll safely ignore it if we send another. |
6407 | 45 | TransactionId txn_id = TransactionId::Nil(); |
6408 | 45 | if (table_lock->pb.has_transaction() && |
6409 | 8 | table_lock->pb.transaction().has_transaction_id()) { |
6410 | 8 | LOG(INFO) << "Parsing transaction ID for tablet ID " << tablet->tablet_id(); |
6411 | 8 | auto txn_id_res = FullyDecodeTransactionId(table_lock->pb.transaction().transaction_id()); |
6412 | 8 | if (!txn_id_res.ok()) { |
6413 | 0 | LOG(WARNING) << "Parsing transaction ID failed for tablet ID " << tablet->tablet_id(); |
6414 | 0 | return false; |
6415 | 0 | } |
6416 | 8 | txn_id = txn_id_res.get(); |
6417 | 8 | } |
6418 | 45 | LOG(INFO) << "Triggering AlterTable with transaction ID " << txn_id |
6419 | 45 | << " due to heartbeat delay for tablet ID " << tablet->tablet_id(); |
6420 | 45 | rpcs->push_back(std::make_shared<AsyncAlterTable>( |
6421 | 45 | master_, AsyncTaskPool(), tablet, tablet->table(), txn_id)); |
6422 | 45 | } |
6423 | | |
6424 | 250k | return tablet_was_mutated; |
6425 | 250k | } |
6426 | | |
6427 | | Status CatalogManager::ProcessTabletReportBatch( |
6428 | | TSDescriptor* ts_desc, |
6429 | | bool is_incremental, |
6430 | | ReportedTablets::const_iterator begin, |
6431 | | ReportedTablets::const_iterator end, |
6432 | | TabletReportUpdatesPB* full_report_update, |
6433 | 259k | std::vector<RetryingTSRpcTaskPtr>* rpcs) { |
6434 | | // 1. First Pass. Iterate in TabletId Order to discover all Table locks we'll need. Even though |
6435 | | // read locks are sufficient here, take write locks since we'll be writing to the tablet while |
6436 | | // holding this. |
6437 | | // Need to acquire both types of locks in Id order to prevent deadlock. |
6438 | 259k | std::map<TableId, TableInfo::WriteLock> table_write_locks; |
6439 | 518k | for (auto it = begin; it != end; ++it) { |
6440 | 258k | auto& lock = table_write_locks[it->info->table()->id()]; |
6441 | 258k | if (!lock.locked()) { |
6442 | 258k | lock = it->info->table()->LockForWrite(); |
6443 | 258k | } |
6444 | 258k | } |
6445 | | |
6446 | 259k | map<TabletId, TabletInfo::WriteLock> tablet_write_locks; // used for unlock. |
6447 | | // 2. Second Pass. Process each tablet. This may not be in the order that the tablets |
6448 | | // appear in 'full_report', but that has no bearing on correctness. |
6449 | 259k | vector<TabletInfo*> mutated_tablets; // refcount protected by 'tablet_infos' |
6450 | 519k | for (auto it = begin; it != end; ++it) { |
6451 | 259k | const auto& tablet_id = it->tablet_id; |
6452 | 259k | const TabletInfoPtr& tablet = it->info; |
6453 | 259k | const ReportedTabletPB& report = *it->report; |
6454 | 259k | const TableInfoPtr& table = tablet->table(); |
6455 | | |
6456 | | // Prepare an heartbeat response entry for this tablet, now that we're going to process it. |
6457 | | // Every tablet in the report that is processed gets one, even if there are no changes to it. |
6458 | 259k | ReportedTabletUpdatesPB* update = full_report_update->add_tablets(); |
6459 | 259k | update->set_tablet_id(tablet_id); |
6460 | | |
6461 | | // Get tablet lock on demand. This works in the batch case because the loop is ordered. |
6462 | 259k | tablet_write_locks[tablet_id] = tablet->LockForWrite(); |
6463 | 259k | auto& table_lock = table_write_locks[table->id()]; |
6464 | 259k | auto& tablet_lock = tablet_write_locks[tablet_id]; |
6465 | | |
6466 | 259k | TRACE_EVENT1("master", "HandleReportedTablet", "tablet_id", report.tablet_id()); |
6467 | 259k | RETURN_NOT_OK_PREPEND(CheckIsLeaderAndReady(), |
6468 | 259k | Substitute("This master is no longer the leader, unable to handle report for tablet $0", |
6469 | 259k | tablet_id)); |
6470 | | |
6471 | 18.4E | VLOG(3) << "tablet report: " << report.ShortDebugString(); |
6472 | | |
6473 | | // 3. Delete the tablet if it (or its table) have been deleted. |
6474 | 259k | if (tablet_lock->is_deleted() || |
6475 | 259k | table_lock->started_deleting()) { |
6476 | 244 | const string msg = tablet_lock->pb.state_msg(); |
6477 | 244 | update->set_state_msg(msg); |
6478 | 244 | LOG(INFO) << "Got report from deleted tablet " << tablet->ToString() |
6479 | 244 | << " (" << msg << "): Sending delete request for this tablet"; |
6480 | | // TODO(unknown): Cancel tablet creation, instead of deleting, in cases |
6481 | | // where that might be possible (tablet creation timeout & replacement). |
6482 | 244 | rpcs->push_back(std::make_shared<AsyncDeleteReplica>( |
6483 | 244 | master_, AsyncTaskPool(), ts_desc->permanent_uuid(), table, tablet_id, |
6484 | 244 | TABLET_DATA_DELETED, boost::none, msg)); |
6485 | 244 | ts_desc->AddPendingTabletDelete(tablet_id); |
6486 | 244 | continue; |
6487 | 244 | } |
6488 | | |
6489 | 259k | if (!table_lock->is_running()) { |
6490 | 0 | const string msg = tablet_lock->pb.state_msg(); |
6491 | 0 | LOG(INFO) << "Got report from tablet " << tablet->tablet_id() |
6492 | 0 | << " for non-running table " << table->ToString() << ": " << msg; |
6493 | 0 | update->set_state_msg(msg); |
6494 | 0 | continue; |
6495 | 0 | } |
6496 | | |
6497 | | // 3. Tombstone a replica that is no longer part of the Raft config (and |
6498 | | // not already tombstoned or deleted outright). |
6499 | | // |
6500 | | // If the report includes a committed raft config, we only tombstone if |
6501 | | // the opid_index is strictly less than the latest reported committed |
6502 | | // config. This prevents us from spuriously deleting replicas that have |
6503 | | // just been added to the committed config and are in the process of copying. |
6504 | 259k | const ConsensusStatePB& prev_cstate = tablet_lock->pb.committed_consensus_state(); |
6505 | 259k | const int64_t prev_opid_index = prev_cstate.config().opid_index(); |
6506 | 259k | const int64_t report_opid_index = GetCommittedConsensusStateOpIdIndex(report); |
6507 | 259k | if (FLAGS_master_tombstone_evicted_tablet_replicas && |
6508 | 259k | report.tablet_data_state() != TABLET_DATA_TOMBSTONED && |
6509 | 259k | report.tablet_data_state() != TABLET_DATA_DELETED && |
6510 | 259k | report_opid_index < prev_opid_index && |
6511 | 6.02k | !IsRaftConfigMember(ts_desc->permanent_uuid(), prev_cstate.config())) { |
6512 | 173 | const string delete_msg = (report_opid_index == consensus::kInvalidOpIdIndex) ? |
6513 | 16 | "Replica has no consensus available" : |
6514 | 157 | Substitute("Replica with old config index $0", report_opid_index); |
6515 | 173 | rpcs->push_back(std::make_shared<AsyncDeleteReplica>( |
6516 | 173 | master_, AsyncTaskPool(), ts_desc->permanent_uuid(), table, tablet_id, |
6517 | 173 | TABLET_DATA_TOMBSTONED, prev_opid_index, |
6518 | 173 | Substitute("$0 (current committed config index is $1)", |
6519 | 173 | delete_msg, prev_opid_index))); |
6520 | 173 | ts_desc->AddPendingTabletDelete(tablet_id); |
6521 | 173 | continue; |
6522 | 173 | } |
6523 | | |
6524 | | // 4. Skip a non-deleted tablet which reports an error. |
6525 | 259k | if (report.has_error()) { |
6526 | 0 | Status s = StatusFromPB(report.error()); |
6527 | 0 | DCHECK(!s.ok()); |
6528 | 0 | DCHECK_EQ(report.state(), tablet::FAILED); |
6529 | 0 | LOG(WARNING) << "Tablet " << tablet->ToString() << " has failed on TS " |
6530 | 0 | << ts_desc->permanent_uuid() << ": " << s.ToString(); |
6531 | 0 | continue; |
6532 | 0 | } |
6533 | | |
6534 | | // Hide the tablet if it (or its table) has been hidden and the tablet hasn't. |
6535 | 259k | if ((tablet_lock->is_hidden() || |
6536 | 259k | table_lock->started_hiding()) && |
6537 | 0 | report.has_is_hidden() && |
6538 | 0 | !report.is_hidden()) { |
6539 | 0 | const string msg = tablet_lock->pb.state_msg(); |
6540 | 0 | LOG(INFO) << "Got report from hidden tablet " << tablet->ToString() |
6541 | 0 | << " (" << msg << "): Sending hide request for this tablet"; |
6542 | 0 | auto task = std::make_shared<AsyncDeleteReplica>( |
6543 | 0 | master_, AsyncTaskPool(), ts_desc->permanent_uuid(), table, tablet_id, |
6544 | 0 | TABLET_DATA_DELETED, boost::none, msg); |
6545 | 0 | task->set_hide_only(true); |
6546 | 0 | ts_desc->AddPendingTabletDelete(tablet_id); |
6547 | 0 | rpcs->push_back(task); |
6548 | 0 | } |
6549 | | |
6550 | | // 5. Process the report's consensus state. |
6551 | | // The report will not have a committed_consensus_state if it is in the |
6552 | | // middle of starting up, such as during tablet bootstrap. |
6553 | | // If we received an incremental report, and the tablet is starting up, we will update the |
6554 | | // replica so that the balancer knows how many tablets are in the middle of remote bootstrap. |
6555 | 259k | if (report.has_committed_consensus_state()) { |
6556 | 257k | if (ProcessCommittedConsensusState( |
6557 | 131k | ts_desc, is_incremental, report, table_lock, tablet, tablet_lock, rpcs)) { |
6558 | | // 6. If the tablet was mutated, add it to the tablets to be re-persisted. |
6559 | | // |
6560 | | // Done here and not on a per-mutation basis to avoid duplicate entries. |
6561 | 131k | mutated_tablets.push_back(tablet.get()); |
6562 | 131k | } |
6563 | 2.35k | } else if (is_incremental && |
6564 | 2.29k | (report.state() == tablet::NOT_STARTED || report.state() == tablet::BOOTSTRAPPING)) { |
6565 | | // When a tablet server is restarted, it sends a full tablet report with all of its tablets |
6566 | | // in the NOT_STARTED state, so this would make the load balancer think that all the |
6567 | | // tablets are being remote bootstrapped at once, so only process incremental reports here. |
6568 | 2.29k | UpdateTabletReplicaInLocalMemory(ts_desc, nullptr /* consensus */, report, tablet); |
6569 | 2.29k | } |
6570 | 259k | } // Finished one round of batch processing. |
6571 | | |
6572 | | // 7. Unlock the tables; we no longer need to access their state. |
6573 | 259k | for (auto& l : table_write_locks) { |
6574 | 259k | l.second.Unlock(); |
6575 | 259k | } |
6576 | 259k | table_write_locks.clear(); |
6577 | | |
6578 | | // 8. Write all tablet mutations to the catalog table. |
6579 | | // |
6580 | | // SysCatalogTable::Write will short-circuit the case where the data has not |
6581 | | // in fact changed since the previous version and avoid any unnecessary mutations. |
6582 | 259k | if (!mutated_tablets.empty()) { |
6583 | 131k | Status s = sys_catalog_->Upsert(leader_ready_term(), mutated_tablets); |
6584 | 131k | if (!s.ok()) { |
6585 | 0 | LOG(WARNING) << "Error updating tablets: " << s; |
6586 | 0 | return s; |
6587 | 0 | } |
6588 | 259k | } |
6589 | | // Filter the mutated tablets to find which tablets were modified. Need to actually commit the |
6590 | | // state of the tablets before updating the system.partitions table, so get this first. |
6591 | 259k | vector<TabletInfoPtr> yql_partitions_mutated_tablets = |
6592 | 259k | VERIFY_RESULT(GetYqlPartitionsVtable().FilterRelevantTablets(mutated_tablets)); |
6593 | | |
6594 | | // 9. Publish the in-memory tablet mutations and release the locks. |
6595 | 259k | for (auto& l : tablet_write_locks) { |
6596 | 259k | l.second.Commit(); |
6597 | 259k | } |
6598 | 259k | tablet_write_locks.clear(); |
6599 | | |
6600 | | // Update the relevant tablet entries in system.partitions. |
6601 | 259k | if (!yql_partitions_mutated_tablets.empty()) { |
6602 | 17.7k | Status s = GetYqlPartitionsVtable() |
6603 | 17.7k | .ProcessMutatedTablets(yql_partitions_mutated_tablets, tablet_write_locks); |
6604 | 17.7k | } |
6605 | | |
6606 | | // 10. Third Pass. Process all tablet schema version changes. |
6607 | | // (This is separate from tablet state mutations because only table on-disk state is changed.) |
6608 | 519k | for (auto it = begin; it != end; ++it) { |
6609 | 259k | const ReportedTabletPB& report = *it->report; |
6610 | 259k | if (!report.has_schema_version()) { |
6611 | 0 | continue; |
6612 | 0 | } |
6613 | 259k | const TabletInfoPtr& tablet = it->info; |
6614 | 259k | auto leader = tablet->GetLeader(); |
6615 | 259k | if (leader.ok() && leader.get()->permanent_uuid() == ts_desc->permanent_uuid()) { |
6616 | 36.3k | RETURN_NOT_OK(HandleTabletSchemaVersionReport(tablet.get(), report.schema_version())); |
6617 | 36.3k | } |
6618 | 259k | } |
6619 | | |
6620 | 259k | return Status::OK(); |
6621 | 259k | } |
6622 | | |
6623 | | Status CatalogManager::ProcessTabletReport(TSDescriptor* ts_desc, |
6624 | | const TabletReportPB& full_report, |
6625 | | TabletReportUpdatesPB* full_report_update, |
6626 | 383k | RpcContext* rpc) { |
6627 | 383k | int num_tablets = full_report.updated_tablets_size(); |
6628 | 383k | TRACE_EVENT2("master", "ProcessTabletReport", |
6629 | 383k | "requestor", rpc->requestor_string(), |
6630 | 383k | "num_tablets", num_tablets); |
6631 | | |
6632 | 566 | VLOG_WITH_PREFIX(2) << "Received tablet report from " << RequestorString(rpc) << "(" |
6633 | 566 | << ts_desc->permanent_uuid() << "): " << full_report.DebugString(); |
6634 | | |
6635 | 383k | if (!ts_desc->has_tablet_report() && full_report.is_incremental()) { |
6636 | 5.45k | LOG_WITH_PREFIX(WARNING) |
6637 | 5.45k | << "Invalid tablet report from " << ts_desc->permanent_uuid() |
6638 | 5.45k | << ": Received an incremental tablet report when a full one was needed"; |
6639 | | // We should respond with success in order to send reply that we need full report. |
6640 | 5.45k | return Status::OK(); |
6641 | 5.45k | } |
6642 | | |
6643 | | // TODO: on a full tablet report, we may want to iterate over the tablets we think |
6644 | | // the server should have, compare vs the ones being reported, and somehow mark |
6645 | | // any that have been "lost" (eg somehow the tablet metadata got corrupted or something). |
6646 | | |
6647 | 377k | ReportedTablets reported_tablets; |
6648 | | |
6649 | | // Tablet Deletes to process after the catalog lock below. |
6650 | 377k | set<TabletId> tablets_to_delete; |
6651 | | |
6652 | 377k | { |
6653 | | // Lock the catalog to iterate over tablet_ids_map_ & table_ids_map_. |
6654 | 377k | SharedLock lock(mutex_); |
6655 | | |
6656 | | // Fill the above variables before processing |
6657 | 377k | full_report_update->mutable_tablets()->Reserve(num_tablets); |
6658 | 261k | for (const ReportedTabletPB& report : full_report.updated_tablets()) { |
6659 | 261k | const string& tablet_id = report.tablet_id(); |
6660 | | |
6661 | | // 1a. Find the tablet, deleting/skipping it if it can't be found. |
6662 | 261k | scoped_refptr<TabletInfo> tablet = FindPtrOrNull(*tablet_map_, tablet_id); |
6663 | 261k | if (!tablet) { |
6664 | | // If a TS reported an unknown tablet, send a delete tablet rpc to the TS. |
6665 | 0 | LOG(INFO) << "Null tablet reported, possibly the TS was not around when the" |
6666 | 0 | " table was being deleted. Sending Delete tablet RPC to this TS."; |
6667 | 0 | tablets_to_delete.insert(tablet_id); |
6668 | | // Every tablet in the report that is processed gets a heartbeat response entry. |
6669 | 0 | ReportedTabletUpdatesPB* update = full_report_update->add_tablets(); |
6670 | 0 | update->set_tablet_id(tablet_id); |
6671 | 0 | continue; |
6672 | 0 | } |
6673 | 261k | if (!tablet->table() || FindOrNull(*table_ids_map_, tablet->table()->id()) == nullptr) { |
6674 | 0 | auto table_id = tablet->table() == nullptr ? "(null)" : tablet->table()->id(); |
6675 | 0 | LOG(INFO) << "Got report from an orphaned tablet " << tablet_id << " on table " << table_id; |
6676 | 0 | tablets_to_delete.insert(tablet_id); |
6677 | | // Every tablet in the report that is processed gets a heartbeat response entry. |
6678 | 0 | ReportedTabletUpdatesPB* update = full_report_update->add_tablets(); |
6679 | 0 | update->set_tablet_id(tablet_id); |
6680 | 0 | continue; |
6681 | 0 | } |
6682 | | |
6683 | | // 1b. Found the tablet, update local state. |
6684 | 261k | reported_tablets.push_back(ReportedTablet { |
6685 | 261k | .tablet_id = tablet_id, |
6686 | 261k | .info = tablet, |
6687 | 261k | .report = &report, |
6688 | 261k | }); |
6689 | 261k | } |
6690 | 377k | } |
6691 | | |
6692 | 122k | std::sort(reported_tablets.begin(), reported_tablets.end(), [](const auto& lhs, const auto& rhs) { |
6693 | 122k | return lhs.tablet_id < rhs.tablet_id; |
6694 | 122k | }); |
6695 | | |
6696 | | // Process any delete requests from orphaned tablets, identified above. |
6697 | 0 | for (auto tablet_id : tablets_to_delete) { |
6698 | 0 | SendDeleteTabletRequest(tablet_id, TABLET_DATA_DELETED, boost::none, nullptr, ts_desc, |
6699 | 0 | "Report from an orphaned tablet"); |
6700 | 0 | } |
6701 | | |
6702 | | // Calculate the deadline for this expensive loop coming up. |
6703 | 377k | const auto safe_deadline = rpc->GetClientDeadline() - |
6704 | 377k | (FLAGS_heartbeat_rpc_timeout_ms * 1ms * FLAGS_heartbeat_safe_deadline_ratio); |
6705 | | |
6706 | | // Process tablets by batches. |
6707 | 636k | for (auto tablet_iter = reported_tablets.begin(); tablet_iter != reported_tablets.end();) { |
6708 | 259k | auto batch_begin = tablet_iter; |
6709 | 259k | tablet_iter += std::min<size_t>( |
6710 | 259k | reported_tablets.end() - tablet_iter, FLAGS_catalog_manager_report_batch_size); |
6711 | | |
6712 | | // Keeps track of all RPCs that should be sent when we're done with a single batch. |
6713 | 259k | std::vector<RetryingTSRpcTaskPtr> rpcs; |
6714 | 259k | auto status = ProcessTabletReportBatch( |
6715 | 259k | ts_desc, full_report.is_incremental(), batch_begin, tablet_iter, full_report_update, &rpcs); |
6716 | 259k | if (!status.ok()) { |
6717 | 0 | for (auto& rpc : rpcs) { |
6718 | 0 | rpc->AbortAndReturnPrevState(status); |
6719 | 0 | } |
6720 | 0 | return status; |
6721 | 0 | } |
6722 | | |
6723 | | // 13. Send all queued RPCs. |
6724 | 259k | for (auto& rpc : rpcs) { |
6725 | 1.28k | DCHECK(rpc->table()); |
6726 | 1.28k | rpc->table()->AddTask(rpc); |
6727 | 1.28k | WARN_NOT_OK(ScheduleTask(rpc), Substitute("Failed to send $0", rpc->description())); |
6728 | 1.28k | } |
6729 | 259k | rpcs.clear(); |
6730 | | |
6731 | | // 14. Check deadline. Need to exit before processing all batches if we're close to timing out. |
6732 | 259k | if (ts_desc->HasCapability(CAPABILITY_TabletReportLimit) && |
6733 | 259k | tablet_iter != reported_tablets.end()) { |
6734 | | // [TESTING] Inject latency before processing a batch to test deadline. |
6735 | 91.1k | if (PREDICT_FALSE(FLAGS_TEST_inject_latency_during_tablet_report_ms > 0)) { |
6736 | 0 | LOG(INFO) << "Sleeping in CatalogManager::ProcessTabletReport for " |
6737 | 0 | << FLAGS_TEST_inject_latency_during_tablet_report_ms << " ms"; |
6738 | 0 | SleepFor(MonoDelta::FromMilliseconds(FLAGS_TEST_inject_latency_during_tablet_report_ms)); |
6739 | 0 | } |
6740 | | |
6741 | | // Return from here at configured safe heartbeat deadline to give the response packet time. |
6742 | 91.1k | if (safe_deadline < CoarseMonoClock::Now()) { |
6743 | 842 | LOG(INFO) << "Reached Heartbeat deadline. Returning early after processing " |
6744 | 842 | << full_report_update->tablets_size() << " tablets"; |
6745 | 842 | full_report_update->set_processing_truncated(true); |
6746 | 842 | return Status::OK(); |
6747 | 842 | } |
6748 | 91.1k | } |
6749 | 259k | } // Loop to process the next batch until fully iterated. |
6750 | | |
6751 | 377k | if (!full_report.is_incremental()) { |
6752 | | // A full report may take multiple heartbeats. |
6753 | | // The TS communicates how much is left to process for the full report beyond this specific HB. |
6754 | 5.58k | bool completed_full_report = !full_report.has_remaining_tablet_count() |
6755 | 5.58k | || full_report.remaining_tablet_count() == 0; |
6756 | 5.58k | if (full_report.updated_tablets_size() == 0) { |
6757 | 5.44k | LOG(INFO) << ts_desc->permanent_uuid() << " sent full tablet report with 0 tablets."; |
6758 | 137 | } else if (!ts_desc->has_tablet_report()) { |
6759 | 137 | LOG(INFO) << ts_desc->permanent_uuid() |
6760 | 137 | << (completed_full_report ? " finished" : " receiving") << " first full report: " |
6761 | 137 | << full_report.updated_tablets_size() << " tablets."; |
6762 | 137 | } |
6763 | | // We have a tablet report only once we're done processing all the chunks of the initial report. |
6764 | 5.58k | ts_desc->set_has_tablet_report(completed_full_report); |
6765 | 5.58k | } |
6766 | | |
6767 | | // 14. Queue background processing if we had updates. |
6768 | 377k | if (full_report.updated_tablets_size() > 0) { |
6769 | 168k | background_tasks_->WakeIfHasPendingUpdates(); |
6770 | 168k | } |
6771 | | |
6772 | 377k | return Status::OK(); |
6773 | 377k | } |
6774 | | |
6775 | | Status CatalogManager::CreateTablegroup(const CreateTablegroupRequestPB* req, |
6776 | | CreateTablegroupResponsePB* resp, |
6777 | 3 | rpc::RpcContext* rpc) { |
6778 | | |
6779 | 3 | CreateTableRequestPB ctreq; |
6780 | 3 | CreateTableResponsePB ctresp; |
6781 | | |
6782 | | // Sanity check for PB fields. |
6783 | 3 | if (!req->has_id() || !req->has_namespace_id() || !req->has_namespace_name()) { |
6784 | 0 | Status s = STATUS(InvalidArgument, "Improper CREATE TABLEGROUP request (missing fields)."); |
6785 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
6786 | 0 | } |
6787 | | |
6788 | | // Use the tablegroup id as the prefix for the parent table id. |
6789 | 3 | const auto parent_table_id = req->id() + kTablegroupParentTableIdSuffix; |
6790 | 3 | const auto parent_table_name = req->id() + kTablegroupParentTableNameSuffix; |
6791 | 3 | ctreq.set_name(parent_table_name); |
6792 | 3 | ctreq.set_table_id(parent_table_id); |
6793 | 3 | ctreq.mutable_namespace_()->set_name(req->namespace_name()); |
6794 | 3 | ctreq.mutable_namespace_()->set_id(req->namespace_id()); |
6795 | 3 | ctreq.set_table_type(PGSQL_TABLE_TYPE); |
6796 | 3 | ctreq.set_tablegroup_id(req->id()); |
6797 | 3 | ctreq.set_tablespace_id(req->tablespace_id()); |
6798 | | |
6799 | 3 | YBSchemaBuilder schemaBuilder; |
6800 | 3 | schemaBuilder.AddColumn("parent_column")->Type(BINARY)->PrimaryKey()->NotNull(); |
6801 | 3 | YBSchema ybschema; |
6802 | 3 | CHECK_OK(schemaBuilder.Build(&ybschema)); |
6803 | 3 | auto schema = yb::client::internal::GetSchema(ybschema); |
6804 | 3 | SchemaToPB(schema, ctreq.mutable_schema()); |
6805 | 3 | if (!FLAGS_TEST_tablegroup_master_only) { |
6806 | 2 | ctreq.mutable_schema()->mutable_table_properties()->set_is_transactional(true); |
6807 | 2 | } |
6808 | | |
6809 | | // Create a parent table, which will create the tablet. |
6810 | 3 | Status s = CreateTable(&ctreq, &ctresp, rpc); |
6811 | 3 | resp->set_parent_table_id(ctresp.table_id()); |
6812 | 3 | resp->set_parent_table_name(parent_table_name); |
6813 | | |
6814 | | // Carry over error. |
6815 | 3 | if (ctresp.has_error()) { |
6816 | 0 | resp->mutable_error()->Swap(ctresp.mutable_error()); |
6817 | 0 | } |
6818 | | |
6819 | | // We do not lock here so it is technically possible that the table was already created. |
6820 | | // If so, there is nothing to do so we just ignore the "AlreadyPresent" error. |
6821 | 3 | if (!s.ok() && !s.IsAlreadyPresent()) { |
6822 | 0 | LOG(WARNING) << "Tablegroup creation failed: " << s.ToString(); |
6823 | 0 | return s; |
6824 | 0 | } |
6825 | | |
6826 | | // Update catalog manager maps |
6827 | 3 | SharedLock lock(mutex_); |
6828 | 3 | TRACE("Acquired catalog manager lock"); |
6829 | 3 | TablegroupInfo *tg = new TablegroupInfo(req->id(), req->namespace_id()); |
6830 | 3 | tablegroup_ids_map_[req->id()] = tg; |
6831 | | |
6832 | 3 | return s; |
6833 | 3 | } |
6834 | | |
6835 | | Status CatalogManager::DeleteTablegroup(const DeleteTablegroupRequestPB* req, |
6836 | | DeleteTablegroupResponsePB* resp, |
6837 | 2 | rpc::RpcContext* rpc) { |
6838 | 2 | DeleteTableRequestPB dtreq; |
6839 | 2 | DeleteTableResponsePB dtresp; |
6840 | | |
6841 | | // Sanity check for PB fields |
6842 | 2 | if (!req->has_id() || !req->has_namespace_id()) { |
6843 | 0 | Status s = STATUS(InvalidArgument, "Improper DELETE TABLEGROUP request (missing fields)."); |
6844 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
6845 | 0 | } |
6846 | | |
6847 | | // Use the tablegroup id as the prefix for the parent table id. |
6848 | 2 | const auto parent_table_id = req->id() + kTablegroupParentTableIdSuffix; |
6849 | 2 | const auto parent_table_name = req->id() + kTablegroupParentTableNameSuffix; |
6850 | | |
6851 | 2 | dtreq.mutable_table()->set_table_name(parent_table_name); |
6852 | 2 | dtreq.mutable_table()->set_table_id(parent_table_id); |
6853 | 2 | dtreq.set_is_index_table(false); |
6854 | | |
6855 | 2 | Status s = DeleteTable(&dtreq, &dtresp, rpc); |
6856 | 2 | resp->set_parent_table_id(dtresp.table_id()); |
6857 | | |
6858 | | // Carry over error. |
6859 | 2 | if (dtresp.has_error()) { |
6860 | 0 | resp->mutable_error()->Swap(dtresp.mutable_error()); |
6861 | 0 | return s; |
6862 | 0 | } |
6863 | | |
6864 | | // Perform map updates. |
6865 | 2 | SharedLock lock(mutex_); |
6866 | 2 | TRACE("Acquired catalog manager lock"); |
6867 | 2 | tablegroup_ids_map_.erase(req->id()); |
6868 | 2 | tablegroup_tablet_ids_map_[req->namespace_id()].erase(req->id()); |
6869 | | |
6870 | 2 | LOG(INFO) << "Deleted table " << parent_table_name; |
6871 | 2 | return s; |
6872 | 2 | } |
6873 | | |
6874 | | Status CatalogManager::ListTablegroups(const ListTablegroupsRequestPB* req, |
6875 | | ListTablegroupsResponsePB* resp, |
6876 | 3 | rpc::RpcContext* rpc) { |
6877 | 3 | SharedLock lock(mutex_); |
6878 | | |
6879 | 3 | if (!req->has_namespace_id()) { |
6880 | 0 | Status s = STATUS(InvalidArgument, "Improper ListTablegroups request (missing fields)."); |
6881 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); |
6882 | 0 | } |
6883 | | |
6884 | 3 | if (tablegroup_tablet_ids_map_.find(req->namespace_id()) == tablegroup_tablet_ids_map_.end()) { |
6885 | 0 | return STATUS(NotFound, "Tablegroups not found for namespace id: ", req->namespace_id()); |
6886 | 0 | } |
6887 | | |
6888 | 3 | for (const auto& entry : tablegroup_tablet_ids_map_[req->namespace_id()]) { |
6889 | 3 | const TablegroupId tgid = entry.first; |
6890 | 3 | if (tablegroup_ids_map_.find(tgid) == tablegroup_ids_map_.end()) { |
6891 | 0 | LOG(WARNING) << "Tablegroup info in " << req->namespace_id() |
6892 | 0 | << " not found for tablegroup id: " << tgid; |
6893 | 0 | continue; |
6894 | 0 | } |
6895 | 3 | scoped_refptr<TablegroupInfo> tginfo = tablegroup_ids_map_[tgid]; |
6896 | | |
6897 | 3 | TablegroupIdentifierPB *tg = resp->add_tablegroups(); |
6898 | 3 | tg->set_id(tginfo->id()); |
6899 | 3 | tg->set_namespace_id(tginfo->namespace_id()); |
6900 | 3 | } |
6901 | 3 | return Status::OK(); |
6902 | 3 | } |
6903 | | |
6904 | 1 | bool CatalogManager::HasTablegroups() { |
6905 | 1 | SharedLock lock(mutex_); |
6906 | 1 | return !tablegroup_ids_map_.empty(); |
6907 | 1 | } |
6908 | | |
6909 | | Status CatalogManager::CreateNamespace(const CreateNamespaceRequestPB* req, |
6910 | | CreateNamespaceResponsePB* resp, |
6911 | 2.08k | rpc::RpcContext* rpc) { |
6912 | 2.08k | Status return_status; |
6913 | | |
6914 | | // Copy the request, so we can fill in some defaults. |
6915 | 2.08k | LOG(INFO) << "CreateNamespace from " << RequestorString(rpc) |
6916 | 2.08k | << ": " << req->DebugString(); |
6917 | | |
6918 | 2.08k | scoped_refptr<NamespaceInfo> ns; |
6919 | 2.08k | std::vector<scoped_refptr<TableInfo>> pgsql_tables; |
6920 | 2.08k | TransactionMetadata txn; |
6921 | 2.08k | const auto db_type = GetDatabaseType(*req); |
6922 | 2.08k | { |
6923 | 2.08k | LockGuard lock(mutex_); |
6924 | 2.08k | TRACE("Acquired catalog manager lock"); |
6925 | | |
6926 | | // Validate the user request. |
6927 | | |
6928 | | // Verify that the namespace does not already exist. |
6929 | 2.08k | ns = FindPtrOrNull(namespace_ids_map_, req->namespace_id()); // Same ID. |
6930 | 2.08k | if (ns == nullptr && db_type != YQL_DATABASE_PGSQL) { |
6931 | | // PGSQL databases have name uniqueness handled at a different layer, so ignore overlaps. |
6932 | 2.00k | ns = FindPtrOrNull(namespace_names_mapper_[db_type], req->name()); |
6933 | 2.00k | } |
6934 | 2.08k | if (ns != nullptr) { |
6935 | 4 | resp->set_id(ns->id()); |
6936 | 4 | return_status = STATUS_SUBSTITUTE(AlreadyPresent, "Keyspace '$0' already exists", |
6937 | 4 | req->name()); |
6938 | 4 | LOG(WARNING) << "Found keyspace: " << ns->id() << ". Failed creating keyspace with error: " |
6939 | 4 | << return_status.ToString() << " Request:\n" << req->DebugString(); |
6940 | 4 | return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_ALREADY_PRESENT, |
6941 | 4 | return_status); |
6942 | 4 | } |
6943 | | |
6944 | | // Add the new namespace. |
6945 | | |
6946 | | // Create unique id for this new namespace. |
6947 | 2.07k | NamespaceId new_id = !req->namespace_id().empty() |
6948 | 2.03k | ? req->namespace_id() : GenerateIdUnlocked(SysRowEntryType::NAMESPACE); |
6949 | 2.07k | ns = new NamespaceInfo(new_id); |
6950 | 2.07k | ns->mutable_metadata()->StartMutation(); |
6951 | 2.07k | SysNamespaceEntryPB *metadata = &ns->mutable_metadata()->mutable_dirty()->pb; |
6952 | 2.07k | metadata->set_name(req->name()); |
6953 | 2.07k | metadata->set_database_type(db_type); |
6954 | 2.07k | metadata->set_colocated(req->colocated()); |
6955 | 2.07k | metadata->set_state(SysNamespaceEntryPB::PREPARING); |
6956 | | |
6957 | | // For namespace created for a Postgres database, save the list of tables and indexes for |
6958 | | // for the database that need to be copied. |
6959 | 2.07k | if (db_type == YQL_DATABASE_PGSQL) { |
6960 | 81 | if (req->source_namespace_id().empty()) { |
6961 | 59 | metadata->set_next_pg_oid(req->next_pg_oid()); |
6962 | 22 | } else { |
6963 | 22 | const auto source_oid = GetPgsqlDatabaseOid(req->source_namespace_id()); |
6964 | 22 | if (!source_oid.ok()) { |
6965 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, |
6966 | 0 | source_oid.status()); |
6967 | 0 | } |
6968 | 13.1k | for (const auto& iter : *table_ids_map_) { |
6969 | 13.1k | const auto& table_id = iter.first; |
6970 | 13.1k | const auto& table = iter.second; |
6971 | 13.1k | if (IsPgsqlId(table_id) && CHECK_RESULT(GetPgsqlDatabaseOid(table_id)) == *source_oid) { |
6972 | | // Since indexes have dependencies on the base tables, put the tables in the front. |
6973 | 2.81k | const bool is_table = table->indexed_table_id().empty(); |
6974 | 1.58k | pgsql_tables.insert(is_table ? pgsql_tables.begin() : pgsql_tables.end(), table); |
6975 | 2.81k | } |
6976 | 13.1k | } |
6977 | | |
6978 | 22 | scoped_refptr<NamespaceInfo> source_ns = FindPtrOrNull(namespace_ids_map_, |
6979 | 22 | req->source_namespace_id()); |
6980 | 22 | if (!source_ns) { |
6981 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, |
6982 | 0 | STATUS(NotFound, "Source keyspace not found", |
6983 | 0 | req->source_namespace_id())); |
6984 | 0 | } |
6985 | 22 | auto source_ns_lock = source_ns->LockForRead(); |
6986 | 22 | metadata->set_next_pg_oid(source_ns_lock->pb.next_pg_oid()); |
6987 | 22 | } |
6988 | 81 | } |
6989 | | |
6990 | | // NS with a Transaction should be rolled back if the transaction does not get Committed. |
6991 | | // Store this on the NS for now and use it later. |
6992 | 2.07k | if (req->has_transaction() && PREDICT_TRUE(FLAGS_enable_transactional_ddl_gc)) { |
6993 | 22 | metadata->mutable_transaction()->CopyFrom(req->transaction()); |
6994 | 22 | txn = VERIFY_RESULT(TransactionMetadata::FromPB(req->transaction())); |
6995 | 22 | RSTATUS_DCHECK(!txn.status_tablet.empty(), Corruption, "Given incomplete Transaction"); |
6996 | 22 | } |
6997 | | |
6998 | | // Add the namespace to the in-memory map for the assignment. |
6999 | 2.07k | namespace_ids_map_[ns->id()] = ns; |
7000 | 2.07k | namespace_names_mapper_[db_type][req->name()] = ns; |
7001 | | |
7002 | 2.07k | resp->set_id(ns->id()); |
7003 | 2.07k | } |
7004 | 2.07k | TRACE("Inserted new keyspace info into CatalogManager maps"); |
7005 | | |
7006 | | // Update the on-disk system catalog. |
7007 | 2.07k | return_status = sys_catalog_->Upsert(leader_ready_term(), ns); |
7008 | 2.07k | if (!return_status.ok()) { |
7009 | 8 | LOG(WARNING) << "Keyspace creation failed:" << return_status.ToString(); |
7010 | 8 | { |
7011 | 8 | LockGuard lock(mutex_); |
7012 | 8 | namespace_ids_map_.erase(ns->id()); |
7013 | 8 | namespace_names_mapper_[db_type].erase(req->name()); |
7014 | 8 | } |
7015 | 8 | ns->mutable_metadata()->AbortMutation(); |
7016 | 8 | return CheckIfNoLongerLeaderAndSetupError(return_status, resp); |
7017 | 8 | } |
7018 | 2.07k | TRACE("Wrote keyspace to sys-catalog"); |
7019 | | // Commit the namespace in-memory state. |
7020 | 2.07k | ns->mutable_metadata()->CommitMutation(); |
7021 | | |
7022 | 2.07k | LOG(INFO) << "Created keyspace " << ns->ToString(); |
7023 | | |
7024 | 2.07k | if (req->has_creator_role_name()) { |
7025 | 904 | RETURN_NOT_OK(permissions_manager_->GrantPermissions( |
7026 | 904 | req->creator_role_name(), |
7027 | 904 | get_canonical_keyspace(req->name()), |
7028 | 904 | req->name() /* resource name */, |
7029 | 904 | req->name() /* keyspace name */, |
7030 | 904 | all_permissions_for_resource(ResourceType::KEYSPACE), |
7031 | 904 | ResourceType::KEYSPACE, |
7032 | 904 | resp)); |
7033 | 904 | } |
7034 | | |
7035 | | // Colocated databases need to create a parent tablet to serve as the base storage location. |
7036 | 2.07k | if (req->colocated()) { |
7037 | 6 | CreateTableRequestPB req; |
7038 | 6 | CreateTableResponsePB resp; |
7039 | 6 | const auto parent_table_id = ns->id() + kColocatedParentTableIdSuffix; |
7040 | 6 | const auto parent_table_name = ns->id() + kColocatedParentTableNameSuffix; |
7041 | 6 | req.set_name(parent_table_name); |
7042 | 6 | req.set_table_id(parent_table_id); |
7043 | 6 | req.mutable_namespace_()->set_name(ns->name()); |
7044 | 6 | req.mutable_namespace_()->set_id(ns->id()); |
7045 | 6 | req.set_table_type(GetTableTypeForDatabase(ns->database_type())); |
7046 | 6 | req.set_colocated(true); |
7047 | | |
7048 | 6 | YBSchemaBuilder schemaBuilder; |
7049 | 6 | schemaBuilder.AddColumn("parent_column")->Type(BINARY)->PrimaryKey()->NotNull(); |
7050 | 6 | YBSchema ybschema; |
7051 | 6 | CHECK_OK(schemaBuilder.Build(&ybschema)); |
7052 | 6 | auto schema = yb::client::internal::GetSchema(ybschema); |
7053 | 6 | SchemaToPB(schema, req.mutable_schema()); |
7054 | 6 | req.mutable_schema()->mutable_table_properties()->set_is_transactional(true); |
7055 | | |
7056 | | // create a parent table, which will create the tablet. |
7057 | 6 | Status s = CreateTable(&req, &resp, rpc); |
7058 | | // We do not lock here so it is technically possible that the table was already created. |
7059 | | // If so, there is nothing to do so we just ignore the "AlreadyPresent" error. |
7060 | 6 | if (!s.ok() && !s.IsAlreadyPresent()) { |
7061 | 0 | LOG(WARNING) << "Keyspace creation failed:" << s.ToString(); |
7062 | | // TODO: We should verify this behavior works end-to-end. |
7063 | | // Diverging in-memory state from disk so the user can issue a delete if no new leader. |
7064 | 0 | auto l = ns->LockForWrite(); |
7065 | 0 | SysNamespaceEntryPB& metadata = ns->mutable_metadata()->mutable_dirty()->pb; |
7066 | 0 | metadata.set_state(SysNamespaceEntryPB::FAILED); |
7067 | 0 | l.Commit(); |
7068 | 0 | return s; |
7069 | 0 | } |
7070 | 2.07k | } |
7071 | | |
7072 | 2.07k | if ((db_type == YQL_DATABASE_PGSQL && !pgsql_tables.empty()) || |
7073 | 2.04k | PREDICT_FALSE(GetAtomicFlag(&FLAGS_TEST_hang_on_namespace_transition))) { |
7074 | | // Process the subsequent work in the background thread (normally PGSQL). |
7075 | 24 | LOG(INFO) << "Keyspace create enqueued for later processing: " << ns->ToString(); |
7076 | 24 | RETURN_NOT_OK(background_tasks_thread_pool_->SubmitFunc( |
7077 | 24 | std::bind(&CatalogManager::ProcessPendingNamespace, this, ns->id(), pgsql_tables, txn))); |
7078 | 24 | return Status::OK(); |
7079 | 2.04k | } else { |
7080 | | // All work is done, it's now safe to online the namespace (normally YQL). |
7081 | 2.04k | auto l = ns->LockForWrite(); |
7082 | 2.04k | SysNamespaceEntryPB& metadata = ns->mutable_metadata()->mutable_dirty()->pb; |
7083 | 2.04k | if (metadata.state() == SysNamespaceEntryPB::PREPARING) { |
7084 | 2.04k | metadata.set_state(SysNamespaceEntryPB::RUNNING); |
7085 | 2.04k | return_status = sys_catalog_->Upsert(leader_ready_term(), ns); |
7086 | 2.04k | if (!return_status.ok()) { |
7087 | | // Diverging in-memory state from disk so the user can issue a delete if no new leader. |
7088 | 2 | LOG(WARNING) << "Keyspace creation failed:" << return_status.ToString(); |
7089 | 2 | metadata.set_state(SysNamespaceEntryPB::FAILED); |
7090 | 2 | return_status = CheckIfNoLongerLeaderAndSetupError(return_status, resp); |
7091 | 2.04k | } else { |
7092 | 2.04k | TRACE("Activated keyspace in sys-catalog"); |
7093 | 2.04k | LOG(INFO) << "Activated keyspace: " << ns->ToString(); |
7094 | 2.04k | } |
7095 | | // Commit the namespace in-memory state. |
7096 | 2.04k | l.Commit(); |
7097 | 0 | } else { |
7098 | 0 | LOG(WARNING) << "Keyspace has invalid state (" << metadata.state() << "), aborting create"; |
7099 | 0 | } |
7100 | 2.04k | } |
7101 | 2.04k | return return_status; |
7102 | 2.07k | } |
7103 | | |
7104 | | void CatalogManager::ProcessPendingNamespace( |
7105 | | NamespaceId id, |
7106 | | std::vector<scoped_refptr<TableInfo>> template_tables, |
7107 | 25 | TransactionMetadata txn) { |
7108 | 25 | LOG(INFO) << "ProcessPendingNamespace started for " << id; |
7109 | | |
7110 | | // Ensure that we are currently the Leader before handling DDL operations. |
7111 | 25 | { |
7112 | 25 | SCOPED_LEADER_SHARED_LOCK(l, this); |
7113 | 25 | if (!l.catalog_status().ok()) { |
7114 | 0 | LOG(WARNING) << "Catalog status failure: " << l.catalog_status().ToString(); |
7115 | | // Don't try again, we have to reset in-memory state after losing leader election. |
7116 | 0 | return; |
7117 | 0 | } |
7118 | 25 | if (!l.leader_status().ok()) { |
7119 | 0 | LOG(WARNING) << "Leader status failure: " << l.leader_status().ToString(); |
7120 | | // Don't try again, we have to reset in-memory state after losing leader election. |
7121 | 0 | return; |
7122 | 0 | } |
7123 | 25 | } |
7124 | | |
7125 | 25 | if (PREDICT_FALSE(GetAtomicFlag(&FLAGS_TEST_hang_on_namespace_transition))) { |
7126 | 2 | LOG(INFO) << "Artificially waiting (" << FLAGS_catalog_manager_bg_task_wait_ms |
7127 | 2 | << "ms) on namespace creation for " << id; |
7128 | 2 | SleepFor(MonoDelta::FromMilliseconds(FLAGS_catalog_manager_bg_task_wait_ms)); |
7129 | 2 | WARN_NOT_OK(background_tasks_thread_pool_->SubmitFunc( |
7130 | 2 | std::bind(&CatalogManager::ProcessPendingNamespace, this, id, template_tables, txn)), |
7131 | 2 | "Could not submit ProcessPendingNamespaces to thread pool"); |
7132 | 2 | return; |
7133 | 2 | } |
7134 | | |
7135 | 23 | scoped_refptr<NamespaceInfo> ns; |
7136 | 23 | { |
7137 | 23 | LockGuard lock(mutex_); |
7138 | 23 | ns = FindPtrOrNull(namespace_ids_map_, id);; |
7139 | 23 | } |
7140 | 23 | if (ns == nullptr) { |
7141 | 0 | LOG(WARNING) << "Pending Namespace not found to finish creation: " << id; |
7142 | 0 | return; |
7143 | 0 | } |
7144 | | |
7145 | | // Copy the system tables necessary to create this namespace. This can be time-intensive. |
7146 | 23 | bool success = true; |
7147 | 23 | if (!template_tables.empty()) { |
7148 | 22 | auto s = CopyPgsqlSysTables(ns->id(), template_tables); |
7149 | 22 | WARN_NOT_OK(s, "Error Copying PGSQL System Tables for Pending Namespace"); |
7150 | 22 | success = s.ok(); |
7151 | 22 | } |
7152 | | |
7153 | | // All work is done, change the namespace state regardless of success or failure. |
7154 | 23 | { |
7155 | 23 | auto l = ns->LockForWrite(); |
7156 | 23 | SysNamespaceEntryPB& metadata = ns->mutable_metadata()->mutable_dirty()->pb; |
7157 | 23 | if (metadata.state() == SysNamespaceEntryPB::PREPARING) { |
7158 | 22 | metadata.set_state(success ? SysNamespaceEntryPB::RUNNING : SysNamespaceEntryPB::FAILED); |
7159 | 23 | auto s = sys_catalog_->Upsert(leader_ready_term(), ns); |
7160 | 23 | if (s.ok()) { |
7161 | 22 | TRACE("Done processing keyspace"); |
7162 | 22 | LOG(INFO) << (success ? "Processed" : "Failed") << " keyspace: " << ns->ToString(); |
7163 | | |
7164 | | // Verify Transaction gets committed, which occurs after namespace create finishes. |
7165 | 22 | if (success && metadata.has_transaction()) { |
7166 | 21 | LOG(INFO) << "Enqueuing keyspace for Transaction Verification: " << ns->ToString(); |
7167 | 21 | std::function<Status(bool)> when_done = |
7168 | 21 | std::bind(&CatalogManager::VerifyNamespacePgLayer, this, ns, _1); |
7169 | 21 | WARN_NOT_OK(background_tasks_thread_pool_->SubmitFunc( |
7170 | 21 | std::bind(&YsqlTransactionDdl::VerifyTransaction, ysql_transaction_.get(), |
7171 | 21 | txn, when_done)), |
7172 | 21 | "Could not submit VerifyTransaction to thread pool"); |
7173 | 21 | } |
7174 | 1 | } else { |
7175 | 1 | metadata.set_state(SysNamespaceEntryPB::FAILED); |
7176 | 1 | if (s.IsIllegalState() || s.IsAborted()) { |
7177 | 0 | s = STATUS(ServiceUnavailable, |
7178 | 0 | "operation requested can only be executed on a leader master, but this" |
7179 | 0 | " master is no longer the leader", s.ToString()); |
7180 | 1 | } else { |
7181 | 1 | s = s.CloneAndPrepend(Substitute( |
7182 | 1 | "An error occurred while modifying keyspace to $0 in sys-catalog: $1", |
7183 | 1 | metadata.state(), s.ToString())); |
7184 | 1 | } |
7185 | 1 | LOG(WARNING) << s.ToString(); |
7186 | 1 | } |
7187 | | // Commit the namespace in-memory state. |
7188 | 23 | l.Commit(); |
7189 | 0 | } else { |
7190 | 0 | LOG(WARNING) << "Bad keyspace state (" << metadata.state() |
7191 | 0 | << "), abandoning creation work for " << ns->ToString(); |
7192 | 0 | } |
7193 | 23 | } |
7194 | 23 | } |
7195 | | |
7196 | | Status CatalogManager::VerifyNamespacePgLayer( |
7197 | 21 | scoped_refptr<NamespaceInfo> ns, bool rpc_success) { |
7198 | | // Upon Transaction completion, check pg system table using OID to ensure SUCCESS. |
7199 | 21 | const auto pg_table_id = GetPgsqlTableId(atoi(kSystemNamespaceId), kPgDatabaseTableOid); |
7200 | 21 | auto entry_exists = VERIFY_RESULT( |
7201 | 21 | ysql_transaction_->PgEntryExists(pg_table_id, GetPgsqlDatabaseOid(ns->id()))); |
7202 | 21 | auto l = ns->LockForWrite(); |
7203 | 21 | SysNamespaceEntryPB& metadata = ns->mutable_metadata()->mutable_dirty()->pb; |
7204 | | |
7205 | | // #5981: Mark un-retryable rpc failures as pass to avoid infinite retry of GC'd txns. |
7206 | 21 | bool txn_check_passed = entry_exists || !rpc_success; |
7207 | | |
7208 | 21 | if (txn_check_passed) { |
7209 | | // Passed checks. Remove the transaction from the entry since we're done processing it. |
7210 | 21 | SCHECK_EQ(metadata.state(), SysNamespaceEntryPB::RUNNING, Aborted, |
7211 | 21 | Substitute("Invalid Namespace state ($0), abandoning transaction GC work for $1", |
7212 | 21 | SysNamespaceEntryPB_State_Name(metadata.state()), ns->ToString())); |
7213 | 20 | metadata.clear_transaction(); |
7214 | 20 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), ns)); |
7215 | 20 | if (entry_exists) { |
7216 | 20 | LOG(INFO) << "Namespace transaction succeeded: " << ns->ToString(); |
7217 | 0 | } else { |
7218 | 0 | LOG(WARNING) << "Unknown RPC Failure, removing transaction on namespace: " << ns->ToString(); |
7219 | 0 | } |
7220 | | // Commit the namespace in-memory state. |
7221 | 20 | l.Commit(); |
7222 | 0 | } else { |
7223 | | // Transaction failed. We need to delete this Database now. |
7224 | 0 | SCHECK(metadata.state() == SysNamespaceEntryPB::RUNNING || |
7225 | 0 | metadata.state() == SysNamespaceEntryPB::FAILED, Aborted, |
7226 | 0 | Substitute("Invalid Namespace state ($0), aborting delete.", |
7227 | 0 | SysNamespaceEntryPB_State_Name(metadata.state()), ns->ToString())); |
7228 | 0 | LOG(INFO) << "Namespace transaction failed, deleting: " << ns->ToString(); |
7229 | 0 | metadata.set_state(SysNamespaceEntryPB::DELETING); |
7230 | 0 | metadata.clear_transaction(); |
7231 | 0 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), ns)); |
7232 | | // Commit the namespace in-memory state. |
7233 | 0 | l.Commit(); |
7234 | | // Async enqueue delete. |
7235 | 0 | RETURN_NOT_OK(background_tasks_thread_pool_->SubmitFunc( |
7236 | 0 | std::bind(&CatalogManager::DeleteYsqlDatabaseAsync, this, ns))); |
7237 | 0 | } |
7238 | 20 | return Status::OK(); |
7239 | 21 | } |
7240 | | |
7241 | | // Get the information about an in-progress create operation. |
7242 | | Status CatalogManager::IsCreateNamespaceDone(const IsCreateNamespaceDoneRequestPB* req, |
7243 | 2.13k | IsCreateNamespaceDoneResponsePB* resp) { |
7244 | 2.13k | auto ns_pb = req->namespace_(); |
7245 | | |
7246 | | // 1. Lookup the namespace and verify it exists. |
7247 | 2.13k | TRACE("Looking up keyspace"); |
7248 | 2.13k | auto ns = VERIFY_NAMESPACE_FOUND(FindNamespace(ns_pb), resp); |
7249 | | |
7250 | 2.13k | TRACE("Locking keyspace"); |
7251 | 2.13k | auto l = ns->LockForRead(); |
7252 | 2.13k | auto metadata = l->pb; |
7253 | | |
7254 | 2.13k | switch (metadata.state()) { |
7255 | | // Success cases. Done and working. |
7256 | 1.88k | case SysNamespaceEntryPB::RUNNING: |
7257 | 1.88k | if (!ns->colocated()) { |
7258 | 1.86k | resp->set_done(true); |
7259 | 11 | } else { |
7260 | | // Verify system table created as well, if colocated. |
7261 | 11 | IsCreateTableDoneRequestPB table_req; |
7262 | 11 | IsCreateTableDoneResponsePB table_resp; |
7263 | 11 | const auto parent_table_id = ns->id() + kColocatedParentTableIdSuffix; |
7264 | 11 | table_req.mutable_table()->set_table_id(parent_table_id); |
7265 | 11 | auto s = IsCreateTableDone(&table_req, &table_resp); |
7266 | 11 | resp->set_done(table_resp.done()); |
7267 | 11 | if (!s.ok()) { |
7268 | 0 | if (table_resp.has_error()) { |
7269 | 0 | resp->mutable_error()->Swap(table_resp.mutable_error()); |
7270 | 0 | } |
7271 | 0 | return s; |
7272 | 0 | } |
7273 | 1.88k | } |
7274 | 1.88k | break; |
7275 | | // These states indicate that a create completed but a subsequent remove was requested. |
7276 | 0 | case SysNamespaceEntryPB::DELETING: |
7277 | 0 | case SysNamespaceEntryPB::DELETED: |
7278 | 0 | resp->set_done(true); |
7279 | 0 | break; |
7280 | | // Pending cases. NOT DONE |
7281 | 258 | case SysNamespaceEntryPB::PREPARING: |
7282 | 258 | resp->set_done(false); |
7283 | 258 | break; |
7284 | | // Failure cases. Done, but we need to give the user an error message. |
7285 | 1 | case SysNamespaceEntryPB::FAILED: |
7286 | 1 | resp->set_done(true); |
7287 | 1 | return SetupError(resp->mutable_error(), MasterErrorPB::UNKNOWN_ERROR, STATUS(InternalError, |
7288 | 1 | "Namespace Create Failed: not onlined.")); |
7289 | 0 | default: |
7290 | 0 | Status s = STATUS_SUBSTITUTE(IllegalState, "IsCreateNamespaceDone failure: state=$0", |
7291 | 0 | SysNamespaceEntryPB_State_Name(metadata.state())); |
7292 | 0 | LOG(WARNING) << s.ToString(); |
7293 | 0 | resp->set_done(true); |
7294 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::UNKNOWN_ERROR, s); |
7295 | 2.13k | } |
7296 | | |
7297 | 2.13k | return Status::OK(); |
7298 | 2.13k | } |
7299 | | |
7300 | | Status CatalogManager::DeleteNamespace(const DeleteNamespaceRequestPB* req, |
7301 | | DeleteNamespaceResponsePB* resp, |
7302 | 1.55k | rpc::RpcContext* rpc) { |
7303 | 1.55k | auto status = DoDeleteNamespace(req, resp, rpc); |
7304 | 1.55k | if (!status.ok()) { |
7305 | 10 | return SetupError(resp->mutable_error(), status); |
7306 | 10 | } |
7307 | 1.54k | return status; |
7308 | 1.54k | } |
7309 | | |
7310 | | Status CatalogManager::DoDeleteNamespace(const DeleteNamespaceRequestPB* req, |
7311 | | DeleteNamespaceResponsePB* resp, |
7312 | 1.55k | rpc::RpcContext* rpc) { |
7313 | 1.55k | LOG(INFO) << "Servicing DeleteNamespace request from " << RequestorString(rpc) |
7314 | 1.55k | << ": " << req->ShortDebugString(); |
7315 | | |
7316 | | // Lookup the namespace and verify if it exists. |
7317 | 1.55k | TRACE("Looking up keyspace"); |
7318 | 1.55k | auto ns = VERIFY_RESULT(FindNamespace(req->namespace_())); |
7319 | | |
7320 | 1.55k | if (req->has_database_type() && req->database_type() != ns->database_type()) { |
7321 | | // Could not find the right database to delete. |
7322 | 0 | return STATUS(NotFound, "Keyspace not found", ns->name(), |
7323 | 0 | MasterError(MasterErrorPB::NAMESPACE_NOT_FOUND)); |
7324 | 0 | } |
7325 | 1.55k | { |
7326 | | // Don't allow deletion if the namespace is in a transient state. |
7327 | 1.55k | auto cur_state = ns->state(); |
7328 | 1.55k | if (cur_state != SysNamespaceEntryPB::RUNNING && cur_state != SysNamespaceEntryPB::FAILED) { |
7329 | 2 | if (cur_state == SysNamespaceEntryPB::DELETED) { |
7330 | 1 | return STATUS(NotFound, "Keyspace already deleted", ns->name(), |
7331 | 1 | MasterError(MasterErrorPB::NAMESPACE_NOT_FOUND)); |
7332 | 1 | } else { |
7333 | 1 | return STATUS_EC_FORMAT( |
7334 | 1 | TryAgain, MasterError(MasterErrorPB::IN_TRANSITION_CAN_RETRY), |
7335 | 1 | "Namespace deletion not allowed when State = $0", |
7336 | 1 | SysNamespaceEntryPB::State_Name(cur_state)); |
7337 | 1 | } |
7338 | 1.55k | } |
7339 | 1.55k | } |
7340 | | |
7341 | | // PGSQL has a completely forked implementation because it allows non-empty namespaces on delete. |
7342 | 1.55k | if (ns->database_type() == YQL_DATABASE_PGSQL) { |
7343 | 52 | return DeleteYsqlDatabase(req, resp, rpc); |
7344 | 52 | } |
7345 | | |
7346 | 1.50k | TRACE("Locking keyspace"); |
7347 | 1.50k | auto l = ns->LockForWrite(); |
7348 | | |
7349 | | // Only empty namespace can be deleted. |
7350 | 1.50k | TRACE("Looking for tables in the keyspace"); |
7351 | 1.50k | { |
7352 | 1.50k | SharedLock lock(mutex_); |
7353 | 0 | VLOG_WITH_FUNC(3) << "Acquired the catalog manager lock"; |
7354 | | |
7355 | 29.5k | for (const TableInfoMap::value_type& entry : *table_ids_map_) { |
7356 | 29.5k | auto ltm = entry.second->LockForRead(); |
7357 | | |
7358 | 29.5k | if (!ltm->started_deleting() && ltm->namespace_id() == ns->id()) { |
7359 | 2 | return STATUS_EC_FORMAT( |
7360 | 2 | InvalidArgument, MasterError(MasterErrorPB::NAMESPACE_IS_NOT_EMPTY), |
7361 | 2 | "Cannot delete keyspace which has $0: $1 [id=$2], request: $3", |
7362 | 2 | IsTable(ltm->pb) ? "table" : "index", ltm->name(), entry.second->id(), |
7363 | 2 | req->ShortDebugString()); |
7364 | 2 | } |
7365 | 29.5k | } |
7366 | | |
7367 | | // Only empty namespace can be deleted. |
7368 | 1.50k | TRACE("Looking for types in the keyspace"); |
7369 | | |
7370 | 0 | for (const UDTypeInfoMap::value_type& entry : udtype_ids_map_) { |
7371 | 0 | auto ltm = entry.second->LockForRead(); |
7372 | |
|
7373 | 0 | if (ltm->namespace_id() == ns->id()) { |
7374 | 0 | return STATUS_EC_FORMAT( |
7375 | 0 | InvalidArgument, MasterError(MasterErrorPB::NAMESPACE_IS_NOT_EMPTY), |
7376 | 0 | "Cannot delete keyspace which has type: $0 [id=$1], request: $2", |
7377 | 0 | ltm->name(), entry.second->id(), req->ShortDebugString()); |
7378 | 0 | } |
7379 | 0 | } |
7380 | 1.50k | } |
7381 | | |
7382 | | // Disallow deleting namespaces with snapshot schedules. |
7383 | 1.50k | auto map = VERIFY_RESULT(MakeSnapshotSchedulesToObjectIdsMap(SysRowEntryType::NAMESPACE)); |
7384 | 0 | for (const auto& schedule_and_objects : map) { |
7385 | 0 | for (const auto& id : schedule_and_objects.second) { |
7386 | 0 | if (id == ns->id()) { |
7387 | 0 | return STATUS_EC_FORMAT( |
7388 | 0 | InvalidArgument, MasterError(MasterErrorPB::NAMESPACE_IS_NOT_EMPTY), |
7389 | 0 | "Cannot delete keyspace which has schedule: $0, request: $1", |
7390 | 0 | schedule_and_objects.first, req->ShortDebugString()); |
7391 | 0 | } |
7392 | 0 | } |
7393 | 0 | } |
7394 | | |
7395 | | // [Delete]. Skip the DELETING->DELETED state, since no tables are present in this namespace. |
7396 | 1.50k | TRACE("Updating metadata on disk"); |
7397 | | // Update sys-catalog. |
7398 | 1.50k | Status s = sys_catalog_->Delete(leader_ready_term(), ns); |
7399 | 1.50k | if (!s.ok()) { |
7400 | | // The mutation will be aborted when 'l' exits the scope on early return. |
7401 | 0 | s = s.CloneAndPrepend("An error occurred while updating sys-catalog"); |
7402 | 0 | LOG(WARNING) << s; |
7403 | 0 | return CheckIfNoLongerLeader(s); |
7404 | 0 | } |
7405 | | |
7406 | | // Update the in-memory state. |
7407 | 1.50k | TRACE("Committing in-memory state"); |
7408 | 1.50k | l.Commit(); |
7409 | | |
7410 | | // Remove the namespace from all CatalogManager mappings. |
7411 | 1.50k | { |
7412 | 1.50k | LockGuard lock(mutex_); |
7413 | 1.50k | if (namespace_names_mapper_[ns->database_type()].erase(ns->name()) < 1) { |
7414 | 0 | LOG(WARNING) << Format("Could not remove namespace from names map, id=$1", ns->id()); |
7415 | 0 | } |
7416 | 1.50k | if (namespace_ids_map_.erase(ns->id()) < 1) { |
7417 | 0 | LOG(WARNING) << Format("Could not remove namespace from ids map, id=$1", ns->id()); |
7418 | 0 | } |
7419 | 1.50k | } |
7420 | | |
7421 | | // Delete any permissions granted on this keyspace to any role. See comment in DeleteTable() for |
7422 | | // more details. |
7423 | 1.50k | string canonical_resource = get_canonical_keyspace(req->namespace_().name()); |
7424 | 1.50k | RETURN_NOT_OK(permissions_manager_->RemoveAllPermissionsForResource(canonical_resource, resp)); |
7425 | | |
7426 | 1.50k | LOG(INFO) << "Successfully deleted keyspace " << ns->ToString() |
7427 | 1.50k | << " per request from " << RequestorString(rpc); |
7428 | 1.50k | return Status::OK(); |
7429 | 1.50k | } |
7430 | | |
7431 | 0 | void CatalogManager::DeleteYcqlDatabaseAsync(scoped_refptr<NamespaceInfo> database) { |
7432 | 0 | TRACE("Locking keyspace"); |
7433 | 0 | auto l = database->LockForWrite(); |
7434 | | |
7435 | | // Only empty namespace can be deleted. |
7436 | 0 | TRACE("Looking for tables in the keyspace"); |
7437 | 0 | { |
7438 | 0 | SharedLock lock(mutex_); |
7439 | 0 | VLOG_WITH_FUNC(3) << "Acquired the catalog manager lock"; |
7440 | |
|
7441 | 0 | for (const TableInfoMap::value_type& entry : *table_ids_map_) { |
7442 | 0 | auto ltm = entry.second->LockForRead(); |
7443 | |
|
7444 | 0 | if (!ltm->started_deleting() && ltm->namespace_id() == database->id()) { |
7445 | 0 | LOG(WARNING) << "Cannot delete keyspace which has " << ltm->name() |
7446 | 0 | << " with id=" << entry.second->id(); |
7447 | 0 | return; |
7448 | 0 | } |
7449 | 0 | } |
7450 | 0 | } |
7451 | | |
7452 | | // Only empty namespace can be deleted. |
7453 | 0 | TRACE("Looking for types in the keyspace"); |
7454 | 0 | { |
7455 | 0 | SharedLock lock(mutex_); |
7456 | 0 | VLOG_WITH_FUNC(3) << "Acquired the catalog manager lock"; |
7457 | |
|
7458 | 0 | for (const UDTypeInfoMap::value_type& entry : udtype_ids_map_) { |
7459 | 0 | auto ltm = entry.second->LockForRead(); |
7460 | |
|
7461 | 0 | if (ltm->namespace_id() == database->id()) { |
7462 | 0 | LOG(WARNING) << "Cannot delete keyspace which has type: " << ltm->name() |
7463 | 0 | << " with id=" << entry.second->id(); |
7464 | 0 | return; |
7465 | 0 | } |
7466 | 0 | } |
7467 | 0 | } |
7468 | | |
7469 | | // [Delete]. Skip the DELETING->DELETED state, since no tables are present in this namespace. |
7470 | 0 | TRACE("Updating metadata on disk"); |
7471 | | // Update sys-catalog. |
7472 | 0 | Status s = sys_catalog_->Delete(leader_ready_term(), database); |
7473 | 0 | if (!s.ok()) { |
7474 | | // The mutation will be aborted when 'l' exits the scope on early return. |
7475 | 0 | s = s.CloneAndPrepend(Substitute("An error occurred while updating sys-catalog: $0", |
7476 | 0 | s.ToString())); |
7477 | 0 | LOG(WARNING) << s.ToString(); |
7478 | 0 | return; |
7479 | 0 | } |
7480 | | |
7481 | | // Update the in-memory state. |
7482 | 0 | TRACE("Committing in-memory state"); |
7483 | 0 | l.Commit(); |
7484 | | |
7485 | | // Remove the namespace from all CatalogManager mappings. |
7486 | 0 | { |
7487 | 0 | LockGuard lock(mutex_); |
7488 | 0 | namespace_names_mapper_[database->database_type()].erase(database->name()); |
7489 | 0 | if (namespace_ids_map_.erase(database->id()) < 1) { |
7490 | 0 | LOG(WARNING) << Format("Could not remove namespace from maps, id=$1", database->id()); |
7491 | 0 | } |
7492 | 0 | } |
7493 | | |
7494 | | // Delete any permissions granted on this keyspace to any role. See comment in DeleteTable() for |
7495 | | // more details. |
7496 | 0 | string canonical_resource = get_canonical_keyspace(database->name()); |
7497 | 0 | DeleteNamespaceResponsePB resp; |
7498 | 0 | s = permissions_manager_->RemoveAllPermissionsForResource(canonical_resource, &resp); |
7499 | 0 | if (s.ok()) { |
7500 | 0 | LOG(INFO) << "Successfully deleted keyspace " << database->ToString(); |
7501 | 0 | } else { |
7502 | 0 | LOG(WARNING) << "Error deleting keyspace " << database->ToString() << ": " << s; |
7503 | 0 | } |
7504 | 0 | } |
7505 | | |
7506 | | Status CatalogManager::DeleteYsqlDatabase(const DeleteNamespaceRequestPB* req, |
7507 | | DeleteNamespaceResponsePB* resp, |
7508 | 52 | rpc::RpcContext* rpc) { |
7509 | | // Lookup database. |
7510 | 52 | auto database = VERIFY_NAMESPACE_FOUND(FindNamespace(req->namespace_()), resp); |
7511 | | |
7512 | | // Make sure this is a YSQL database. |
7513 | 52 | if (database->database_type() != YQL_DATABASE_PGSQL) { |
7514 | | // A non-YSQL namespace is found, but the rpc requests to drop a YSQL database. |
7515 | 0 | Status s = STATUS(NotFound, "YSQL database not found", database->name()); |
7516 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s); |
7517 | 0 | } |
7518 | | |
7519 | | // Set the Namespace to DELETING. |
7520 | 52 | TRACE("Locking database"); |
7521 | 52 | auto l = database->LockForWrite(); |
7522 | 52 | SysNamespaceEntryPB &metadata = database->mutable_metadata()->mutable_dirty()->pb; |
7523 | 52 | if (metadata.state() == SysNamespaceEntryPB::RUNNING || |
7524 | 52 | metadata.state() == SysNamespaceEntryPB::FAILED) { |
7525 | 52 | metadata.set_state(SysNamespaceEntryPB::DELETING); |
7526 | 52 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), database)); |
7527 | 47 | TRACE("Marked keyspace for deletion in sys-catalog"); |
7528 | | // Commit the namespace in-memory state. |
7529 | 47 | l.Commit(); |
7530 | 0 | } else { |
7531 | 0 | Status s = STATUS_SUBSTITUTE(IllegalState, |
7532 | 0 | "Keyspace ($0) has invalid state ($1), aborting delete", |
7533 | 0 | database->name(), metadata.state()); |
7534 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INTERNAL_ERROR, s); |
7535 | 0 | } |
7536 | | |
7537 | 47 | return background_tasks_thread_pool_->SubmitFunc( |
7538 | 47 | std::bind(&CatalogManager::DeleteYsqlDatabaseAsync, this, database)); |
7539 | 47 | } |
7540 | | |
7541 | 49 | void CatalogManager::DeleteYsqlDatabaseAsync(scoped_refptr<NamespaceInfo> database) { |
7542 | 49 | TEST_PAUSE_IF_FLAG(TEST_hang_on_namespace_transition); |
7543 | | |
7544 | | // Lock database before removing content. |
7545 | 49 | TRACE("Locking database"); |
7546 | 49 | auto l = database->LockForWrite(); |
7547 | 49 | SysNamespaceEntryPB &metadata = database->mutable_metadata()->mutable_dirty()->pb; |
7548 | | |
7549 | | // A DELETED Namespace has finished but was tombstoned to avoid immediately reusing the same ID. |
7550 | | // We consider a restart enough time, so we just need to remove it from the SysCatalog. |
7551 | 49 | if (metadata.state() == SysNamespaceEntryPB::DELETED) { |
7552 | 0 | Status s = sys_catalog_->Delete(leader_ready_term(), database); |
7553 | 0 | WARN_NOT_OK(s, "SysCatalog DeleteItem for Namespace"); |
7554 | 0 | if (!s.ok()) { |
7555 | 0 | return; |
7556 | 0 | } |
7557 | 49 | } else if (metadata.state() == SysNamespaceEntryPB::DELETING) { |
7558 | | // Delete all tables in the database. |
7559 | 48 | TRACE("Delete all tables in YSQL database"); |
7560 | 48 | Status s = DeleteYsqlDBTables(database); |
7561 | 48 | WARN_NOT_OK(s, "DeleteYsqlDBTables failed"); |
7562 | 48 | if (!s.ok()) { |
7563 | | // Move to FAILED so DeleteNamespace can be reissued by the user. |
7564 | 5 | metadata.set_state(SysNamespaceEntryPB::FAILED); |
7565 | 5 | l.Commit(); |
7566 | 5 | return; |
7567 | 5 | } |
7568 | | |
7569 | | // Once all user-facing data has been offlined, move the Namespace to DELETED state. |
7570 | 43 | metadata.set_state(SysNamespaceEntryPB::DELETED); |
7571 | 43 | s = sys_catalog_->Upsert(leader_ready_term(), database); |
7572 | 43 | WARN_NOT_OK(s, "SysCatalog Update for Namespace"); |
7573 | 43 | if (!s.ok()) { |
7574 | | // Move to FAILED so DeleteNamespace can be reissued by the user. |
7575 | 0 | metadata.set_state(SysNamespaceEntryPB::FAILED); |
7576 | 0 | l.Commit(); |
7577 | 0 | return; |
7578 | 0 | } |
7579 | 43 | TRACE("Marked keyspace as deleted in sys-catalog"); |
7580 | 1 | } else { |
7581 | 1 | LOG(WARNING) << "Keyspace (" << database->name() << ") has invalid state (" |
7582 | 1 | << metadata.state() << "), aborting delete"; |
7583 | 1 | return; |
7584 | 1 | } |
7585 | | |
7586 | | // Remove namespace from CatalogManager name mapping. Will remove ID map after all Tables gone. |
7587 | 43 | { |
7588 | 43 | LockGuard lock(mutex_); |
7589 | 43 | if (namespace_names_mapper_[database->database_type()].erase(database->name()) < 1) { |
7590 | 0 | LOG(WARNING) << Format("Could not remove namespace from maps, name=$0, id=$1", |
7591 | 0 | database->name(), database->id()); |
7592 | 0 | } |
7593 | 43 | } |
7594 | | |
7595 | | // Update the in-memory state. |
7596 | 43 | TRACE("Committing in-memory state"); |
7597 | 43 | l.Commit(); |
7598 | | |
7599 | | // DROP completed. Return status. |
7600 | 43 | LOG(INFO) << "Successfully deleted YSQL database " << database->ToString(); |
7601 | 43 | } |
7602 | | |
7603 | | // IMPORTANT: If modifying, consider updating DeleteTable(), the singular deletion API. |
7604 | 48 | Status CatalogManager::DeleteYsqlDBTables(const scoped_refptr<NamespaceInfo>& database) { |
7605 | 48 | TabletInfoPtr sys_tablet_info; |
7606 | 48 | vector<pair<scoped_refptr<TableInfo>, TableInfo::WriteLock>> tables; |
7607 | 48 | std::unordered_set<TableId> sys_table_ids; |
7608 | 48 | { |
7609 | | // Lock the catalog to iterate over table_ids_map_. |
7610 | 48 | SharedLock lock(mutex_); |
7611 | | |
7612 | 48 | sys_tablet_info = tablet_map_->find(kSysCatalogTabletId)->second; |
7613 | | |
7614 | | // Populate tables and sys_table_ids. |
7615 | 15.4k | for (const TableInfoMap::value_type& entry : *table_ids_map_) { |
7616 | 15.4k | scoped_refptr<TableInfo> table = entry.second; |
7617 | 15.4k | if (table->namespace_id() != database->id()) { |
7618 | 13.1k | continue; |
7619 | 13.1k | } |
7620 | 2.31k | auto l = table->LockForWrite(); |
7621 | 2.31k | if (l->started_deleting()) { |
7622 | 11 | continue; |
7623 | 11 | } |
7624 | 2.30k | RSTATUS_DCHECK( |
7625 | 2.30k | !l->pb.is_pg_shared_table(), Corruption, "Shared table found in database"); |
7626 | | |
7627 | 2.30k | if (IsSystemTable(*table)) { |
7628 | 2.26k | sys_table_ids.insert(table->id()); |
7629 | 2.26k | } |
7630 | | |
7631 | | // For regular (indexed) table, insert table info and lock in the front of the list. Else for |
7632 | | // index table, append them to the end. We do so so that we will commit and delete the indexed |
7633 | | // table first before its indexes. |
7634 | 2.30k | if (IsTable(l->pb)) { |
7635 | 1.28k | tables.insert(tables.begin(), {table, std::move(l)}); |
7636 | 1.01k | } else { |
7637 | 1.01k | tables.push_back({table, std::move(l)}); |
7638 | 1.01k | } |
7639 | 2.30k | } |
7640 | 48 | } |
7641 | | // Remove the system tables from RAFT. |
7642 | 48 | TRACE("Sending system table delete RPCs"); |
7643 | 2.26k | for (auto &table_id : sys_table_ids) { |
7644 | 2.26k | RETURN_NOT_OK(sys_catalog_->DeleteYsqlSystemTable(table_id)); |
7645 | 2.26k | } |
7646 | | // Remove the system tables from the system catalog TabletInfo. |
7647 | 48 | RETURN_NOT_OK(RemoveTableIdsFromTabletInfo(sys_tablet_info, sys_table_ids)); |
7648 | | |
7649 | | // Set all table states to DELETING as one batch RPC call. |
7650 | 44 | TRACE("Sending delete table batch RPC to sys catalog"); |
7651 | 44 | vector<TableInfo *> tables_rpc; |
7652 | 44 | tables_rpc.reserve(tables.size()); |
7653 | 2.30k | for (auto &table_and_lock : tables) { |
7654 | 2.30k | tables_rpc.push_back(table_and_lock.first.get()); |
7655 | 2.30k | auto &l = table_and_lock.second; |
7656 | | // Mark the table state as DELETING tablets. |
7657 | 2.30k | l.mutable_data()->set_state(SysTablesEntryPB::DELETING, |
7658 | 2.30k | Substitute("Started deleting at $0", LocalTimeAsString())); |
7659 | 2.30k | } |
7660 | | // Update all the table states in raft in bulk. |
7661 | 44 | Status s = sys_catalog_->Upsert(leader_ready_term(), tables_rpc); |
7662 | 44 | if (!s.ok()) { |
7663 | | // The mutation will be aborted when 'l' exits the scope on early return. |
7664 | 1 | s = s.CloneAndPrepend(Substitute("An error occurred while updating sys tables: $0", |
7665 | 1 | s.ToString())); |
7666 | 1 | LOG(WARNING) << s.ToString(); |
7667 | 1 | return CheckIfNoLongerLeader(s); |
7668 | 1 | } |
7669 | 2.30k | for (auto &table_and_lock : tables) { |
7670 | 2.30k | auto &table = table_and_lock.first; |
7671 | 2.30k | auto &l = table_and_lock.second; |
7672 | | // Cancel all table busywork and commit the DELETING change. |
7673 | 2.30k | l.Commit(); |
7674 | 2.30k | table->AbortTasks(); |
7675 | 2.30k | } |
7676 | | |
7677 | | // Batch remove all relevant CDC streams, handle after releasing Table locks. |
7678 | 43 | TRACE("Deleting CDC streams on table"); |
7679 | 43 | vector<TableId> id_list; |
7680 | 43 | id_list.reserve(tables.size()); |
7681 | 2.30k | for (auto &table_and_lock : tables) { |
7682 | 2.30k | id_list.push_back(table_and_lock.first->id()); |
7683 | 2.30k | } |
7684 | 43 | RETURN_NOT_OK(DeleteCDCStreamsForTables(id_list)); |
7685 | | |
7686 | | // Send a DeleteTablet() RPC request to each tablet replica in the table. |
7687 | 2.30k | for (auto &table_and_lock : tables) { |
7688 | 2.30k | auto &table = table_and_lock.first; |
7689 | | // TODO(pitr) undelete for YSQL tables |
7690 | 2.30k | RETURN_NOT_OK(DeleteTabletsAndSendRequests(table, {})); |
7691 | 2.30k | } |
7692 | | |
7693 | | // Invoke any background tasks and return (notably, table cleanup). |
7694 | 43 | background_tasks_->Wake(); |
7695 | 43 | return Status::OK(); |
7696 | 43 | } |
7697 | | |
7698 | | // Get the information about an in-progress delete operation. |
7699 | | Status CatalogManager::IsDeleteNamespaceDone(const IsDeleteNamespaceDoneRequestPB* req, |
7700 | 1.59k | IsDeleteNamespaceDoneResponsePB* resp) { |
7701 | 1.59k | auto ns_pb = req->namespace_(); |
7702 | | |
7703 | | // Lookup the namespace and verify it exists. |
7704 | 1.59k | TRACE("Looking up keyspace"); |
7705 | 1.59k | auto ns = FindNamespace(ns_pb); |
7706 | 1.59k | if (!ns.ok()) { |
7707 | | // Namespace no longer exists means success. |
7708 | 1.51k | LOG(INFO) << "Servicing IsDeleteNamespaceDone request for " |
7709 | 1.51k | << ns_pb.DebugString() << ": deleted (not found)"; |
7710 | 1.51k | resp->set_done(true); |
7711 | 1.51k | return Status::OK(); |
7712 | 1.51k | } |
7713 | | |
7714 | 85 | TRACE("Locking keyspace"); |
7715 | 85 | auto l = (**ns).LockForRead(); |
7716 | 85 | auto& metadata = l->pb; |
7717 | | |
7718 | 85 | if (metadata.state() == SysNamespaceEntryPB::DELETED) { |
7719 | 22 | resp->set_done(true); |
7720 | 63 | } else if (metadata.state() == SysNamespaceEntryPB::DELETING) { |
7721 | 58 | resp->set_done(false); |
7722 | 5 | } else { |
7723 | 5 | Status s = STATUS_SUBSTITUTE(IllegalState, |
7724 | 5 | "Servicing IsDeleteNamespaceDone request for $0: NOT deleted (state=$1)", |
7725 | 5 | ns_pb.DebugString(), metadata.state()); |
7726 | 5 | LOG(WARNING) << s.ToString(); |
7727 | | // Done != Successful. We just want to let the user know the delete has finished processing. |
7728 | 5 | resp->set_done(true); |
7729 | 5 | return SetupError(resp->mutable_error(), MasterErrorPB::INTERNAL_ERROR, s); |
7730 | 5 | } |
7731 | 80 | return Status::OK(); |
7732 | 80 | } |
7733 | | |
7734 | | Status CatalogManager::AlterNamespace(const AlterNamespaceRequestPB* req, |
7735 | | AlterNamespaceResponsePB* resp, |
7736 | 4 | rpc::RpcContext* rpc) { |
7737 | 4 | LOG(INFO) << "Servicing AlterNamespace request from " << RequestorString(rpc) |
7738 | 4 | << ": " << req->ShortDebugString(); |
7739 | | |
7740 | 4 | auto database = VERIFY_NAMESPACE_FOUND(FindNamespace(req->namespace_()), resp); |
7741 | | |
7742 | 4 | if (req->namespace_().has_database_type() && |
7743 | 2 | database->database_type() != req->namespace_().database_type()) { |
7744 | 0 | Status s = STATUS(NotFound, "Database not found", database->name()); |
7745 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s); |
7746 | 0 | } |
7747 | | |
7748 | 4 | TRACE("Locking database"); |
7749 | 4 | auto l = database->LockForWrite(); |
7750 | | |
7751 | | // Don't allow an alter if the namespace isn't running. |
7752 | 4 | if (l->pb.state() != SysNamespaceEntryPB::RUNNING) { |
7753 | 1 | Status s = STATUS_SUBSTITUTE(TryAgain, "Namespace not running. State = $0", |
7754 | 1 | SysNamespaceEntryPB::State_Name(l->pb.state())); |
7755 | 1 | return SetupError(resp->mutable_error(), NamespaceMasterError(l->pb.state()), s); |
7756 | 1 | } |
7757 | | |
7758 | 3 | const string old_name = l->pb.name(); |
7759 | | |
7760 | 3 | if (req->has_new_name() && req->new_name() != old_name) { |
7761 | 3 | const string new_name = req->new_name(); |
7762 | | |
7763 | | // Verify that the new name does not exist. |
7764 | 3 | NamespaceIdentifierPB ns_identifier; |
7765 | 3 | ns_identifier.set_name(new_name); |
7766 | 3 | if (req->namespace_().has_database_type()) { |
7767 | 1 | ns_identifier.set_database_type(req->namespace_().database_type()); |
7768 | 1 | } |
7769 | | // TODO: This check will only work for YSQL once we add support for YSQL namespaces in |
7770 | | // namespace_name_map (#1476). |
7771 | 3 | LockGuard lock(mutex_); |
7772 | 3 | TRACE("Acquired catalog manager lock"); |
7773 | 3 | auto ns = FindNamespaceUnlocked(ns_identifier); |
7774 | 3 | if (ns.ok() && req->namespace_().has_database_type() && |
7775 | 0 | (**ns).database_type() == req->namespace_().database_type()) { |
7776 | 0 | Status s = STATUS_SUBSTITUTE(AlreadyPresent, "Keyspace '$0' already exists", (**ns).name()); |
7777 | 0 | LOG(WARNING) << "Found keyspace: " << (**ns).id() << ". Failed altering keyspace with error: " |
7778 | 0 | << s << " Request:\n" << req->DebugString(); |
7779 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s); |
7780 | 0 | } |
7781 | | |
7782 | 3 | namespace_names_mapper_[req->namespace_().database_type()][new_name] = database; |
7783 | 3 | namespace_names_mapper_[req->namespace_().database_type()].erase(old_name); |
7784 | | |
7785 | 3 | l.mutable_data()->pb.set_name(new_name); |
7786 | 3 | } |
7787 | | |
7788 | 3 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), database)); |
7789 | | |
7790 | 3 | TRACE("Committing in-memory state"); |
7791 | 3 | l.Commit(); |
7792 | | |
7793 | 3 | LOG(INFO) << "Successfully altered keyspace " << req->namespace_().name() |
7794 | 3 | << " per request from " << RequestorString(rpc); |
7795 | 3 | return Status::OK(); |
7796 | 3 | } |
7797 | | |
7798 | | Status CatalogManager::ListNamespaces(const ListNamespacesRequestPB* req, |
7799 | 4.44k | ListNamespacesResponsePB* resp) { |
7800 | 4.44k | NamespaceInfoMap namespace_ids_copy; |
7801 | 4.44k | { |
7802 | 4.44k | SharedLock lock(mutex_); |
7803 | 4.44k | namespace_ids_copy = namespace_ids_map_; |
7804 | 4.44k | } |
7805 | | |
7806 | 19.6k | for (const auto& entry : namespace_ids_copy) { |
7807 | 19.6k | const auto& namespace_info = *entry.second; |
7808 | | // If the request asks for namespaces for a specific database type, filter by the type. |
7809 | 19.6k | if (req->has_database_type() && namespace_info.database_type() != req->database_type()) { |
7810 | 19 | continue; |
7811 | 19 | } |
7812 | | // Only return RUNNING namespaces. |
7813 | 19.5k | if (namespace_info.state() != SysNamespaceEntryPB::RUNNING) { |
7814 | 10 | continue; |
7815 | 10 | } |
7816 | | |
7817 | 19.5k | NamespaceIdentifierPB *ns = resp->add_namespaces(); |
7818 | 19.5k | ns->set_id(namespace_info.id()); |
7819 | 19.5k | ns->set_name(namespace_info.name()); |
7820 | 19.5k | ns->set_database_type(namespace_info.database_type()); |
7821 | 19.5k | } |
7822 | 4.44k | return Status::OK(); |
7823 | 4.44k | } |
7824 | | |
7825 | | Status CatalogManager::GetNamespaceInfo(const GetNamespaceInfoRequestPB* req, |
7826 | | GetNamespaceInfoResponsePB* resp, |
7827 | 1.77k | rpc::RpcContext* rpc) { |
7828 | 1.77k | LOG(INFO) << __func__ << " from " << RequestorString(rpc) << ": " << req->ShortDebugString(); |
7829 | | |
7830 | | // Look up the namespace and verify if it exists. |
7831 | 1.77k | TRACE("Looking up namespace"); |
7832 | 1.77k | auto ns = VERIFY_NAMESPACE_FOUND(FindNamespace(req->namespace_()), resp); |
7833 | | |
7834 | 1.77k | resp->mutable_namespace_()->set_id(ns->id()); |
7835 | 1.77k | resp->mutable_namespace_()->set_name(ns->name()); |
7836 | 1.77k | resp->mutable_namespace_()->set_database_type(ns->database_type()); |
7837 | 1.77k | resp->set_colocated(ns->colocated()); |
7838 | 1.77k | return Status::OK(); |
7839 | 1.77k | } |
7840 | | |
7841 | | Status CatalogManager::RedisConfigSet( |
7842 | 0 | const RedisConfigSetRequestPB* req, RedisConfigSetResponsePB* resp, rpc::RpcContext* rpc) { |
7843 | 0 | DCHECK(req->has_keyword()); |
7844 | 0 | const auto& key = req->keyword(); |
7845 | 0 | SysRedisConfigEntryPB config_entry; |
7846 | 0 | config_entry.set_key(key); |
7847 | 0 | *config_entry.mutable_args() = req->args(); |
7848 | 0 | bool created = false; |
7849 | |
|
7850 | 0 | TRACE("Acquired catalog manager lock"); |
7851 | 0 | LockGuard lock(mutex_); |
7852 | 0 | scoped_refptr<RedisConfigInfo> cfg = FindPtrOrNull(redis_config_map_, req->keyword()); |
7853 | 0 | if (cfg == nullptr) { |
7854 | 0 | created = true; |
7855 | 0 | cfg = new RedisConfigInfo(key); |
7856 | 0 | redis_config_map_[key] = cfg; |
7857 | 0 | } |
7858 | |
|
7859 | 0 | auto wl = cfg->LockForWrite(); |
7860 | 0 | wl.mutable_data()->pb = std::move(config_entry); |
7861 | 0 | if (created) { |
7862 | 0 | CHECK_OK(sys_catalog_->Upsert(leader_ready_term(), cfg)); |
7863 | 0 | } else { |
7864 | 0 | CHECK_OK(sys_catalog_->Upsert(leader_ready_term(), cfg)); |
7865 | 0 | } |
7866 | 0 | wl.Commit(); |
7867 | 0 | return Status::OK(); |
7868 | 0 | } |
7869 | | |
7870 | | Status CatalogManager::RedisConfigGet( |
7871 | 291 | const RedisConfigGetRequestPB* req, RedisConfigGetResponsePB* resp, rpc::RpcContext* rpc) { |
7872 | 291 | DCHECK(req->has_keyword()); |
7873 | 291 | resp->set_keyword(req->keyword()); |
7874 | 291 | TRACE("Acquired catalog manager lock"); |
7875 | 291 | SharedLock lock(mutex_); |
7876 | 291 | scoped_refptr<RedisConfigInfo> cfg = FindPtrOrNull(redis_config_map_, req->keyword()); |
7877 | 291 | if (cfg == nullptr) { |
7878 | 291 | Status s = STATUS_SUBSTITUTE(NotFound, "Redis config for $0 does not exists", req->keyword()); |
7879 | 291 | return SetupError(resp->mutable_error(), MasterErrorPB::REDIS_CONFIG_NOT_FOUND, s); |
7880 | 291 | } |
7881 | 0 | auto rci = cfg->LockForRead(); |
7882 | 0 | resp->mutable_args()->CopyFrom(rci->pb.args()); |
7883 | 0 | return Status::OK(); |
7884 | 0 | } |
7885 | | |
7886 | | Status CatalogManager::CreateUDType(const CreateUDTypeRequestPB* req, |
7887 | | CreateUDTypeResponsePB* resp, |
7888 | 46 | rpc::RpcContext* rpc) { |
7889 | 46 | LOG(INFO) << "CreateUDType from " << RequestorString(rpc) |
7890 | 46 | << ": " << req->DebugString(); |
7891 | | |
7892 | 46 | Status s; |
7893 | 46 | scoped_refptr<UDTypeInfo> tp; |
7894 | 46 | scoped_refptr<NamespaceInfo> ns; |
7895 | | |
7896 | | // Lookup the namespace and verify if it exists. |
7897 | 46 | if (req->has_namespace_()) { |
7898 | 46 | TRACE("Looking up namespace"); |
7899 | 46 | ns = VERIFY_NAMESPACE_FOUND(FindNamespace(req->namespace_()), resp); |
7900 | 46 | if (ns->database_type() != YQLDatabase::YQL_DATABASE_CQL) { |
7901 | 0 | Status s = STATUS(NotFound, "Namespace not found"); |
7902 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s); |
7903 | 0 | } |
7904 | 46 | } |
7905 | | |
7906 | | // Get all the referenced types (if any). |
7907 | 46 | std::vector<std::string> referenced_udts; |
7908 | 86 | for (const QLTypePB& field_type : req->field_types()) { |
7909 | 86 | QLType::GetUserDefinedTypeIds(field_type, /* transitive = */ true, &referenced_udts); |
7910 | 86 | } |
7911 | | |
7912 | 46 | { |
7913 | 46 | TRACE("Acquired catalog manager lock"); |
7914 | 46 | LockGuard lock(mutex_); |
7915 | | |
7916 | | // Verify that the type does not exist. |
7917 | 46 | tp = FindPtrOrNull(udtype_names_map_, std::make_pair(ns->id(), req->name())); |
7918 | | |
7919 | 46 | if (tp != nullptr) { |
7920 | 1 | s = STATUS_SUBSTITUTE(AlreadyPresent, |
7921 | 1 | "Type '$0.$1' already exists", ns->name(), req->name()); |
7922 | 1 | LOG(WARNING) << "Found type: " << tp->id() << ". Failed creating type with error: " |
7923 | 1 | << s.ToString() << " Request:\n" << req->DebugString(); |
7924 | 1 | return SetupError(resp->mutable_error(), MasterErrorPB::TYPE_ALREADY_PRESENT, s); |
7925 | 1 | } |
7926 | | |
7927 | | // Verify that all referenced types actually exist. |
7928 | 45 | for (const auto& udt_id : referenced_udts) { |
7929 | 11 | if (FindPtrOrNull(udtype_ids_map_, udt_id) == nullptr) { |
7930 | | // This may be caused by a stale cache (e.g. referenced type name resolves to an old, |
7931 | | // deleted type). Return InvalidArgument so query layer will clear cache and retry. |
7932 | 0 | s = STATUS_SUBSTITUTE(InvalidArgument, |
7933 | 0 | "Type id '$0' referenced by type '$1' does not exist", udt_id, req->name()); |
7934 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s); |
7935 | 0 | } |
7936 | 11 | } |
7937 | | |
7938 | | // Construct the new type (generate fresh name and set fields). |
7939 | 45 | UDTypeId new_id = GenerateIdUnlocked(SysRowEntryType::UDTYPE); |
7940 | 45 | tp = new UDTypeInfo(new_id); |
7941 | 45 | tp->mutable_metadata()->StartMutation(); |
7942 | 45 | SysUDTypeEntryPB *metadata = &tp->mutable_metadata()->mutable_dirty()->pb; |
7943 | 45 | metadata->set_name(req->name()); |
7944 | 45 | metadata->set_namespace_id(ns->id()); |
7945 | 85 | for (const string& field_name : req->field_names()) { |
7946 | 85 | metadata->add_field_names(field_name); |
7947 | 85 | } |
7948 | | |
7949 | 85 | for (const QLTypePB& field_type : req->field_types()) { |
7950 | 85 | metadata->add_field_types()->CopyFrom(field_type); |
7951 | 85 | } |
7952 | | |
7953 | | // Add the type to the in-memory maps. |
7954 | 45 | udtype_ids_map_[tp->id()] = tp; |
7955 | 45 | udtype_names_map_[std::make_pair(ns->id(), req->name())] = tp; |
7956 | 45 | resp->set_id(tp->id()); |
7957 | 45 | } |
7958 | 45 | TRACE("Inserted new user-defined type info into CatalogManager maps"); |
7959 | | |
7960 | | // Update the on-disk system catalog. |
7961 | 45 | s = sys_catalog_->Upsert(leader_ready_term(), tp); |
7962 | 45 | if (!s.ok()) { |
7963 | 0 | s = s.CloneAndPrepend(Substitute( |
7964 | 0 | "An error occurred while inserting user-defined type to sys-catalog: $0", s.ToString())); |
7965 | 0 | LOG(WARNING) << s.ToString(); |
7966 | 0 | return CheckIfNoLongerLeaderAndSetupError(s, resp); |
7967 | 0 | } |
7968 | 45 | TRACE("Wrote user-defined type to sys-catalog"); |
7969 | | |
7970 | | // Commit the in-memory state. |
7971 | 45 | tp->mutable_metadata()->CommitMutation(); |
7972 | 45 | LOG(INFO) << "Created user-defined type " << tp->ToString(); |
7973 | 45 | return Status::OK(); |
7974 | 45 | } |
7975 | | |
7976 | | Status CatalogManager::DeleteUDType(const DeleteUDTypeRequestPB* req, |
7977 | | DeleteUDTypeResponsePB* resp, |
7978 | 53 | rpc::RpcContext* rpc) { |
7979 | 53 | LOG(INFO) << "Servicing DeleteUDType request from " << RequestorString(rpc) |
7980 | 53 | << ": " << req->ShortDebugString(); |
7981 | | |
7982 | 53 | scoped_refptr<UDTypeInfo> tp; |
7983 | 53 | scoped_refptr<NamespaceInfo> ns; |
7984 | | |
7985 | 53 | if (!req->has_type()) { |
7986 | 0 | Status s = STATUS(InvalidArgument, "No type given", req->DebugString()); |
7987 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s); |
7988 | 0 | } |
7989 | | |
7990 | | // Validate namespace. |
7991 | 53 | if (req->type().has_namespace_()) { |
7992 | | // Lookup the namespace and verify if it exists. |
7993 | 53 | TRACE("Looking up namespace"); |
7994 | 53 | ns = VERIFY_NAMESPACE_FOUND(FindNamespace(req->type().namespace_()), resp); |
7995 | 53 | } |
7996 | | |
7997 | 53 | { |
7998 | 53 | LockGuard lock(mutex_); |
7999 | 53 | TRACE("Acquired catalog manager lock"); |
8000 | | |
8001 | 53 | if (req->type().has_type_id()) { |
8002 | 0 | tp = FindPtrOrNull(udtype_ids_map_, req->type().type_id()); |
8003 | 53 | } else if (req->type().has_type_name()) { |
8004 | 53 | tp = FindPtrOrNull(udtype_names_map_, {ns->id(), req->type().type_name()}); |
8005 | 53 | } |
8006 | | |
8007 | 53 | if (tp == nullptr) { |
8008 | 2 | Status s = STATUS(NotFound, "The type does not exist", req->DebugString()); |
8009 | 2 | return SetupError(resp->mutable_error(), MasterErrorPB::TYPE_NOT_FOUND, s); |
8010 | 2 | } |
8011 | | |
8012 | | // Checking if any table uses this type. |
8013 | | // TODO: this could be more efficient. |
8014 | 959 | for (const TableInfoMap::value_type& entry : *table_ids_map_) { |
8015 | 959 | auto ltm = entry.second->LockForRead(); |
8016 | 959 | if (!ltm->started_deleting()) { |
8017 | 6.96k | for (const auto &col : ltm->schema().columns()) { |
8018 | 6.96k | if (col.type().main() == DataType::USER_DEFINED_TYPE && |
8019 | 8 | col.type().udtype_info().id() == tp->id()) { |
8020 | 2 | Status s = STATUS(QLError, |
8021 | 2 | Substitute("Cannot delete type '$0.$1'. It is used in column $2 of table $3", |
8022 | 2 | ns->name(), tp->name(), col.name(), ltm->name())); |
8023 | 2 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s); |
8024 | 2 | } |
8025 | 6.96k | } |
8026 | 853 | } |
8027 | 959 | } |
8028 | | |
8029 | | // Checking if any other type uses this type (i.e. in the case of nested types). |
8030 | | // TODO: this could be more efficient. |
8031 | 74 | for (const UDTypeInfoMap::value_type& entry : udtype_ids_map_) { |
8032 | 74 | auto ltm = entry.second->LockForRead(); |
8033 | | |
8034 | 203 | for (int i = 0; i < ltm->field_types_size(); i++) { |
8035 | | // Only need to check direct (non-transitive) type dependencies here. |
8036 | | // This also means we report more precise errors for in-use types. |
8037 | 133 | if (QLType::DoesUserDefinedTypeIdExist(ltm->field_types(i), |
8038 | 133 | false /* transitive */, |
8039 | 4 | tp->id())) { |
8040 | 4 | Status s = STATUS(QLError, |
8041 | 4 | Substitute("Cannot delete type '$0.$1'. It is used in field $2 of type '$3'", |
8042 | 4 | ns->name(), tp->name(), ltm->field_names(i), ltm->name())); |
8043 | 4 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s); |
8044 | 4 | } |
8045 | 133 | } |
8046 | 74 | } |
8047 | 49 | } |
8048 | | |
8049 | 45 | auto l = tp->LockForWrite(); |
8050 | | |
8051 | 45 | Status s = sys_catalog_->Delete(leader_ready_term(), tp); |
8052 | 45 | if (!s.ok()) { |
8053 | | // The mutation will be aborted when 'l' exits the scope on early return. |
8054 | 0 | s = s.CloneAndPrepend(Substitute("An error occurred while updating sys-catalog: $0", |
8055 | 0 | s.ToString())); |
8056 | 0 | LOG(WARNING) << s.ToString(); |
8057 | 0 | return CheckIfNoLongerLeaderAndSetupError(s, resp); |
8058 | 0 | } |
8059 | | |
8060 | | // Remove it from the maps. |
8061 | 45 | { |
8062 | 45 | TRACE("Removing from maps"); |
8063 | 45 | LockGuard lock(mutex_); |
8064 | 45 | if (udtype_ids_map_.erase(tp->id()) < 1) { |
8065 | 0 | PANIC_RPC(rpc, "Could not remove user defined type from map, name=" + l->name()); |
8066 | 0 | } |
8067 | 45 | if (udtype_names_map_.erase({ns->id(), tp->name()}) < 1) { |
8068 | 0 | PANIC_RPC(rpc, "Could not remove user defined type from map, name=" + l->name()); |
8069 | 0 | } |
8070 | 45 | } |
8071 | | |
8072 | | // Update the in-memory state. |
8073 | 45 | TRACE("Committing in-memory state"); |
8074 | 45 | l.Commit(); |
8075 | | |
8076 | 45 | LOG(INFO) << "Successfully deleted user-defined type " << tp->ToString() |
8077 | 45 | << " per request from " << RequestorString(rpc); |
8078 | | |
8079 | 45 | return Status::OK(); |
8080 | 45 | } |
8081 | | |
8082 | | Status CatalogManager::GetUDTypeInfo(const GetUDTypeInfoRequestPB* req, |
8083 | | GetUDTypeInfoResponsePB* resp, |
8084 | 55 | rpc::RpcContext* rpc) { |
8085 | 55 | LOG(INFO) << "GetUDTypeInfo from " << RequestorString(rpc) |
8086 | 55 | << ": " << req->DebugString(); |
8087 | 55 | Status s; |
8088 | 55 | scoped_refptr<UDTypeInfo> tp; |
8089 | 55 | scoped_refptr<NamespaceInfo> ns; |
8090 | | |
8091 | 55 | if (!req->has_type()) { |
8092 | 0 | s = STATUS(InvalidArgument, "Cannot get type, no type identifier given", req->DebugString()); |
8093 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::TYPE_NOT_FOUND, s); |
8094 | 0 | } |
8095 | | |
8096 | 55 | if (req->type().has_type_id()) { |
8097 | 0 | tp = FindPtrOrNull(udtype_ids_map_, req->type().type_id()); |
8098 | 55 | } else if (req->type().has_type_name() && req->type().has_namespace_()) { |
8099 | | // Lookup the type and verify if it exists. |
8100 | 55 | TRACE("Looking up namespace"); |
8101 | 55 | ns = VERIFY_NAMESPACE_FOUND(FindNamespace(req->type().namespace_()), resp); |
8102 | | |
8103 | 55 | tp = FindPtrOrNull(udtype_names_map_, std::make_pair(ns->id(), req->type().type_name())); |
8104 | 55 | } |
8105 | | |
8106 | 55 | if (tp == nullptr) { |
8107 | 7 | s = STATUS(InvalidArgument, "Couldn't find type", req->DebugString()); |
8108 | 7 | return SetupError(resp->mutable_error(), MasterErrorPB::TYPE_NOT_FOUND, s); |
8109 | 7 | } |
8110 | | |
8111 | 48 | { |
8112 | 48 | auto type_lock = tp->LockForRead(); |
8113 | | |
8114 | 48 | UDTypeInfoPB* type_info = resp->mutable_udtype(); |
8115 | | |
8116 | 48 | type_info->set_name(tp->name()); |
8117 | 48 | type_info->set_id(tp->id()); |
8118 | 48 | type_info->mutable_namespace_()->set_id(type_lock->namespace_id()); |
8119 | | |
8120 | 140 | for (int i = 0; i < type_lock->field_names_size(); i++) { |
8121 | 92 | type_info->add_field_names(type_lock->field_names(i)); |
8122 | 92 | } |
8123 | 140 | for (int i = 0; i < type_lock->field_types_size(); i++) { |
8124 | 92 | type_info->add_field_types()->CopyFrom(type_lock->field_types(i)); |
8125 | 92 | } |
8126 | | |
8127 | 48 | LOG(INFO) << "Retrieved user-defined type " << tp->ToString(); |
8128 | 48 | } |
8129 | 48 | return Status::OK(); |
8130 | 48 | } |
8131 | | |
8132 | | Status CatalogManager::ListUDTypes(const ListUDTypesRequestPB* req, |
8133 | 0 | ListUDTypesResponsePB* resp) { |
8134 | 0 | SharedLock lock(mutex_); |
8135 | | |
8136 | | // Lookup the namespace and verify that it exists. |
8137 | 0 | auto ns = VERIFY_NAMESPACE_FOUND(FindNamespaceUnlocked(req->namespace_()), resp); |
8138 | |
|
8139 | 0 | for (const UDTypeInfoByNameMap::value_type& entry : udtype_names_map_) { |
8140 | 0 | auto ltm = entry.second->LockForRead(); |
8141 | | |
8142 | | // key is a pair <namespace_id, type_name>. |
8143 | 0 | if (!ns->id().empty() && ns->id() != entry.first.first) { |
8144 | 0 | continue; // Skip types from other namespaces. |
8145 | 0 | } |
8146 | | |
8147 | 0 | UDTypeInfoPB* udtype = resp->add_udtypes(); |
8148 | 0 | udtype->set_id(entry.second->id()); |
8149 | 0 | udtype->set_name(ltm->name()); |
8150 | 0 | for (int i = 0; i <= ltm->field_names_size(); i++) { |
8151 | 0 | udtype->add_field_names(ltm->field_names(i)); |
8152 | 0 | } |
8153 | 0 | for (int i = 0; i <= ltm->field_types_size(); i++) { |
8154 | 0 | udtype->add_field_types()->CopyFrom(ltm->field_types(i)); |
8155 | 0 | } |
8156 | |
|
8157 | 0 | if (CHECK_NOTNULL(ns.get())) { |
8158 | 0 | auto l = ns->LockForRead(); |
8159 | 0 | udtype->mutable_namespace_()->set_id(ns->id()); |
8160 | 0 | udtype->mutable_namespace_()->set_name(ns->name()); |
8161 | 0 | } |
8162 | 0 | } |
8163 | 0 | return Status::OK(); |
8164 | 0 | } |
8165 | | |
8166 | | // For non-enterprise builds, this is a no-op. |
8167 | 0 | Status CatalogManager::DeleteCDCStreamsForTable(const TableId& table) { |
8168 | 0 | return Status::OK(); |
8169 | 0 | } |
8170 | | |
8171 | 0 | Status CatalogManager::DeleteCDCStreamsForTables(const vector<TableId>& table_ids) { |
8172 | 0 | return Status::OK(); |
8173 | 0 | } |
8174 | | |
8175 | | |
8176 | 0 | bool CatalogManager::CDCStreamExistsUnlocked(const CDCStreamId& stream_id) { |
8177 | 0 | return false; |
8178 | 0 | } |
8179 | | |
8180 | 0 | Result<uint64_t> CatalogManager::IncrementYsqlCatalogVersion() { |
8181 | |
|
8182 | 0 | auto l = CHECK_NOTNULL(ysql_catalog_config_.get())->LockForWrite(); |
8183 | 0 | uint64_t new_version = l->pb.ysql_catalog_config().version() + 1; |
8184 | 0 | l.mutable_data()->pb.mutable_ysql_catalog_config()->set_version(new_version); |
8185 | | |
8186 | | // Write to sys_catalog and in memory. |
8187 | 0 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), ysql_catalog_config_)); |
8188 | 0 | l.Commit(); |
8189 | |
|
8190 | 0 | if (FLAGS_log_ysql_catalog_versions) { |
8191 | 0 | LOG_WITH_FUNC(WARNING) << "set catalog version: " << new_version |
8192 | 0 | << " (using old protobuf method)"; |
8193 | 0 | } |
8194 | |
|
8195 | 0 | return new_version; |
8196 | 0 | } |
8197 | | |
8198 | 361 | Status CatalogManager::InitDbFinished(Status initdb_status, int64_t term) { |
8199 | 361 | if (initdb_status.ok()) { |
8200 | 361 | LOG(INFO) << "initdb completed successfully"; |
8201 | 0 | } else { |
8202 | 0 | LOG(ERROR) << "initdb failed: " << initdb_status; |
8203 | 0 | } |
8204 | | |
8205 | 361 | auto l = CHECK_NOTNULL(ysql_catalog_config_.get())->LockForWrite(); |
8206 | 361 | auto* mutable_ysql_catalog_config = l.mutable_data()->pb.mutable_ysql_catalog_config(); |
8207 | 361 | mutable_ysql_catalog_config->set_initdb_done(true); |
8208 | 361 | if (!initdb_status.ok()) { |
8209 | 0 | mutable_ysql_catalog_config->set_initdb_error(initdb_status.ToString()); |
8210 | 361 | } else { |
8211 | 361 | mutable_ysql_catalog_config->clear_initdb_error(); |
8212 | 361 | } |
8213 | | |
8214 | 361 | RETURN_NOT_OK(sys_catalog_->Upsert(term, ysql_catalog_config_)); |
8215 | 361 | l.Commit(); |
8216 | 361 | return Status::OK(); |
8217 | 361 | } |
8218 | | |
8219 | | CHECKED_STATUS CatalogManager::IsInitDbDone( |
8220 | | const IsInitDbDoneRequestPB* req, |
8221 | 715 | IsInitDbDoneResponsePB* resp) { |
8222 | 715 | auto l = CHECK_NOTNULL(ysql_catalog_config_.get())->LockForRead(); |
8223 | 715 | const auto& ysql_catalog_config = l->pb.ysql_catalog_config(); |
8224 | 715 | resp->set_pg_proc_exists(pg_proc_exists_.load(std::memory_order_acquire)); |
8225 | 715 | resp->set_done(ysql_catalog_config.initdb_done()); |
8226 | 715 | if (ysql_catalog_config.has_initdb_error() && |
8227 | 0 | !ysql_catalog_config.initdb_error().empty()) { |
8228 | 0 | resp->set_initdb_error(ysql_catalog_config.initdb_error()); |
8229 | 0 | } |
8230 | 715 | return Status::OK(); |
8231 | 715 | } |
8232 | | |
8233 | | Status CatalogManager::GetYsqlCatalogVersion(uint64_t* catalog_version, |
8234 | 385k | uint64_t* last_breaking_version) { |
8235 | 385k | auto table_info = GetTableInfo(kPgYbCatalogVersionTableId); |
8236 | 385k | if (table_info != nullptr) { |
8237 | 96.5k | RETURN_NOT_OK(sys_catalog_->ReadYsqlCatalogVersion(kPgYbCatalogVersionTableId, |
8238 | 96.5k | catalog_version, |
8239 | 96.5k | last_breaking_version)); |
8240 | | // If the version is properly initialized, we're done. |
8241 | 96.5k | if ((!catalog_version || *catalog_version > 0) && |
8242 | 95.7k | (!last_breaking_version || *last_breaking_version > 0)) { |
8243 | 95.1k | return Status::OK(); |
8244 | 95.1k | } |
8245 | | // However, it's possible for a table to have no entries mid-migration or if migration fails. |
8246 | | // In this case we'd like to fall back to the legacy approach. |
8247 | 96.5k | } |
8248 | | |
8249 | 290k | auto l = ysql_catalog_config_->LockForRead(); |
8250 | | // last_breaking_version is the last version (change) that invalidated ongoing transactions. |
8251 | | // If using the old (protobuf-based) version method, we do not have any information about |
8252 | | // breaking changes so assuming every change is a breaking change. |
8253 | 290k | if (catalog_version) { |
8254 | 288k | *catalog_version = l->pb.ysql_catalog_config().version(); |
8255 | 288k | } |
8256 | 290k | if (last_breaking_version) { |
8257 | 287k | *last_breaking_version = l->pb.ysql_catalog_config().version(); |
8258 | 287k | } |
8259 | 290k | return Status::OK(); |
8260 | 290k | } |
8261 | | |
8262 | 1.94k | Status CatalogManager::InitializeTransactionTablesConfig(int64_t term) { |
8263 | 1.94k | SysTransactionTablesConfigEntryPB transaction_tables_config; |
8264 | 1.94k | transaction_tables_config.set_version(0); |
8265 | | |
8266 | | // Create in memory objects. |
8267 | 1.94k | transaction_tables_config_ = new SysConfigInfo(kTransactionTablesConfigType); |
8268 | | |
8269 | | // Prepare write. |
8270 | 1.94k | auto l = transaction_tables_config_->LockForWrite(); |
8271 | 1.94k | *l.mutable_data()->pb.mutable_transaction_tables_config() = std::move(transaction_tables_config); |
8272 | | |
8273 | | // Write to sys_catalog and in memory. |
8274 | 1.94k | RETURN_NOT_OK(sys_catalog_->Upsert(term, transaction_tables_config_)); |
8275 | 1.94k | l.Commit(); |
8276 | | |
8277 | 1.94k | return Status::OK(); |
8278 | 1.94k | } |
8279 | | |
8280 | 563 | Status CatalogManager::IncrementTransactionTablesVersion() { |
8281 | 563 | auto l = CHECK_NOTNULL(transaction_tables_config_.get())->LockForWrite(); |
8282 | 563 | uint64_t new_version = l->pb.transaction_tables_config().version() + 1; |
8283 | 563 | l.mutable_data()->pb.mutable_transaction_tables_config()->set_version(new_version); |
8284 | | |
8285 | | // Write to sys_catalog and in memory. |
8286 | 563 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), transaction_tables_config_)); |
8287 | 563 | l.Commit(); |
8288 | | |
8289 | 563 | LOG(INFO) << "Set transaction tables version: " << new_version; |
8290 | | |
8291 | 563 | return Status::OK(); |
8292 | 563 | } |
8293 | | |
8294 | 383k | uint64_t CatalogManager::GetTransactionTablesVersion() { |
8295 | 383k | auto l = CHECK_NOTNULL(transaction_tables_config_.get())->LockForRead(); |
8296 | 383k | return l->pb.transaction_tables_config().version(); |
8297 | 383k | } |
8298 | | |
8299 | 55 | Status CatalogManager::RegisterTsFromRaftConfig(const consensus::RaftPeerPB& peer) { |
8300 | 55 | NodeInstancePB instance_pb; |
8301 | 55 | instance_pb.set_permanent_uuid(peer.permanent_uuid()); |
8302 | 55 | instance_pb.set_instance_seqno(0); |
8303 | | |
8304 | 55 | TSRegistrationPB registration_pb; |
8305 | 55 | auto* common = registration_pb.mutable_common(); |
8306 | 55 | *common->mutable_private_rpc_addresses() = peer.last_known_private_addr(); |
8307 | 55 | *common->mutable_broadcast_addresses() = peer.last_known_broadcast_addr(); |
8308 | 55 | *common->mutable_cloud_info() = peer.cloud_info(); |
8309 | | |
8310 | | // Todo(Rahul) : May need to be changed when we implement table level overrides. |
8311 | 55 | { |
8312 | 55 | auto l = cluster_config_->LockForRead(); |
8313 | | // If the config has no replication info, use empty string for the placement uuid, otherwise |
8314 | | // calculate it from the reported peer. |
8315 | 55 | auto placement_uuid = l->pb.has_replication_info() |
8316 | 55 | ? VERIFY_RESULT(CatalogManagerUtil::GetPlacementUuidFromRaftPeer( |
8317 | 55 | l->pb.replication_info(), peer)) |
8318 | 40 | : ""; |
8319 | 55 | common->set_placement_uuid(placement_uuid); |
8320 | 55 | } |
8321 | 55 | return master_->ts_manager()->RegisterTS(instance_pb, registration_pb, master_->MakeCloudInfoPB(), |
8322 | 55 | &master_->proxy_cache(), |
8323 | 55 | RegisteredThroughHeartbeat::kFalse); |
8324 | 55 | } |
8325 | | |
8326 | | void CatalogManager::ReconcileTabletReplicasInLocalMemoryWithReport( |
8327 | | const scoped_refptr<TabletInfo>& tablet, |
8328 | | const std::string& sender_uuid, |
8329 | | const ConsensusStatePB& consensus_state, |
8330 | 64.2k | const ReportedTabletPB& report) { |
8331 | 64.2k | auto replica_locations = std::make_shared<TabletReplicaMap>(); |
8332 | 64.2k | auto prev_rl = tablet->GetReplicaLocations(); |
8333 | | |
8334 | 191k | for (const consensus::RaftPeerPB& peer : consensus_state.config().peers()) { |
8335 | 191k | shared_ptr<TSDescriptor> ts_desc; |
8336 | 191k | if (!peer.has_permanent_uuid()) { |
8337 | 0 | LOG_WITH_PREFIX(WARNING) << "Missing UUID for peer" << peer.ShortDebugString(); |
8338 | 0 | continue; |
8339 | 0 | } |
8340 | 191k | if (!master_->ts_manager()->LookupTSByUUID(peer.permanent_uuid(), &ts_desc)) { |
8341 | 55 | if (!GetAtomicFlag(&FLAGS_enable_register_ts_from_raft)) { |
8342 | 0 | LOG_WITH_PREFIX(WARNING) << "Tablet server has never reported in. " |
8343 | 0 | << "Not including in replica locations map yet. Peer: " << peer.ShortDebugString() |
8344 | 0 | << "; Tablet: " << tablet->ToString(); |
8345 | 0 | continue; |
8346 | 0 | } |
8347 | | |
8348 | 55 | LOG_WITH_PREFIX(INFO) << "Tablet server has never reported in. Registering the ts using " |
8349 | 55 | << "the raft config. Peer: " << peer.ShortDebugString() |
8350 | 55 | << "; Tablet: " << tablet->ToString(); |
8351 | 55 | Status s = RegisterTsFromRaftConfig(peer); |
8352 | 55 | if (!s.ok()) { |
8353 | 9 | LOG_WITH_PREFIX(WARNING) << "Could not register ts from raft config: " << s |
8354 | 9 | << " Skip updating the replica map."; |
8355 | 9 | continue; |
8356 | 9 | } |
8357 | | |
8358 | | // Guaranteed to find the ts since we just registered. |
8359 | 46 | master_->ts_manager()->LookupTSByUUID(peer.permanent_uuid(), &ts_desc); |
8360 | 46 | if (!ts_desc.get()) { |
8361 | 0 | LOG_WITH_PREFIX(WARNING) << "Could not find ts with uuid " << peer.permanent_uuid() |
8362 | 0 | << " after registering from raft config. Skip updating the replica" |
8363 | 0 | << " map."; |
8364 | 0 | continue; |
8365 | 0 | } |
8366 | 191k | } |
8367 | | |
8368 | | // Do not update replicas in the NOT_STARTED or BOOTSTRAPPING state (unless they are stale). |
8369 | 191k | bool use_existing = false; |
8370 | 191k | const TabletReplica* existing_replica = nullptr; |
8371 | 191k | auto it = prev_rl->find(ts_desc->permanent_uuid()); |
8372 | 191k | if (it != prev_rl->end()) { |
8373 | 106k | existing_replica = &it->second; |
8374 | 106k | } |
8375 | 191k | if (existing_replica && peer.permanent_uuid() != sender_uuid) { |
8376 | | // IsStarting returns true if state == NOT_STARTED or state == BOOTSTRAPPING. |
8377 | 71.2k | use_existing = existing_replica->IsStarting() && !existing_replica->IsStale(); |
8378 | 71.2k | } |
8379 | 191k | if (use_existing) { |
8380 | 377 | InsertOrDie(replica_locations.get(), existing_replica->ts_desc->permanent_uuid(), |
8381 | 377 | *existing_replica); |
8382 | 190k | } else { |
8383 | 190k | TabletReplica replica; |
8384 | 190k | CreateNewReplicaForLocalMemory(ts_desc.get(), &consensus_state, report, &replica); |
8385 | 190k | auto result = replica_locations.get()->insert({replica.ts_desc->permanent_uuid(), replica}); |
8386 | 7 | LOG_IF(FATAL, !result.second) << "duplicate uuid: " << replica.ts_desc->permanent_uuid(); |
8387 | 190k | if (existing_replica) { |
8388 | 106k | result.first->second.UpdateDriveInfo(existing_replica->drive_info); |
8389 | 106k | } |
8390 | 190k | } |
8391 | 191k | } |
8392 | | |
8393 | | // Update the local tablet replica set. This deviates from persistent state during bootstrapping. |
8394 | 64.2k | tablet->SetReplicaLocations(replica_locations); |
8395 | 64.2k | tablet_locations_version_.fetch_add(1, std::memory_order_acq_rel); |
8396 | 64.2k | } |
8397 | | |
8398 | | void CatalogManager::UpdateTabletReplicaInLocalMemory(TSDescriptor* ts_desc, |
8399 | | const ConsensusStatePB* consensus_state, |
8400 | | const ReportedTabletPB& report, |
8401 | 188k | const scoped_refptr<TabletInfo>& tablet) { |
8402 | 188k | TabletReplica replica; |
8403 | 188k | CreateNewReplicaForLocalMemory(ts_desc, consensus_state, report, &replica); |
8404 | 188k | tablet->UpdateReplicaLocations(replica); |
8405 | 188k | tablet_locations_version_.fetch_add(1, std::memory_order_acq_rel); |
8406 | 188k | } |
8407 | | |
8408 | | void CatalogManager::CreateNewReplicaForLocalMemory(TSDescriptor* ts_desc, |
8409 | | const ConsensusStatePB* consensus_state, |
8410 | | const ReportedTabletPB& report, |
8411 | 378k | TabletReplica* new_replica) { |
8412 | | // Tablets in state NOT_STARTED or BOOTSTRAPPING don't have a consensus. |
8413 | 378k | if (consensus_state == nullptr) { |
8414 | 2.29k | new_replica->role = PeerRole::NON_PARTICIPANT; |
8415 | 2.29k | new_replica->member_type = PeerMemberType::UNKNOWN_MEMBER_TYPE; |
8416 | 376k | } else { |
8417 | 0 | CHECK(consensus_state != nullptr) << "No cstate: " << ts_desc->permanent_uuid() |
8418 | 0 | << " - " << report.state(); |
8419 | 376k | new_replica->role = GetConsensusRole(ts_desc->permanent_uuid(), *consensus_state); |
8420 | 376k | new_replica->member_type = GetConsensusMemberType(ts_desc->permanent_uuid(), *consensus_state); |
8421 | 376k | } |
8422 | 378k | if (report.has_should_disable_lb_move()) { |
8423 | 376k | new_replica->should_disable_lb_move = report.should_disable_lb_move(); |
8424 | 376k | } |
8425 | 378k | if (report.has_fs_data_dir()) { |
8426 | 378k | new_replica->fs_data_dir = report.fs_data_dir(); |
8427 | 378k | } |
8428 | 378k | new_replica->state = report.state(); |
8429 | 378k | new_replica->ts_desc = ts_desc; |
8430 | 378k | if (!ts_desc->registered_through_heartbeat()) { |
8431 | 2.46k | new_replica->time_updated = MonoTime::Now() - ts_desc->TimeSinceHeartbeat(); |
8432 | 2.46k | } |
8433 | 378k | } |
8434 | | |
8435 | | Status CatalogManager::GetTabletPeer(const TabletId& tablet_id, |
8436 | 1.15M | std::shared_ptr<TabletPeer>* ret_tablet_peer) const { |
8437 | | // Note: CatalogManager has only one table, 'sys_catalog', with only |
8438 | | // one tablet. |
8439 | | |
8440 | 1.15M | if (PREDICT_FALSE(!IsInitialized())) { |
8441 | | // Master puts up the consensus service first and then initiates catalog manager's creation |
8442 | | // asynchronously. So this case is possible, but harmless. The RPC will simply be retried. |
8443 | | // Previously, because we weren't checking for this condition, we would fatal down stream. |
8444 | 100 | const string& reason = "CatalogManager is not yet initialized"; |
8445 | 100 | YB_LOG_EVERY_N(WARNING, 1000) << reason; |
8446 | 100 | return STATUS(ServiceUnavailable, reason); |
8447 | 100 | } |
8448 | | |
8449 | 0 | CHECK(sys_catalog_) << "sys_catalog_ must be initialized!"; |
8450 | | |
8451 | 1.15M | if (master_->opts().IsShellMode()) { |
8452 | 143 | return STATUS_SUBSTITUTE(NotFound, |
8453 | 143 | "In shell mode: no tablet_id $0 exists in CatalogManager.", tablet_id); |
8454 | 143 | } |
8455 | | |
8456 | 1.15M | if (sys_catalog_->tablet_id() == tablet_id && sys_catalog_->tablet_peer().get() != nullptr && |
8457 | 1.15M | sys_catalog_->tablet_peer()->CheckRunning().ok()) { |
8458 | 1.15M | *ret_tablet_peer = tablet_peer(); |
8459 | 0 | } else { |
8460 | 0 | return STATUS_SUBSTITUTE(NotFound, |
8461 | 0 | "no SysTable in the RUNNING state exists with tablet_id $0 in CatalogManager", tablet_id); |
8462 | 0 | } |
8463 | 1.15M | return Status::OK(); |
8464 | 1.15M | } |
8465 | | |
8466 | 1.18M | const NodeInstancePB& CatalogManager::NodeInstance() const { |
8467 | 1.18M | return master_->instance_pb(); |
8468 | 1.18M | } |
8469 | | |
8470 | 19.9k | Status CatalogManager::GetRegistration(ServerRegistrationPB* reg) const { |
8471 | 19.9k | return master_->GetRegistration(reg, server::RpcOnly::kTrue); |
8472 | 19.9k | } |
8473 | | |
8474 | 39 | Status CatalogManager::UpdateMastersListInMemoryAndDisk() { |
8475 | 39 | DCHECK(master_->opts().IsShellMode()); |
8476 | | |
8477 | 39 | if (!master_->opts().IsShellMode()) { |
8478 | 0 | return STATUS(IllegalState, "Cannot update master's info when process is not in shell mode."); |
8479 | 0 | } |
8480 | | |
8481 | 39 | consensus::ConsensusStatePB consensus_state; |
8482 | 39 | RETURN_NOT_OK(GetCurrentConfig(&consensus_state)); |
8483 | | |
8484 | 39 | if (!consensus_state.has_config()) { |
8485 | 0 | return STATUS(NotFound, "No Raft config found."); |
8486 | 0 | } |
8487 | | |
8488 | 39 | RETURN_NOT_OK(sys_catalog_->ConvertConfigToMasterAddresses(consensus_state.config())); |
8489 | 39 | RETURN_NOT_OK(sys_catalog_->CreateAndFlushConsensusMeta(master_->fs_manager(), |
8490 | 39 | consensus_state.config(), |
8491 | 39 | consensus_state.current_term())); |
8492 | | |
8493 | 39 | return Status::OK(); |
8494 | 39 | } |
8495 | | |
8496 | 5.35k | Status CatalogManager::EnableBgTasks() { |
8497 | 5.35k | LockGuard lock(mutex_); |
8498 | | // Initialize refresh_ysql_tablespace_info_task_. This will be used to |
8499 | | // manage the background task that refreshes tablespace info. This task |
8500 | | // will be started by the CatalogManagerBgTasks below. |
8501 | 5.35k | refresh_ysql_tablespace_info_task_.Bind(&master_->messenger()->scheduler()); |
8502 | | |
8503 | 5.35k | background_tasks_.reset(new CatalogManagerBgTasks(this)); |
8504 | 5.35k | RETURN_NOT_OK_PREPEND(background_tasks_->Init(), |
8505 | 5.35k | "Failed to initialize catalog manager background tasks"); |
8506 | | |
8507 | | // Add bg thread to rebuild yql system partitions. |
8508 | 5.35k | refresh_yql_partitions_task_.Bind(&master_->messenger()->scheduler()); |
8509 | | |
8510 | 5.35k | RETURN_NOT_OK(background_tasks_thread_pool_->SubmitFunc( |
8511 | 5.35k | [this]() { RebuildYQLSystemPartitions(); })); |
8512 | | |
8513 | 5.35k | return Status::OK(); |
8514 | 5.35k | } |
8515 | | |
8516 | 125 | Status CatalogManager::StartRemoteBootstrap(const StartRemoteBootstrapRequestPB& req) { |
8517 | 125 | const TabletId& tablet_id = req.tablet_id(); |
8518 | 125 | std::unique_lock<std::mutex> l(remote_bootstrap_mtx_, std::try_to_lock); |
8519 | 125 | if (!l.owns_lock()) { |
8520 | 84 | return STATUS_SUBSTITUTE(AlreadyPresent, |
8521 | 84 | "Remote bootstrap of tablet $0 already in progress", tablet_id); |
8522 | 84 | } |
8523 | | |
8524 | 41 | if (!master_->opts().IsShellMode()) { |
8525 | 0 | return STATUS(IllegalState, "Cannot bootstrap a master which is not in shell mode."); |
8526 | 0 | } |
8527 | | |
8528 | 41 | LOG(INFO) << "Starting remote bootstrap: " << req.ShortDebugString(); |
8529 | | |
8530 | 41 | HostPort bootstrap_peer_addr = HostPortFromPB(DesiredHostPort( |
8531 | 41 | req.source_broadcast_addr(), req.source_private_addr(), req.source_cloud_info(), |
8532 | 41 | master_->MakeCloudInfoPB())); |
8533 | | |
8534 | 41 | const string& bootstrap_peer_uuid = req.bootstrap_peer_uuid(); |
8535 | 41 | int64_t leader_term = req.caller_term(); |
8536 | | |
8537 | 41 | std::shared_ptr<TabletPeer> old_tablet_peer; |
8538 | 41 | RaftGroupMetadataPtr meta; |
8539 | 41 | bool replacing_tablet = false; |
8540 | | |
8541 | 41 | if (tablet_exists_) { |
8542 | 0 | old_tablet_peer = tablet_peer(); |
8543 | | // Nothing to recover if the remote bootstrap client start failed the last time. |
8544 | 0 | if (old_tablet_peer) { |
8545 | 0 | meta = old_tablet_peer->tablet_metadata(); |
8546 | 0 | replacing_tablet = true; |
8547 | 0 | } |
8548 | 0 | } |
8549 | | |
8550 | 41 | if (replacing_tablet) { |
8551 | | // Make sure the existing tablet peer is shut down and tombstoned. |
8552 | 0 | RETURN_NOT_OK(tserver::HandleReplacingStaleTablet(meta, |
8553 | 0 | old_tablet_peer, |
8554 | 0 | tablet_id, |
8555 | 0 | master_->fs_manager()->uuid(), |
8556 | 0 | leader_term)); |
8557 | 0 | } |
8558 | | |
8559 | 41 | LOG_WITH_PREFIX(INFO) << " Initiating remote bootstrap from peer " << bootstrap_peer_uuid |
8560 | 41 | << " (" << bootstrap_peer_addr.ToString() << ")."; |
8561 | | |
8562 | 41 | auto rb_client = std::make_unique<tserver::RemoteBootstrapClient>( |
8563 | 41 | tablet_id, master_->fs_manager()); |
8564 | | |
8565 | | // Download and persist the remote superblock in TABLET_DATA_COPYING state. |
8566 | 41 | if (replacing_tablet) { |
8567 | 0 | RETURN_NOT_OK(rb_client->SetTabletToReplace(meta, leader_term)); |
8568 | 0 | } |
8569 | 41 | RETURN_NOT_OK(rb_client->Start( |
8570 | 41 | bootstrap_peer_uuid, &master_->proxy_cache(), bootstrap_peer_addr, &meta)); |
8571 | | // This SetupTabletPeer is needed by rb_client to perform the remote bootstrap/fetch. |
8572 | | // And the SetupTablet below to perform "local bootstrap" cannot be done until the remote fetch |
8573 | | // has succeeded. So keeping them seperate for now. |
8574 | 40 | sys_catalog_->SetupTabletPeer(meta); |
8575 | 40 | if (PREDICT_FALSE(FLAGS_TEST_inject_latency_during_remote_bootstrap_secs)) { |
8576 | 1 | LOG(INFO) << "Injecting " << FLAGS_TEST_inject_latency_during_remote_bootstrap_secs |
8577 | 1 | << " seconds of latency for test"; |
8578 | 1 | SleepFor(MonoDelta::FromSeconds(FLAGS_TEST_inject_latency_during_remote_bootstrap_secs)); |
8579 | 1 | } |
8580 | | |
8581 | | // From this point onward, the superblock is persisted in TABLET_DATA_COPYING |
8582 | | // state, and we need to tombstone the tablet if additional steps prior to |
8583 | | // getting to a TABLET_DATA_READY state fail. |
8584 | 40 | tablet_exists_ = true; |
8585 | | |
8586 | | // Download all of the remote files. |
8587 | 40 | TOMBSTONE_NOT_OK(rb_client->FetchAll(tablet_peer()->status_listener()), |
8588 | 40 | meta, |
8589 | 40 | master_->fs_manager()->uuid(), |
8590 | 40 | Substitute("Remote bootstrap: Unable to fetch data from remote peer $0 ($1)", |
8591 | 40 | bootstrap_peer_uuid, bootstrap_peer_addr.ToString()), |
8592 | 40 | nullptr); |
8593 | | |
8594 | | // Write out the last files to make the new replica visible and update the |
8595 | | // TabletDataState in the superblock to TABLET_DATA_READY. |
8596 | | // Finish() will call EndRemoteSession() and wait for the leader to successfully submit a |
8597 | | // ChangeConfig request (to change this master's role from PRE_VOTER or PRE_OBSERVER to VOTER or |
8598 | | // OBSERVER respectively). If the RPC times out, we will ignore the error (since the leader could |
8599 | | // have successfully submitted the ChangeConfig request and failed to respond before in time) |
8600 | | // and check the committed config until we find that this master's role has changed, or until we |
8601 | | // time out which will cause us to tombstone the tablet. |
8602 | 39 | TOMBSTONE_NOT_OK(rb_client->Finish(), |
8603 | 39 | meta, |
8604 | 39 | master_->fs_manager()->uuid(), |
8605 | 39 | "Remote bootstrap: Failed calling Finish()", |
8606 | 39 | nullptr); |
8607 | | |
8608 | | // Synchronous tablet open for "local bootstrap". |
8609 | 39 | RETURN_NOT_OK(tserver::ShutdownAndTombstoneTabletPeerNotOk( |
8610 | 39 | sys_catalog_->OpenTablet(meta), sys_catalog_->tablet_peer(), meta, |
8611 | 39 | master_->fs_manager()->uuid(), "Remote bootstrap: Failed opening sys catalog")); |
8612 | | |
8613 | | // Set up the in-memory master list and also flush the cmeta. |
8614 | 39 | RETURN_NOT_OK(UpdateMastersListInMemoryAndDisk()); |
8615 | | |
8616 | 39 | master_->SetShellMode(false); |
8617 | | |
8618 | | // Call VerifyChangeRoleSucceeded only after we have set shell mode to false. Otherwise, |
8619 | | // CatalogManager::GetTabletPeer will always return an error, and the consensus will never get |
8620 | | // updated. |
8621 | 39 | auto status = rb_client->VerifyChangeRoleSucceeded( |
8622 | 39 | sys_catalog_->tablet_peer()->shared_consensus()); |
8623 | | |
8624 | 39 | if (!status.ok()) { |
8625 | 0 | LOG_WITH_PREFIX(WARNING) << "Remote bootstrap finished. " |
8626 | 0 | << "Failed calling VerifyChangeRoleSucceeded: " |
8627 | 0 | << status.ToString(); |
8628 | 39 | } else { |
8629 | 39 | LOG_WITH_PREFIX(INFO) << "Remote bootstrap finished successfully"; |
8630 | 39 | } |
8631 | | |
8632 | 39 | LOG(INFO) << "Master completed remote bootstrap and is out of shell mode."; |
8633 | | |
8634 | 39 | RETURN_NOT_OK(EnableBgTasks()); |
8635 | | |
8636 | 39 | return Status::OK(); |
8637 | 39 | } |
8638 | | |
8639 | | CHECKED_STATUS CatalogManager::SendAlterTableRequest(const scoped_refptr<TableInfo>& table, |
8640 | 5.20k | const AlterTableRequestPB* req) { |
8641 | 5.20k | auto tablets = table->GetTablets(); |
8642 | | |
8643 | 5.20k | bool is_ysql_table_with_transaction_metadata = |
8644 | 5.20k | table->GetTableType() == TableType::PGSQL_TABLE_TYPE && |
8645 | 3.07k | req != nullptr && |
8646 | 2.69k | req->has_transaction() && |
8647 | 154 | req->transaction().has_transaction_id(); |
8648 | | |
8649 | 5.20k | bool alter_table_has_add_or_drop_column_step = false; |
8650 | 5.20k | if (req && (req->alter_schema_steps_size() || req->has_alter_properties())) { |
8651 | 268 | for (const AlterTableRequestPB::Step& step : req->alter_schema_steps()) { |
8652 | 268 | if (step.type() == AlterTableRequestPB::ADD_COLUMN || |
8653 | 253 | step.type() == AlterTableRequestPB::DROP_COLUMN) { |
8654 | 253 | alter_table_has_add_or_drop_column_step = true; |
8655 | 253 | break; |
8656 | 253 | } |
8657 | 268 | } |
8658 | 275 | } |
8659 | | |
8660 | 5.20k | TransactionId txn_id = TransactionId::Nil(); |
8661 | 5.20k | if (is_ysql_table_with_transaction_metadata && alter_table_has_add_or_drop_column_step) { |
8662 | 111 | { |
8663 | 111 | LOG(INFO) << "Persist transaction metadata into SysTableEntryPB for table ID " << table->id(); |
8664 | 111 | TRACE("Locking table"); |
8665 | 111 | auto l = table->LockForWrite(); |
8666 | 111 | auto& tablet_data = *l.mutable_data(); |
8667 | 111 | auto& table_pb = tablet_data.pb; |
8668 | 111 | table_pb.mutable_transaction()->CopyFrom(req->transaction()); |
8669 | | |
8670 | | // Update sys-catalog with the transaction ID. |
8671 | 111 | TRACE("Updating table metadata on disk"); |
8672 | 111 | RETURN_NOT_OK(master_->catalog_manager_impl()->sys_catalog_->Upsert( |
8673 | 111 | master_->catalog_manager()->leader_ready_term(), table.get())); |
8674 | | |
8675 | | // Update the in-memory state. |
8676 | 111 | TRACE("Committing in-memory state"); |
8677 | 111 | l.Commit(); |
8678 | 111 | } |
8679 | 111 | txn_id = VERIFY_RESULT(FullyDecodeTransactionId(req->transaction().transaction_id())); |
8680 | 111 | } |
8681 | | |
8682 | 18.2k | for (const scoped_refptr<TabletInfo>& tablet : tablets) { |
8683 | 18.2k | auto call = std::make_shared<AsyncAlterTable>(master_, AsyncTaskPool(), tablet, table, txn_id); |
8684 | 18.2k | tablet->table()->AddTask(call); |
8685 | 18.2k | if (PREDICT_FALSE(FLAGS_TEST_slowdown_alter_table_rpcs_ms > 0)) { |
8686 | 0 | LOG(INFO) << "Sleeping for " << tablet->id() << " " |
8687 | 0 | << FLAGS_TEST_slowdown_alter_table_rpcs_ms |
8688 | 0 | << "ms before sending async alter table request"; |
8689 | 0 | SleepFor(MonoDelta::FromMilliseconds(FLAGS_TEST_slowdown_alter_table_rpcs_ms)); |
8690 | 0 | } |
8691 | 18.2k | RETURN_NOT_OK(ScheduleTask(call)); |
8692 | 18.2k | } |
8693 | 5.20k | return Status::OK(); |
8694 | 5.20k | } |
8695 | | |
8696 | | void CatalogManager::SendCopartitionTabletRequest(const scoped_refptr<TabletInfo>& tablet, |
8697 | 0 | const scoped_refptr<TableInfo>& table) { |
8698 | 0 | auto call = std::make_shared<AsyncCopartitionTable>(master_, AsyncTaskPool(), tablet, table); |
8699 | 0 | table->AddTask(call); |
8700 | 0 | WARN_NOT_OK(ScheduleTask(call), "Failed to send copartition table request"); |
8701 | 0 | } |
8702 | | |
8703 | | Status CatalogManager::SendSplitTabletRequest( |
8704 | | const scoped_refptr<TabletInfo>& tablet, std::array<TabletId, kNumSplitParts> new_tablet_ids, |
8705 | 43 | const std::string& split_encoded_key, const std::string& split_partition_key) { |
8706 | 0 | VLOG(2) << "Scheduling SplitTablet request to leader tserver for source tablet ID: " |
8707 | 0 | << tablet->tablet_id() << ", after-split tablet IDs: " << AsString(new_tablet_ids); |
8708 | 43 | auto call = std::make_shared<AsyncSplitTablet>( |
8709 | 43 | master_, AsyncTaskPool(), tablet, new_tablet_ids, split_encoded_key, split_partition_key, |
8710 | 43 | &tablet_split_manager_); |
8711 | 43 | tablet->table()->AddTask(call); |
8712 | 43 | return ScheduleTask(call); |
8713 | 43 | } |
8714 | | |
8715 | | void CatalogManager::DeleteTabletReplicas( |
8716 | 15.6k | TabletInfo* tablet, const std::string& msg, HideOnly hide_only) { |
8717 | 15.6k | auto locations = tablet->GetReplicaLocations(); |
8718 | 15.6k | LOG(INFO) << "Sending DeleteTablet for " << locations->size() |
8719 | 15.6k | << " replicas of tablet " << tablet->tablet_id(); |
8720 | 46.6k | for (const auto& r : *locations) { |
8721 | 46.6k | SendDeleteTabletRequest(tablet->tablet_id(), TABLET_DATA_DELETED, boost::none, tablet->table(), |
8722 | 46.6k | r.second.ts_desc, msg, hide_only); |
8723 | 46.6k | } |
8724 | 15.6k | } |
8725 | | |
8726 | 5.05k | Status CatalogManager::CheckIfForbiddenToDeleteTabletOf(const scoped_refptr<TableInfo>& table) { |
8727 | | // Do not delete the system catalog tablet. |
8728 | 5.05k | if (IsSystemTable(*table)) { |
8729 | 2.26k | return STATUS(InvalidArgument, "It is not allowed to delete system tables"); |
8730 | 2.26k | } |
8731 | | // Do not delete the tablet of a colocated table. |
8732 | 2.78k | if (table->IsColocatedUserTable()) { |
8733 | 27 | return STATUS(InvalidArgument, "It is not allowed to delete tablets of the colocated tables."); |
8734 | 27 | } |
8735 | 2.76k | return Status::OK(); |
8736 | 2.76k | } |
8737 | | |
8738 | | Status CatalogManager::DeleteTabletsAndSendRequests( |
8739 | 5.04k | const TableInfoPtr& table, const RepeatedBytes& retained_by_snapshot_schedules) { |
8740 | | // Silently fail if tablet deletion is forbidden so table deletion can continue executing. |
8741 | 5.04k | if (!CheckIfForbiddenToDeleteTabletOf(table).ok()) { |
8742 | 2.29k | return Status::OK(); |
8743 | 2.29k | } |
8744 | | |
8745 | 2.75k | auto tablets = table->GetTablets(IncludeInactive::kTrue); |
8746 | | |
8747 | 37.1k | std::sort(tablets.begin(), tablets.end(), [](const auto& lhs, const auto& rhs) { |
8748 | 37.1k | return lhs->tablet_id() < rhs->tablet_id(); |
8749 | 37.1k | }); |
8750 | | |
8751 | 2.75k | string deletion_msg = "Table deleted at " + LocalTimeAsString(); |
8752 | 2.75k | RETURN_NOT_OK(DeleteTabletListAndSendRequests( |
8753 | 2.75k | tablets, deletion_msg, retained_by_snapshot_schedules)); |
8754 | | |
8755 | 2.75k | if (table->IsColocatedParentTable()) { |
8756 | 2 | SharedLock lock(mutex_); |
8757 | 2 | colocated_tablet_ids_map_.erase(table->namespace_id()); |
8758 | 2.75k | } else if (table->IsTablegroupParentTable()) { |
8759 | | // In the case of dropped tablegroup parent table, need to delete tablegroup info. |
8760 | 2 | SharedLock lock(mutex_); |
8761 | 2 | tablegroup_ids_map_.erase(table->id().substr(0, 32)); |
8762 | 2 | } |
8763 | 2.75k | return Status::OK(); |
8764 | 2.75k | } |
8765 | | |
8766 | | Status CatalogManager::DeleteTabletListAndSendRequests( |
8767 | | const std::vector<scoped_refptr<TabletInfo>>& tablets, const std::string& deletion_msg, |
8768 | 2.76k | const google::protobuf::RepeatedPtrField<std::string>& retained_by_snapshot_schedules) { |
8769 | 2.76k | struct TabletData { |
8770 | 2.76k | TabletInfoPtr tablet; |
8771 | 2.76k | TabletInfo::WriteLock lock; |
8772 | 2.76k | HideOnly hide_only; |
8773 | 2.76k | }; |
8774 | 2.76k | std::vector<TabletData> tablets_data; |
8775 | 2.76k | tablets_data.reserve(tablets.size()); |
8776 | 2.76k | std::vector<TabletInfo*> tablet_infos; |
8777 | 2.76k | tablet_infos.reserve(tablets_data.size()); |
8778 | 2.76k | std::vector<TabletInfoPtr> marked_as_hidden; |
8779 | | |
8780 | | // Grab tablets and tablet write locks. The list should already be in tablet_id sorted order. |
8781 | 2.76k | { |
8782 | 2.76k | SharedLock read_lock(mutex_); |
8783 | 15.6k | for (const auto& tablet : tablets) { |
8784 | 15.6k | tablets_data.push_back(TabletData { |
8785 | 15.6k | .tablet = tablet, |
8786 | 15.6k | .lock = tablet->LockForWrite(), |
8787 | | // Hide tablet if it is retained by snapshot schedule, or is part of a cdc stream. |
8788 | 15.6k | .hide_only = HideOnly(!retained_by_snapshot_schedules.empty()), |
8789 | 15.6k | }); |
8790 | 15.6k | if (!tablets_data.back().hide_only) { |
8791 | | // Also check if this tablet is part of a cdc stream and is not already hidden. If this is |
8792 | | // a cdc stream producer and is already hidden, then we should delete this tablet. |
8793 | 15.6k | tablets_data.back().hide_only = HideOnly( |
8794 | 15.6k | IsTableCdcProducer(*tablet->table()) && !tablets_data.back().lock->ListedAsHidden()); |
8795 | 15.6k | } |
8796 | | |
8797 | 15.6k | tablet_infos.emplace_back(tablet.get()); |
8798 | 15.6k | } |
8799 | 2.76k | } |
8800 | | |
8801 | | // Use the same hybrid time for all hidden tablets. |
8802 | 2.76k | HybridTime hide_hybrid_time = master_->clock()->Now(); |
8803 | | |
8804 | | // Mark the tablets as deleted. |
8805 | 15.6k | for (auto& tablet_data : tablets_data) { |
8806 | 15.6k | auto& tablet = tablet_data.tablet; |
8807 | 15.6k | auto& tablet_lock = tablet_data.lock; |
8808 | | |
8809 | 15.6k | bool was_hidden = tablet_lock->ListedAsHidden(); |
8810 | | // Inactive tablet now, so remove it from partitions_. |
8811 | | // After all the tablets have been deleted from the tservers, we remove it from tablets_. |
8812 | 15.6k | tablet->table()->RemoveTablet(tablet->id(), DeactivateOnly::kTrue); |
8813 | | |
8814 | 15.6k | if (tablet_data.hide_only) { |
8815 | 0 | LOG(INFO) << "Hiding tablet " << tablet->tablet_id(); |
8816 | 0 | tablet_lock.mutable_data()->pb.set_hide_hybrid_time(hide_hybrid_time.ToUint64()); |
8817 | 0 | *tablet_lock.mutable_data()->pb.mutable_retained_by_snapshot_schedules() = |
8818 | 0 | retained_by_snapshot_schedules; |
8819 | 15.6k | } else { |
8820 | 15.6k | LOG(INFO) << "Deleting tablet " << tablet->tablet_id(); |
8821 | 15.6k | tablet_lock.mutable_data()->set_state(SysTabletsEntryPB::DELETED, deletion_msg); |
8822 | 15.6k | } |
8823 | 15.6k | if (tablet_lock->ListedAsHidden() && !was_hidden) { |
8824 | 0 | marked_as_hidden.push_back(tablet); |
8825 | 0 | } |
8826 | 15.6k | } |
8827 | | |
8828 | | // Update all the tablet states in raft in bulk. |
8829 | 2.76k | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), tablet_infos)); |
8830 | | |
8831 | | // Commit the change. |
8832 | 15.6k | for (auto& tablet_data : tablets_data) { |
8833 | 15.6k | auto& tablet = tablet_data.tablet; |
8834 | 15.6k | auto& tablet_lock = tablet_data.lock; |
8835 | | |
8836 | 15.6k | tablet_lock.Commit(); |
8837 | 15.6k | LOG(INFO) << (tablet_data.hide_only ? "Hid tablet " : "Deleted tablet ") << tablet->tablet_id(); |
8838 | | |
8839 | 15.6k | DeleteTabletReplicas(tablet.get(), deletion_msg, tablet_data.hide_only); |
8840 | 15.6k | } |
8841 | | |
8842 | 2.76k | if (!marked_as_hidden.empty()) { |
8843 | 0 | LockGuard lock(mutex_); |
8844 | 0 | hidden_tablets_.insert(hidden_tablets_.end(), marked_as_hidden.begin(), marked_as_hidden.end()); |
8845 | 0 | } |
8846 | | |
8847 | 2.76k | return Status::OK(); |
8848 | 2.76k | } |
8849 | | |
8850 | | void CatalogManager::SendDeleteTabletRequest( |
8851 | | const TabletId& tablet_id, |
8852 | | TabletDataState delete_type, |
8853 | | const boost::optional<int64_t>& cas_config_opid_index_less_or_equal, |
8854 | | const scoped_refptr<TableInfo>& table, |
8855 | | TSDescriptor* ts_desc, |
8856 | | const string& reason, |
8857 | 46.6k | bool hide_only) { |
8858 | 46.6k | if (PREDICT_FALSE(GetAtomicFlag(&FLAGS_TEST_disable_tablet_deletion))) { |
8859 | 0 | return; |
8860 | 0 | } |
8861 | 46.6k | LOG_WITH_PREFIX(INFO) |
8862 | 46.6k | << (hide_only ? "Hiding" : "Deleting") << " tablet " << tablet_id << " on peer " |
8863 | 46.6k | << ts_desc->permanent_uuid() << " with delete type " |
8864 | 46.6k | << TabletDataState_Name(delete_type) << " (" << reason << ")"; |
8865 | 46.6k | auto call = std::make_shared<AsyncDeleteReplica>(master_, AsyncTaskPool(), |
8866 | 46.6k | ts_desc->permanent_uuid(), table, tablet_id, delete_type, |
8867 | 46.6k | cas_config_opid_index_less_or_equal, reason); |
8868 | 46.6k | if (hide_only) { |
8869 | 0 | call->set_hide_only(hide_only); |
8870 | 0 | } |
8871 | 46.6k | if (table != nullptr) { |
8872 | 46.6k | table->AddTask(call); |
8873 | 46.6k | } |
8874 | | |
8875 | 46.6k | auto status = ScheduleTask(call); |
8876 | 46.6k | WARN_NOT_OK(status, Substitute("Failed to send delete request for tablet $0", tablet_id)); |
8877 | | // TODO(bogdan): does the pending delete semantics need to change? |
8878 | 46.6k | if (status.ok()) { |
8879 | 46.6k | ts_desc->AddPendingTabletDelete(tablet_id); |
8880 | 46.6k | } |
8881 | 46.6k | } |
8882 | | |
8883 | | void CatalogManager::SendLeaderStepDownRequest( |
8884 | | const scoped_refptr<TabletInfo>& tablet, const ConsensusStatePB& cstate, |
8885 | | const string& change_config_ts_uuid, bool should_remove, |
8886 | 5.81k | const string& new_leader_uuid) { |
8887 | 5.81k | auto task = std::make_shared<AsyncTryStepDown>( |
8888 | 5.81k | master_, AsyncTaskPool(), tablet, cstate, change_config_ts_uuid, should_remove, |
8889 | 5.81k | new_leader_uuid); |
8890 | 5.81k | tablet->table()->AddTask(task); |
8891 | 5.81k | Status status = ScheduleTask(task); |
8892 | 5.81k | WARN_NOT_OK(status, Substitute("Failed to send new $0 request", task->type_name())); |
8893 | 5.81k | } |
8894 | | |
8895 | | // TODO: refactor this into a joint method with the add one. |
8896 | | void CatalogManager::SendRemoveServerRequest( |
8897 | | const scoped_refptr<TabletInfo>& tablet, const ConsensusStatePB& cstate, |
8898 | 794 | const string& change_config_ts_uuid) { |
8899 | | // Check if the user wants the leader to be stepped down. |
8900 | 794 | auto task = std::make_shared<AsyncRemoveServerTask>( |
8901 | 794 | master_, AsyncTaskPool(), tablet, cstate, change_config_ts_uuid); |
8902 | 794 | tablet->table()->AddTask(task); |
8903 | 794 | WARN_NOT_OK(ScheduleTask(task), Substitute("Failed to send new $0 request", task->type_name())); |
8904 | 794 | } |
8905 | | |
8906 | | void CatalogManager::SendAddServerRequest( |
8907 | | const scoped_refptr<TabletInfo>& tablet, PeerMemberType member_type, |
8908 | 1.04k | const ConsensusStatePB& cstate, const string& change_config_ts_uuid) { |
8909 | 1.04k | auto task = std::make_shared<AsyncAddServerTask>(master_, AsyncTaskPool(), tablet, member_type, |
8910 | 1.04k | cstate, change_config_ts_uuid); |
8911 | 1.04k | tablet->table()->AddTask(task); |
8912 | 1.04k | WARN_NOT_OK( |
8913 | 1.04k | ScheduleTask(task), |
8914 | 1.04k | Substitute("Failed to send AddServer of tserver $0 to tablet $1", |
8915 | 1.04k | change_config_ts_uuid, tablet.get()->ToString())); |
8916 | 1.04k | } |
8917 | | |
8918 | | void CatalogManager::GetPendingServerTasksUnlocked( |
8919 | | const TableId &table_uuid, |
8920 | | TabletToTabletServerMap *add_replica_tasks_map, |
8921 | | TabletToTabletServerMap *remove_replica_tasks_map, |
8922 | 121k | TabletToTabletServerMap *stepdown_leader_tasks_map) { |
8923 | | |
8924 | 121k | auto table = GetTableInfoUnlocked(table_uuid); |
8925 | 111k | for (const auto& task : table->GetTasks()) { |
8926 | 111k | TabletToTabletServerMap* outputMap = nullptr; |
8927 | 111k | if (task->type() == MonitoredTask::ASYNC_ADD_SERVER) { |
8928 | 173 | outputMap = add_replica_tasks_map; |
8929 | 110k | } else if (task->type() == MonitoredTask::ASYNC_REMOVE_SERVER) { |
8930 | 343 | outputMap = remove_replica_tasks_map; |
8931 | 110k | } else if (task->type() == MonitoredTask::ASYNC_TRY_STEP_DOWN) { |
8932 | | // Store new_leader_uuid instead of change_config_ts_uuid. |
8933 | 96 | auto raft_task = static_cast<AsyncTryStepDown*>(task.get()); |
8934 | 96 | (*stepdown_leader_tasks_map)[raft_task->tablet_id()] = raft_task->new_leader_uuid(); |
8935 | 96 | continue; |
8936 | 96 | } |
8937 | 110k | if (outputMap) { |
8938 | 516 | auto raft_task = static_cast<CommonInfoForRaftTask*>(task.get()); |
8939 | 516 | (*outputMap)[raft_task->tablet_id()] = raft_task->change_config_ts_uuid(); |
8940 | 516 | } |
8941 | 110k | } |
8942 | 121k | } |
8943 | | |
8944 | | void CatalogManager::ExtractTabletsToProcess( |
8945 | | TabletInfos *tablets_to_delete, |
8946 | 90.0k | TableToTabletInfos *tablets_to_process) { |
8947 | 90.0k | SharedLock lock(mutex_); |
8948 | | |
8949 | | // TODO: At the moment we loop through all the tablets |
8950 | | // we can keep a set of tablets waiting for "assignment" |
8951 | | // or just a counter to avoid to take the lock and loop through the tablets |
8952 | | // if everything is "stable". |
8953 | | |
8954 | 3.61M | for (const TabletInfoMap::value_type& entry : *tablet_map_) { |
8955 | 3.61M | scoped_refptr<TabletInfo> tablet = entry.second; |
8956 | 3.61M | auto table = tablet->table(); |
8957 | 3.61M | if (!table) { |
8958 | | // Tablet is orphaned or in preparing state, continue. |
8959 | 0 | continue; |
8960 | 0 | } |
8961 | | |
8962 | | // acquire table lock before tablets. |
8963 | 3.61M | auto table_lock = table->LockForRead(); |
8964 | 3.61M | auto tablet_lock = tablet->LockForRead(); |
8965 | | |
8966 | | // If the table is deleted or the tablet was replaced at table creation time. |
8967 | 3.61M | if (tablet_lock->is_deleted() || table_lock->started_deleting()) { |
8968 | | // Process this table deletion only once (tombstones for table may remain longer). |
8969 | 793k | if (table_ids_map_->find(tablet->table()->id()) != table_ids_map_->end()) { |
8970 | 793k | tablets_to_delete->push_back(tablet); |
8971 | 793k | } |
8972 | | // Don't process deleted tables regardless. |
8973 | 793k | continue; |
8974 | 793k | } |
8975 | | |
8976 | | // Running tablets. |
8977 | 2.82M | if (tablet_lock->is_running()) { |
8978 | | // TODO: handle last update > not responding timeout? |
8979 | 2.76M | continue; |
8980 | 2.76M | } |
8981 | | |
8982 | | // Tablets not yet assigned or with a report just received. |
8983 | 55.6k | (*tablets_to_process)[tablet->table()->id()].push_back(tablet); |
8984 | 55.6k | } |
8985 | 90.0k | } |
8986 | | |
8987 | 72.7k | bool CatalogManager::AreTablesDeleting() { |
8988 | 72.7k | SharedLock lock(mutex_); |
8989 | | |
8990 | 8.20M | for (const TableInfoMap::value_type& entry : *table_ids_map_) { |
8991 | 8.20M | scoped_refptr<TableInfo> table(entry.second); |
8992 | 8.20M | auto table_lock = table->LockForRead(); |
8993 | | // TODO(jason): possibly change this to started_deleting when we begin removing DELETED tables |
8994 | | // from table_ids_map_ (see CleanUpDeletedTables). |
8995 | 8.20M | if (table_lock->is_deleting()) { |
8996 | 17 | return true; |
8997 | 17 | } |
8998 | 8.20M | } |
8999 | 72.7k | return false; |
9000 | 72.7k | } |
9001 | | |
9002 | | struct DeferredAssignmentActions { |
9003 | | std::vector<TabletInfo*> modified_tablets; |
9004 | | std::vector<TabletInfo*> needs_create_rpc; |
9005 | | }; |
9006 | | |
9007 | | void CatalogManager::HandleAssignPreparingTablet(TabletInfo* tablet, |
9008 | 28.3k | DeferredAssignmentActions* deferred) { |
9009 | | // The tablet was just created (probably by a CreateTable RPC). |
9010 | | // Update the state to "creating" to be ready for the creation request. |
9011 | 28.3k | tablet->mutable_metadata()->mutable_dirty()->set_state( |
9012 | 28.3k | SysTabletsEntryPB::CREATING, "Sending initial creation of tablet"); |
9013 | 28.3k | deferred->modified_tablets.push_back(tablet); |
9014 | 28.3k | deferred->needs_create_rpc.push_back(tablet); |
9015 | 0 | VLOG(1) << "Assign new tablet " << tablet->ToString(); |
9016 | 28.3k | } |
9017 | | |
9018 | | void CatalogManager::HandleAssignCreatingTablet(TabletInfo* tablet, |
9019 | | DeferredAssignmentActions* deferred, |
9020 | 27.2k | vector<scoped_refptr<TabletInfo>>* new_tablets) { |
9021 | 27.2k | MonoDelta time_since_updated = |
9022 | 27.2k | MonoTime::Now().GetDeltaSince(tablet->last_update_time()); |
9023 | 27.2k | int64_t remaining_timeout_ms = |
9024 | 27.2k | FLAGS_tablet_creation_timeout_ms - time_since_updated.ToMilliseconds(); |
9025 | | |
9026 | 27.2k | if (tablet->LockForRead()->pb.has_split_parent_tablet_id()) { |
9027 | | // No need to recreate post-split tablets, since this is always done on source tablet replicas. |
9028 | 0 | VLOG(2) << "Post-split tablet " << AsString(tablet) << " still being created."; |
9029 | 118 | return; |
9030 | 118 | } |
9031 | | // Skip the tablet if the assignment timeout is not yet expired. |
9032 | 27.1k | if (remaining_timeout_ms > 0) { |
9033 | 0 | VLOG(2) << "Tablet " << tablet->ToString() << " still being created. " |
9034 | 0 | << remaining_timeout_ms << "ms remain until timeout."; |
9035 | 27.1k | return; |
9036 | 27.1k | } |
9037 | | |
9038 | 13 | const PersistentTabletInfo& old_info = tablet->metadata().state(); |
9039 | | |
9040 | | // The "tablet creation" was already sent, but we didn't receive an answer |
9041 | | // within the timeout. So the tablet will be replaced by a new one. |
9042 | 13 | TabletInfoPtr replacement; |
9043 | 13 | { |
9044 | 13 | LockGuard lock(mutex_); |
9045 | 13 | replacement = CreateTabletInfo(tablet->table().get(), old_info.pb.partition()); |
9046 | 13 | } |
9047 | 13 | LOG(WARNING) << "Tablet " << tablet->ToString() << " was not created within " |
9048 | 13 | << "the allowed timeout. Replacing with a new tablet " |
9049 | 13 | << replacement->tablet_id(); |
9050 | | |
9051 | 13 | tablet->table()->ReplaceTablet(tablet, replacement); |
9052 | 13 | { |
9053 | 13 | LockGuard lock(mutex_); |
9054 | 13 | auto tablet_map_checkout = tablet_map_.CheckOut(); |
9055 | 13 | (*tablet_map_checkout)[replacement->tablet_id()] = replacement; |
9056 | 13 | } |
9057 | | |
9058 | | // Mark old tablet as replaced. |
9059 | 13 | tablet->mutable_metadata()->mutable_dirty()->set_state( |
9060 | 13 | SysTabletsEntryPB::REPLACED, |
9061 | 13 | Substitute("Replaced by $0 at $1", |
9062 | 13 | replacement->tablet_id(), LocalTimeAsString())); |
9063 | | |
9064 | | // Mark new tablet as being created. |
9065 | 13 | replacement->mutable_metadata()->mutable_dirty()->set_state( |
9066 | 13 | SysTabletsEntryPB::CREATING, |
9067 | 13 | Substitute("Replacement for $0", tablet->tablet_id())); |
9068 | | |
9069 | 13 | deferred->modified_tablets.push_back(tablet); |
9070 | 13 | deferred->modified_tablets.push_back(replacement.get()); |
9071 | 13 | deferred->needs_create_rpc.push_back(replacement.get()); |
9072 | 0 | VLOG(1) << "Replaced tablet " << tablet->tablet_id() |
9073 | 0 | << " with " << replacement->tablet_id() |
9074 | 0 | << " (table " << tablet->table()->ToString() << ")"; |
9075 | | |
9076 | 13 | new_tablets->push_back(replacement); |
9077 | 13 | } |
9078 | | |
9079 | | // TODO: we could batch the IO onto a background thread. |
9080 | | Status CatalogManager::HandleTabletSchemaVersionReport( |
9081 | 57.0k | TabletInfo *tablet, uint32_t version, const scoped_refptr<TableInfo>& table_info) { |
9082 | 57.0k | scoped_refptr<TableInfo> table; |
9083 | 57.0k | if (table_info) { |
9084 | 20.5k | table = table_info; |
9085 | 36.4k | } else { |
9086 | 36.4k | table = tablet->table(); |
9087 | 36.4k | } |
9088 | | |
9089 | | // Update the schema version if it's the latest. |
9090 | 57.0k | tablet->set_reported_schema_version(table->id(), version); |
9091 | 110 | VLOG_WITH_PREFIX_AND_FUNC(1) |
9092 | 110 | << "Tablet " << tablet->tablet_id() << " reported version " << version; |
9093 | | |
9094 | | // Verify if it's the last tablet report, and the alter completed. |
9095 | 57.0k | { |
9096 | 57.0k | auto l = table->LockForRead(); |
9097 | 57.0k | if (l->pb.state() != SysTablesEntryPB::ALTERING) { |
9098 | 18.4E | VLOG_WITH_PREFIX_AND_FUNC(2) << "Table " << table->ToString() << " is not altering"; |
9099 | 38.7k | return Status::OK(); |
9100 | 38.7k | } |
9101 | | |
9102 | 18.2k | uint32_t current_version = l->pb.version(); |
9103 | 18.2k | if (table->IsAlterInProgress(current_version)) { |
9104 | 0 | VLOG_WITH_PREFIX_AND_FUNC(2) << "Table " << table->ToString() << " has IsAlterInProgress (" |
9105 | 0 | << current_version << ")"; |
9106 | 12.7k | return Status::OK(); |
9107 | 12.7k | } |
9108 | 5.53k | } |
9109 | | |
9110 | 5.53k | return MultiStageAlterTable::LaunchNextTableInfoVersionIfNecessary(this, table, version); |
9111 | 5.53k | } |
9112 | | |
9113 | | Status CatalogManager::ProcessPendingAssignmentsPerTable( |
9114 | 12.6k | const TableId& table_id, const TabletInfos& tablets, CMGlobalLoadState* global_load_state) { |
9115 | 0 | VLOG(1) << "Processing pending assignments"; |
9116 | | |
9117 | 12.6k | TSDescriptorVector ts_descs = GetAllLiveNotBlacklistedTServers(); |
9118 | | |
9119 | | // Initialize this table load state. |
9120 | 12.6k | CMPerTableLoadState table_load_state(global_load_state); |
9121 | 12.6k | InitializeTableLoadState(table_id, ts_descs, &table_load_state); |
9122 | 12.6k | table_load_state.SortLoad(); |
9123 | | |
9124 | | // Take write locks on all tablets to be processed, and ensure that they are |
9125 | | // unlocked at the end of this scope. |
9126 | 55.6k | for (const scoped_refptr<TabletInfo>& tablet : tablets) { |
9127 | 55.6k | tablet->mutable_metadata()->StartMutation(); |
9128 | 55.6k | } |
9129 | 12.6k | ScopedInfoCommitter<TabletInfo> unlocker_in(&tablets); |
9130 | | |
9131 | | // Any tablets created by the helper functions will also be created in a |
9132 | | // locked state, so we must ensure they are unlocked before we return to |
9133 | | // avoid deadlocks. |
9134 | 12.6k | TabletInfos new_tablets; |
9135 | 12.6k | ScopedInfoCommitter<TabletInfo> unlocker_out(&new_tablets); |
9136 | | |
9137 | 12.6k | DeferredAssignmentActions deferred; |
9138 | | |
9139 | | // Iterate over each of the tablets and handle it, whatever state |
9140 | | // it may be in. The actions required for the tablet are collected |
9141 | | // into 'deferred'. |
9142 | 55.6k | for (const scoped_refptr<TabletInfo>& tablet : tablets) { |
9143 | 55.6k | SysTabletsEntryPB::State t_state = tablet->metadata().state().pb.state(); |
9144 | | |
9145 | 55.6k | switch (t_state) { |
9146 | 28.3k | case SysTabletsEntryPB::PREPARING: |
9147 | 28.3k | HandleAssignPreparingTablet(tablet.get(), &deferred); |
9148 | 28.3k | break; |
9149 | | |
9150 | 27.2k | case SysTabletsEntryPB::CREATING: |
9151 | 27.2k | HandleAssignCreatingTablet(tablet.get(), &deferred, &new_tablets); |
9152 | 27.2k | break; |
9153 | | |
9154 | 50 | default: |
9155 | 0 | VLOG(2) << "Nothing to do for tablet " << tablet->tablet_id() << ": state = " |
9156 | 0 | << SysTabletsEntryPB_State_Name(t_state); |
9157 | 50 | break; |
9158 | 55.6k | } |
9159 | 55.6k | } |
9160 | | |
9161 | | // Nothing to do. |
9162 | 12.6k | if (deferred.modified_tablets.empty() && |
9163 | 8.68k | deferred.needs_create_rpc.empty()) { |
9164 | 8.68k | return Status::OK(); |
9165 | 8.68k | } |
9166 | | |
9167 | | // For those tablets which need to be created in this round, assign replicas. |
9168 | 3.98k | Status s; |
9169 | 3.98k | std::unordered_set<TableInfo*> ok_status_tables; |
9170 | 28.3k | for (TabletInfo *tablet : deferred.needs_create_rpc) { |
9171 | | // NOTE: if we fail to select replicas on the first pass (due to |
9172 | | // insufficient Tablet Servers being online), we will still try |
9173 | | // again unless the tablet/table creation is cancelled. |
9174 | 28.3k | LOG(INFO) << "Selecting replicas for tablet " << tablet->id(); |
9175 | 28.3k | s = SelectReplicasForTablet(ts_descs, tablet, &table_load_state, global_load_state); |
9176 | 28.3k | if (!s.ok()) { |
9177 | 0 | s = s.CloneAndPrepend(Substitute( |
9178 | 0 | "An error occurred while selecting replicas for tablet $0: $1", |
9179 | 0 | tablet->tablet_id(), s.ToString())); |
9180 | 0 | tablet->table()->SetCreateTableErrorStatus(s); |
9181 | 0 | break; |
9182 | 28.3k | } else { |
9183 | 28.3k | ok_status_tables.emplace(tablet->table().get()); |
9184 | 28.3k | } |
9185 | 28.3k | } |
9186 | | |
9187 | | // Update the sys catalog with the new set of tablets/metadata. |
9188 | 3.98k | if (s.ok()) { |
9189 | | // If any of the ok_status_tables had an error in the previous iterations, we |
9190 | | // need to clear up the error status to reflect that all the create tablets have now |
9191 | | // succeded. |
9192 | 3.98k | for (TableInfo* table : ok_status_tables) { |
9193 | 3.98k | table->SetCreateTableErrorStatus(Status::OK()); |
9194 | 3.98k | } |
9195 | | |
9196 | 3.98k | s = sys_catalog_->Upsert(leader_ready_term(), deferred.modified_tablets); |
9197 | 3.98k | if (!s.ok()) { |
9198 | 2 | s = s.CloneAndPrepend("An error occurred while persisting the updated tablet metadata"); |
9199 | 2 | } |
9200 | 3.98k | } |
9201 | | |
9202 | 3.98k | if (!s.ok()) { |
9203 | 2 | LOG(WARNING) << "Aborting the current task due to error: " << s.ToString(); |
9204 | | // If there was an error, abort any mutations started by the current task. |
9205 | | // NOTE: Lock order should be lock_ -> table -> tablet. |
9206 | | // We currently have a bunch of tablets locked and need to unlock first to ensure this holds. |
9207 | | |
9208 | 0 | std::sort(new_tablets.begin(), new_tablets.end(), [](const auto& lhs, const auto& rhs) { |
9209 | 0 | return lhs->table().get() < rhs->table().get(); |
9210 | 0 | }); |
9211 | 2 | { |
9212 | 2 | std::string current_table_name; |
9213 | 2 | TableInfoPtr current_table; |
9214 | 0 | for (auto& tablet_to_remove : new_tablets) { |
9215 | 0 | if (tablet_to_remove->table()->RemoveTablet(tablet_to_remove->tablet_id())) { |
9216 | 0 | if (VLOG_IS_ON(1)) { |
9217 | 0 | if (current_table != tablet_to_remove->table()) { |
9218 | 0 | current_table = tablet_to_remove->table(); |
9219 | 0 | current_table_name = current_table->name(); |
9220 | 0 | } |
9221 | 0 | LOG(INFO) << "Removed tablet " << tablet_to_remove->tablet_id() << " from table " |
9222 | 0 | << current_table_name; |
9223 | 0 | } |
9224 | 0 | } |
9225 | 0 | } |
9226 | 2 | } |
9227 | | |
9228 | 2 | unlocker_out.Abort(); // tablet.unlock |
9229 | 2 | unlocker_in.Abort(); |
9230 | | |
9231 | 2 | { |
9232 | 2 | LockGuard lock(mutex_); // lock_.lock |
9233 | 2 | auto tablet_map_checkout = tablet_map_.CheckOut(); |
9234 | 0 | for (auto& tablet_to_remove : new_tablets) { |
9235 | | // Potential race condition above, but it's okay if a background thread deleted this. |
9236 | 0 | tablet_map_checkout->erase(tablet_to_remove->tablet_id()); |
9237 | 0 | } |
9238 | 2 | } |
9239 | 2 | return s; |
9240 | 2 | } |
9241 | | |
9242 | | // Send DeleteTablet requests to tablet servers serving deleted tablets. |
9243 | | // This is asynchronous / non-blocking. |
9244 | 28.3k | for (auto* tablet : deferred.modified_tablets) { |
9245 | 28.3k | if (tablet->metadata().dirty().is_deleted()) { |
9246 | | // Actual delete, because we delete tablet replica. |
9247 | 13 | DeleteTabletReplicas(tablet, tablet->metadata().dirty().pb.state_msg(), HideOnly::kFalse); |
9248 | 13 | } |
9249 | 28.3k | } |
9250 | | // Send the CreateTablet() requests to the servers. This is asynchronous / non-blocking. |
9251 | 3.98k | return SendCreateTabletRequests(deferred.needs_create_rpc); |
9252 | 3.98k | } |
9253 | | |
9254 | | Status CatalogManager::SelectReplicasForTablet( |
9255 | | const TSDescriptorVector& ts_descs, TabletInfo* tablet, |
9256 | 28.3k | CMPerTableLoadState* per_table_state, CMGlobalLoadState* global_state) { |
9257 | 28.3k | auto table_guard = tablet->table()->LockForRead(); |
9258 | | |
9259 | 28.3k | if (!table_guard->pb.IsInitialized()) { |
9260 | 0 | return STATUS_SUBSTITUTE(InvalidArgument, |
9261 | 0 | "TableInfo for tablet $0 is not initialized (aborted CreateTable attempt?)", |
9262 | 0 | tablet->tablet_id()); |
9263 | 0 | } |
9264 | | |
9265 | 28.3k | const auto& replication_info = |
9266 | 28.3k | VERIFY_RESULT(GetTableReplicationInfo(table_guard->pb.replication_info(), |
9267 | 28.3k | tablet->table()->TablespaceIdForTableCreation())); |
9268 | | |
9269 | | // Select the set of replicas for the tablet. |
9270 | 28.3k | ConsensusStatePB* cstate = tablet->mutable_metadata()->mutable_dirty() |
9271 | 28.3k | ->pb.mutable_committed_consensus_state(); |
9272 | 0 | VLOG_WITH_FUNC(3) << "Committed consensus state: " << AsString(cstate); |
9273 | 28.3k | cstate->set_current_term(kMinimumTerm); |
9274 | 28.3k | consensus::RaftConfigPB *config = cstate->mutable_config(); |
9275 | 28.3k | config->set_opid_index(consensus::kInvalidOpIdIndex); |
9276 | | |
9277 | 28.3k | Status s = HandlePlacementUsingReplicationInfo( |
9278 | 28.3k | replication_info, ts_descs, config, per_table_state, global_state); |
9279 | 28.3k | if (!s.ok()) { |
9280 | 0 | return s; |
9281 | 0 | } |
9282 | | |
9283 | 28.3k | std::ostringstream out; |
9284 | 28.3k | out << "Initial tserver uuids for tablet " << tablet->tablet_id() << ": "; |
9285 | 82.1k | for (const RaftPeerPB& peer : config->peers()) { |
9286 | 82.1k | out << peer.permanent_uuid() << " "; |
9287 | 82.1k | } |
9288 | | |
9289 | 28.3k | if (VLOG_IS_ON(0)) { |
9290 | 28.3k | out.str(); |
9291 | 28.3k | } |
9292 | | |
9293 | 0 | VLOG_WITH_FUNC(3) << "Committed consensus state has been updated to: " << AsString(cstate); |
9294 | | |
9295 | 28.3k | return Status::OK(); |
9296 | 28.3k | } |
9297 | | |
9298 | | void CatalogManager::GetTsDescsFromPlacementInfo(const PlacementInfoPB& placement_info, |
9299 | | const TSDescriptorVector& all_ts_descs, |
9300 | 60.7k | TSDescriptorVector* ts_descs) { |
9301 | 60.7k | ts_descs->clear(); |
9302 | 177k | for (const auto& ts_desc : all_ts_descs) { |
9303 | 177k | if (placement_info.has_placement_uuid()) { |
9304 | 3.01k | string placement_uuid = placement_info.placement_uuid(); |
9305 | 3.01k | if (ts_desc->placement_uuid() == placement_uuid) { |
9306 | 1.89k | ts_descs->push_back(ts_desc); |
9307 | 1.89k | } |
9308 | 174k | } else if (ts_desc->placement_uuid() == "") { |
9309 | | // Since the placement info has no placement id, we know it is live, so we add this ts. |
9310 | 174k | ts_descs->push_back(ts_desc); |
9311 | 174k | } |
9312 | 177k | } |
9313 | 60.7k | } |
9314 | | |
9315 | | Status CatalogManager::HandlePlacementUsingReplicationInfo( |
9316 | | const ReplicationInfoPB& replication_info, |
9317 | | const TSDescriptorVector& all_ts_descs, |
9318 | | consensus::RaftConfigPB* config, |
9319 | | CMPerTableLoadState* per_table_state, |
9320 | 28.3k | CMGlobalLoadState* global_state) { |
9321 | | // Validate if we have enough tservers to put the replicas. |
9322 | 28.3k | ValidateReplicationInfoRequestPB req; |
9323 | 28.3k | req.mutable_replication_info()->CopyFrom(replication_info); |
9324 | 28.3k | ValidateReplicationInfoResponsePB resp; |
9325 | 28.3k | RETURN_NOT_OK(ValidateReplicationInfo(&req, &resp)); |
9326 | | |
9327 | 28.3k | TSDescriptorVector ts_descs; |
9328 | 28.3k | GetTsDescsFromPlacementInfo(replication_info.live_replicas(), all_ts_descs, &ts_descs); |
9329 | 28.3k | RETURN_NOT_OK(HandlePlacementUsingPlacementInfo( |
9330 | 28.3k | replication_info.live_replicas(), ts_descs, PeerMemberType::VOTER, |
9331 | 28.3k | config, per_table_state, global_state)); |
9332 | 28.4k | for (int i = 0; i < replication_info.read_replicas_size(); i++) { |
9333 | 92 | GetTsDescsFromPlacementInfo(replication_info.read_replicas(i), all_ts_descs, &ts_descs); |
9334 | 92 | RETURN_NOT_OK(HandlePlacementUsingPlacementInfo( |
9335 | 92 | replication_info.read_replicas(i), ts_descs, PeerMemberType::OBSERVER, |
9336 | 92 | config, per_table_state, global_state)); |
9337 | 92 | } |
9338 | 28.3k | return Status::OK(); |
9339 | 28.3k | } |
9340 | | |
9341 | | Status CatalogManager::HandlePlacementUsingPlacementInfo(const PlacementInfoPB& placement_info, |
9342 | | const TSDescriptorVector& ts_descs, |
9343 | | PeerMemberType member_type, |
9344 | | consensus::RaftConfigPB* config, |
9345 | | CMPerTableLoadState* per_table_state, |
9346 | 28.4k | CMGlobalLoadState* global_state) { |
9347 | 28.4k | size_t nreplicas = GetNumReplicasFromPlacementInfo(placement_info); |
9348 | 28.4k | size_t ntservers = ts_descs.size(); |
9349 | | // Keep track of servers we've already selected, so that we don't attempt to |
9350 | | // put two replicas on the same host. |
9351 | 28.4k | set<TabletServerId> already_selected_ts; |
9352 | 28.4k | if (placement_info.placement_blocks().empty()) { |
9353 | | // If we don't have placement info, just place the replicas as before, distributed across the |
9354 | | // whole cluster. |
9355 | | // We cannot put more than ntservers replicas. |
9356 | 28.2k | nreplicas = min(nreplicas, ntservers); |
9357 | 28.2k | SelectReplicas(ts_descs, nreplicas, config, &already_selected_ts, member_type, |
9358 | 28.2k | per_table_state, global_state); |
9359 | 132 | } else { |
9360 | | // TODO(bogdan): move to separate function |
9361 | | // |
9362 | | // If we do have placement info, we'll try to use the same power of two algorithm, but also |
9363 | | // match the requested policies. We'll assign the minimum requested replicas in each combination |
9364 | | // of cloud.region.zone and then if we still have leftover replicas, we'll assign those |
9365 | | // in any of the allowed areas. |
9366 | 132 | auto all_allowed_ts = VERIFY_RESULT(FindTServersForPlacementInfo(placement_info, ts_descs)); |
9367 | | |
9368 | | // Loop through placements and assign to respective available TSs. |
9369 | 132 | size_t min_replica_count_sum = 0; |
9370 | 252 | for (const auto& pb : placement_info.placement_blocks()) { |
9371 | | // This works because currently we don't allow placement blocks to overlap. |
9372 | 252 | auto available_ts_descs = VERIFY_RESULT(FindTServersForPlacementBlock(pb, ts_descs)); |
9373 | 252 | size_t available_ts_descs_size = available_ts_descs.size(); |
9374 | 252 | size_t min_num_replicas = pb.min_num_replicas(); |
9375 | | // We cannot put more than the available tablet servers in that placement block. |
9376 | 252 | size_t num_replicas = min(min_num_replicas, available_ts_descs_size); |
9377 | 252 | min_replica_count_sum += min_num_replicas; |
9378 | 252 | SelectReplicas(available_ts_descs, num_replicas, config, &already_selected_ts, member_type, |
9379 | 252 | per_table_state, global_state); |
9380 | 252 | } |
9381 | | |
9382 | 132 | size_t replicas_left = nreplicas - min_replica_count_sum; |
9383 | 132 | size_t max_tservers_left = all_allowed_ts.size() - already_selected_ts.size(); |
9384 | | // Upper bounded by the tservers left. |
9385 | 132 | replicas_left = min(replicas_left, max_tservers_left); |
9386 | 132 | DCHECK_GE(replicas_left, 0); |
9387 | 132 | if (replicas_left > 0) { |
9388 | | // No need to do an extra check here, as we checked early if we have enough to cover all |
9389 | | // requested placements and checked individually per placement info, if we could cover the |
9390 | | // minimums. |
9391 | 12 | SelectReplicas(all_allowed_ts, replicas_left, config, &already_selected_ts, member_type, |
9392 | 12 | per_table_state, global_state); |
9393 | 12 | } |
9394 | 132 | } |
9395 | 28.4k | return Status::OK(); |
9396 | 28.4k | } |
9397 | | |
9398 | | Result<vector<shared_ptr<TSDescriptor>>> CatalogManager::FindTServersForPlacementInfo( |
9399 | | const PlacementInfoPB& placement_info, |
9400 | 237 | const TSDescriptorVector& ts_descs) const { |
9401 | | |
9402 | 237 | vector<shared_ptr<TSDescriptor>> all_allowed_ts; |
9403 | 1.00k | for (const auto& ts : ts_descs) { |
9404 | 1.65k | for (const auto& pb : placement_info.placement_blocks()) { |
9405 | 1.65k | if (ts->MatchesCloudInfo(pb.cloud_info())) { |
9406 | 1.00k | all_allowed_ts.push_back(ts); |
9407 | 1.00k | break; |
9408 | 1.00k | } |
9409 | 1.65k | } |
9410 | 1.00k | } |
9411 | | |
9412 | 237 | return all_allowed_ts; |
9413 | 237 | } |
9414 | | |
9415 | | Result<vector<shared_ptr<TSDescriptor>>> CatalogManager::FindTServersForPlacementBlock( |
9416 | | const PlacementBlockPB& placement_block, |
9417 | 491 | const TSDescriptorVector& ts_descs) { |
9418 | | |
9419 | 491 | vector<shared_ptr<TSDescriptor>> allowed_ts; |
9420 | 491 | const auto& cloud_info = placement_block.cloud_info(); |
9421 | 2.20k | for (const auto& ts : ts_descs) { |
9422 | 2.20k | if (ts->MatchesCloudInfo(cloud_info)) { |
9423 | 1.00k | allowed_ts.push_back(ts); |
9424 | 1.00k | } |
9425 | 2.20k | } |
9426 | | |
9427 | 491 | return allowed_ts; |
9428 | 491 | } |
9429 | | |
9430 | 3.97k | Status CatalogManager::SendCreateTabletRequests(const vector<TabletInfo*>& tablets) { |
9431 | 3.97k | auto schedules_to_tablets_map = VERIFY_RESULT(MakeSnapshotSchedulesToObjectIdsMap( |
9432 | 3.97k | SysRowEntryType::TABLET)); |
9433 | 28.3k | for (TabletInfo *tablet : tablets) { |
9434 | 28.3k | const consensus::RaftConfigPB& config = |
9435 | 28.3k | tablet->metadata().dirty().pb.committed_consensus_state().config(); |
9436 | 28.3k | tablet->set_last_update_time(MonoTime::Now()); |
9437 | 28.3k | std::vector<SnapshotScheduleId> schedules; |
9438 | 0 | for (const auto& pair : schedules_to_tablets_map) { |
9439 | 0 | if (std::binary_search(pair.second.begin(), pair.second.end(), tablet->id())) { |
9440 | 0 | schedules.push_back(pair.first); |
9441 | 0 | } |
9442 | 0 | } |
9443 | 82.1k | for (const RaftPeerPB& peer : config.peers()) { |
9444 | 82.1k | auto task = std::make_shared<AsyncCreateReplica>(master_, AsyncTaskPool(), |
9445 | 82.1k | peer.permanent_uuid(), tablet, schedules); |
9446 | 82.1k | tablet->table()->AddTask(task); |
9447 | 82.1k | WARN_NOT_OK(ScheduleTask(task), "Failed to send new tablet request"); |
9448 | 82.1k | } |
9449 | 28.3k | } |
9450 | | |
9451 | 3.97k | return Status::OK(); |
9452 | 3.97k | } |
9453 | | |
9454 | | // If responses have been received from sufficient replicas (including hinted leader), |
9455 | | // pick proposed leader and start election. |
9456 | | void CatalogManager::StartElectionIfReady( |
9457 | 91.8k | const consensus::ConsensusStatePB& cstate, TabletInfo* tablet) { |
9458 | 91.8k | auto replicas = tablet->GetReplicaLocations(); |
9459 | 91.8k | int num_voters = 0; |
9460 | 274k | for (const auto& peer : cstate.config().peers()) { |
9461 | 274k | if (peer.member_type() == PeerMemberType::VOTER) { |
9462 | 273k | ++num_voters; |
9463 | 273k | } |
9464 | 274k | } |
9465 | 91.8k | int majority_size = num_voters / 2 + 1; |
9466 | 91.8k | int running_voters = 0; |
9467 | 274k | for (const auto& replica : *replicas) { |
9468 | 274k | if (replica.second.member_type == PeerMemberType::VOTER) { |
9469 | 273k | ++running_voters; |
9470 | 273k | } |
9471 | 274k | } |
9472 | | |
9473 | 1 | VLOG_WITH_PREFIX(4) |
9474 | 1 | << __func__ << ": T " << tablet->tablet_id() << ": " << AsString(*replicas) << ", voters: " |
9475 | 1 | << running_voters << "/" << majority_size; |
9476 | | |
9477 | 91.8k | if (running_voters < majority_size) { |
9478 | 0 | VLOG_WITH_PREFIX(4) << __func__ << ": Not enough voters"; |
9479 | 0 | return; |
9480 | 0 | } |
9481 | | |
9482 | 91.8k | ReplicationInfoPB replication_info; |
9483 | 91.8k | { |
9484 | 91.8k | auto l = cluster_config_->LockForRead(); |
9485 | 91.8k | replication_info = l->pb.replication_info(); |
9486 | 91.8k | } |
9487 | | |
9488 | | // Find tservers that can be leaders for a tablet. |
9489 | 91.8k | TSDescriptorVector ts_descs = GetAllLiveNotBlacklistedTServers(); |
9490 | | |
9491 | 91.8k | std::vector<std::string> possible_leaders; |
9492 | 274k | for (const auto& replica : *replicas) { |
9493 | 555k | for (const auto& ts_desc : ts_descs) { |
9494 | 555k | if (ts_desc->permanent_uuid() == replica.first) { |
9495 | 274k | if (ts_desc->IsAcceptingLeaderLoad(replication_info)) { |
9496 | 272k | possible_leaders.push_back(replica.first); |
9497 | 272k | } |
9498 | 274k | break; |
9499 | 274k | } |
9500 | 555k | } |
9501 | 274k | } |
9502 | | |
9503 | 91.8k | if (FLAGS_TEST_create_table_leader_hint_min_lexicographic) { |
9504 | 6 | std::string min_lexicographic; |
9505 | 18 | for (const auto& peer : cstate.config().peers()) { |
9506 | 18 | if (peer.member_type() == PeerMemberType::VOTER) { |
9507 | 18 | if (min_lexicographic.empty() || peer.permanent_uuid() < min_lexicographic) { |
9508 | 6 | min_lexicographic = peer.permanent_uuid(); |
9509 | 6 | } |
9510 | 18 | } |
9511 | 18 | } |
9512 | 6 | if (min_lexicographic.empty() || !replicas->count(min_lexicographic)) { |
9513 | 0 | LOG_WITH_PREFIX(INFO) |
9514 | 0 | << __func__ << ": Min lexicographic is not yet ready: " << min_lexicographic; |
9515 | 0 | return; |
9516 | 0 | } |
9517 | 6 | possible_leaders = { min_lexicographic }; |
9518 | 6 | } |
9519 | | |
9520 | 91.8k | if (possible_leaders.empty()) { |
9521 | 0 | VLOG_WITH_PREFIX(4) << __func__ << ": Cannot pick candidate"; |
9522 | 64 | return; |
9523 | 64 | } |
9524 | | |
9525 | 91.7k | if (!tablet->InitiateElection()) { |
9526 | 0 | VLOG_WITH_PREFIX(4) << __func__ << ": Already initiated"; |
9527 | 63.7k | return; |
9528 | 63.7k | } |
9529 | | |
9530 | 28.0k | const auto& protege = RandomElement(possible_leaders); |
9531 | | |
9532 | 28.0k | LOG_WITH_PREFIX(INFO) |
9533 | 28.0k | << "Starting election at " << tablet->tablet_id() << " in favor of " << protege; |
9534 | | |
9535 | 28.0k | auto task = std::make_shared<AsyncStartElection>(master_, AsyncTaskPool(), protege, tablet); |
9536 | 28.0k | tablet->table()->AddTask(task); |
9537 | 28.0k | WARN_NOT_OK(task->Run(), "Failed to send new tablet start election request"); |
9538 | 28.0k | } |
9539 | | |
9540 | | shared_ptr<TSDescriptor> CatalogManager::SelectReplica( |
9541 | | const TSDescriptorVector& ts_descs, |
9542 | | set<TabletServerId>* excluded, |
9543 | 82.1k | CMPerTableLoadState* per_table_state, CMGlobalLoadState* global_state) { |
9544 | 82.1k | shared_ptr<TSDescriptor> found_ts; |
9545 | 83.7k | for (const auto& sorted_load : per_table_state->sorted_load_) { |
9546 | | // Don't consider a tserver that has already been considered for this tablet. |
9547 | 83.7k | if (excluded->count(sorted_load)) { |
9548 | 3 | continue; |
9549 | 3 | } |
9550 | | // Only choose from the set of allowed tservers for this tablet. |
9551 | 168k | auto it = std::find_if(ts_descs.begin(), ts_descs.end(), [&sorted_load](const auto& ts) { |
9552 | 168k | return ts->permanent_uuid() == sorted_load; |
9553 | 168k | }); |
9554 | | |
9555 | 83.7k | if (it != ts_descs.end()) { |
9556 | 82.1k | found_ts = *it; |
9557 | 82.1k | break; |
9558 | 82.1k | } |
9559 | 83.7k | } |
9560 | | |
9561 | 82.1k | return found_ts; |
9562 | 82.1k | } |
9563 | | |
9564 | | void CatalogManager::SelectReplicas( |
9565 | | const TSDescriptorVector& ts_descs, size_t nreplicas, consensus::RaftConfigPB* config, |
9566 | | set<TabletServerId>* already_selected_ts, PeerMemberType member_type, |
9567 | 28.5k | CMPerTableLoadState* per_table_state, CMGlobalLoadState* global_state) { |
9568 | 28.5k | DCHECK_LE(nreplicas, ts_descs.size()); |
9569 | | |
9570 | 110k | for (size_t i = 0; i < nreplicas; ++i) { |
9571 | 82.1k | shared_ptr<TSDescriptor> ts = SelectReplica( |
9572 | 82.1k | ts_descs, already_selected_ts, per_table_state, global_state); |
9573 | 82.1k | InsertOrDie(already_selected_ts, ts->permanent_uuid()); |
9574 | | // Update the load state at global and table level. |
9575 | 82.1k | per_table_state->per_ts_load_[ts->permanent_uuid()]++; |
9576 | 82.1k | global_state->per_ts_load_[ts->permanent_uuid()]++; |
9577 | 82.1k | per_table_state->SortLoad(); |
9578 | | |
9579 | | // Increment the number of pending replicas so that we take this selection into |
9580 | | // account when assigning replicas for other tablets of the same table. This |
9581 | | // value decays back to 0 over time. |
9582 | 82.1k | ts->IncrementRecentReplicaCreations(); |
9583 | | |
9584 | 82.1k | TSRegistrationPB reg = ts->GetRegistration(); |
9585 | | |
9586 | 82.1k | RaftPeerPB *peer = config->add_peers(); |
9587 | 82.1k | peer->set_permanent_uuid(ts->permanent_uuid()); |
9588 | | |
9589 | | // TODO: This is temporary, we will use only UUIDs. |
9590 | 82.1k | TakeRegistration(reg.mutable_common(), peer); |
9591 | 82.1k | peer->set_member_type(member_type); |
9592 | 82.1k | } |
9593 | 28.5k | } |
9594 | | |
9595 | | Status CatalogManager::ConsensusStateToTabletLocations(const consensus::ConsensusStatePB& cstate, |
9596 | 159k | TabletLocationsPB* locs_pb) { |
9597 | 416k | for (const consensus::RaftPeerPB& peer : cstate.config().peers()) { |
9598 | 416k | TabletLocationsPB_ReplicaPB* replica_pb = locs_pb->add_replicas(); |
9599 | 416k | if (!peer.has_permanent_uuid()) { |
9600 | 0 | return STATUS_SUBSTITUTE(IllegalState, "Missing UUID $0", peer.ShortDebugString()); |
9601 | 0 | } |
9602 | 416k | replica_pb->set_role(GetConsensusRole(peer.permanent_uuid(), cstate)); |
9603 | 416k | if (peer.has_member_type()) { |
9604 | 416k | replica_pb->set_member_type(peer.member_type()); |
9605 | 2 | } else { |
9606 | 2 | replica_pb->set_member_type(PeerMemberType::UNKNOWN_MEMBER_TYPE); |
9607 | 2 | } |
9608 | 416k | TSInfoPB* tsinfo_pb = replica_pb->mutable_ts_info(); |
9609 | 416k | tsinfo_pb->set_permanent_uuid(peer.permanent_uuid()); |
9610 | 416k | CopyRegistration(peer, tsinfo_pb); |
9611 | 416k | } |
9612 | 159k | return Status::OK(); |
9613 | 159k | } |
9614 | | |
9615 | | Status CatalogManager::BuildLocationsForTablet(const scoped_refptr<TabletInfo>& tablet, |
9616 | | TabletLocationsPB* locs_pb, |
9617 | 366k | IncludeInactive include_inactive) { |
9618 | 366k | { |
9619 | 366k | auto l_tablet = tablet->LockForRead(); |
9620 | 366k | if (l_tablet->is_hidden() && !include_inactive) { |
9621 | 0 | return STATUS_FORMAT(NotFound, "Tablet hidden", tablet->id()); |
9622 | 0 | } |
9623 | 366k | locs_pb->set_table_id(l_tablet->pb.table_id()); |
9624 | 366k | *locs_pb->mutable_table_ids() = l_tablet->pb.table_ids(); |
9625 | 366k | } |
9626 | | |
9627 | | // For system tables, the set of replicas is always the set of masters. |
9628 | 366k | if (system_tablets_.find(tablet->id()) != system_tablets_.end()) { |
9629 | 159k | consensus::ConsensusStatePB master_consensus; |
9630 | 159k | RETURN_NOT_OK(GetCurrentConfig(&master_consensus)); |
9631 | 159k | locs_pb->set_tablet_id(tablet->tablet_id()); |
9632 | 159k | locs_pb->set_stale(false); |
9633 | 159k | const auto initial_size = locs_pb->replicas_size(); |
9634 | 159k | RETURN_NOT_OK(ConsensusStateToTabletLocations(master_consensus, locs_pb)); |
9635 | 159k | const auto capabilities = Capabilities(); |
9636 | | // Set capabilities of master node for all newly created system table locations. |
9637 | 159k | for (auto i = locs_pb->mutable_replicas()->begin() + initial_size, |
9638 | 574k | end = locs_pb->mutable_replicas()->end(); i != end; ++i) { |
9639 | 415k | *i->mutable_ts_info()->mutable_capabilities() = google::protobuf::RepeatedField<CapabilityId>( |
9640 | 415k | capabilities.begin(), capabilities.end()); |
9641 | 415k | } |
9642 | 159k | return Status::OK(); |
9643 | 206k | } |
9644 | | |
9645 | 206k | TSRegistrationPB reg; |
9646 | | |
9647 | 206k | std::shared_ptr<const TabletReplicaMap> locs; |
9648 | 206k | consensus::ConsensusStatePB cstate; |
9649 | 206k | { |
9650 | 206k | auto l_tablet = tablet->LockForRead(); |
9651 | 206k | if (PREDICT_FALSE(l_tablet->is_deleted())) { |
9652 | 324 | std::vector<TabletId> split_tablet_ids; |
9653 | 4 | for (const auto& split_tablet_id : l_tablet->pb.split_tablet_ids()) { |
9654 | 4 | split_tablet_ids.push_back(split_tablet_id); |
9655 | 4 | } |
9656 | 324 | return STATUS( |
9657 | 324 | NotFound, "Tablet deleted", l_tablet->pb.state_msg(), |
9658 | 324 | SplitChildTabletIdsData(split_tablet_ids)); |
9659 | 324 | } |
9660 | | |
9661 | 206k | if (PREDICT_FALSE(!l_tablet->is_running())) { |
9662 | 8.80k | return STATUS_FORMAT(ServiceUnavailable, "Tablet $0 not running", tablet->id()); |
9663 | 8.80k | } |
9664 | | |
9665 | 197k | locs = tablet->GetReplicaLocations(); |
9666 | 197k | if (locs->empty() && l_tablet->pb.has_committed_consensus_state()) { |
9667 | 220 | cstate = l_tablet->pb.committed_consensus_state(); |
9668 | 220 | } |
9669 | | |
9670 | 197k | const auto& metadata = tablet->metadata().state().pb; |
9671 | 197k | locs_pb->mutable_partition()->CopyFrom(metadata.partition()); |
9672 | 197k | locs_pb->set_split_depth(metadata.split_depth()); |
9673 | 197k | locs_pb->set_split_parent_tablet_id(metadata.split_parent_tablet_id()); |
9674 | 150 | for (const auto& split_tablet_id : metadata.split_tablet_ids()) { |
9675 | 150 | *locs_pb->add_split_tablet_ids() = split_tablet_id; |
9676 | 150 | } |
9677 | 197k | } |
9678 | | |
9679 | 197k | locs_pb->set_tablet_id(tablet->tablet_id()); |
9680 | 197k | locs_pb->set_stale(locs->empty()); |
9681 | | |
9682 | | // If the locations are cached. |
9683 | 197k | if (!locs->empty()) { |
9684 | 197k | if (cstate.IsInitialized() && |
9685 | 0 | locs->size() != implicit_cast<size_t>(cstate.config().peers_size())) { |
9686 | 0 | LOG(WARNING) << "Cached tablet replicas " << locs->size() << " does not match consensus " |
9687 | 0 | << cstate.config().peers_size(); |
9688 | 0 | } |
9689 | | |
9690 | 584k | for (const auto& replica : *locs) { |
9691 | 584k | TabletLocationsPB_ReplicaPB* replica_pb = locs_pb->add_replicas(); |
9692 | 584k | replica_pb->set_role(replica.second.role); |
9693 | 584k | replica_pb->set_member_type(replica.second.member_type); |
9694 | 584k | auto tsinfo_pb = replica.second.ts_desc->GetTSInformationPB(); |
9695 | | |
9696 | 584k | TSInfoPB* out_ts_info = replica_pb->mutable_ts_info(); |
9697 | 584k | out_ts_info->set_permanent_uuid(tsinfo_pb->tserver_instance().permanent_uuid()); |
9698 | 584k | CopyRegistration(tsinfo_pb->registration().common(), out_ts_info); |
9699 | 584k | out_ts_info->set_placement_uuid(tsinfo_pb->registration().common().placement_uuid()); |
9700 | 584k | *out_ts_info->mutable_capabilities() = tsinfo_pb->registration().capabilities(); |
9701 | 584k | } |
9702 | 197k | return Status::OK(); |
9703 | 197k | } |
9704 | | |
9705 | | // If the locations were not cached. |
9706 | | // TODO: Why would this ever happen? See KUDU-759. |
9707 | 291 | if (cstate.IsInitialized()) { |
9708 | 220 | RETURN_NOT_OK(ConsensusStateToTabletLocations(cstate, locs_pb)); |
9709 | 220 | } |
9710 | | |
9711 | 291 | return Status::OK(); |
9712 | 291 | } |
9713 | | |
9714 | 643k | Result<shared_ptr<tablet::AbstractTablet>> CatalogManager::GetSystemTablet(const TabletId& id) { |
9715 | 643k | const auto iter = system_tablets_.find(id); |
9716 | 643k | if (iter == system_tablets_.end()) { |
9717 | 0 | return STATUS_SUBSTITUTE(InvalidArgument, "$0 is not a valid system tablet id", id); |
9718 | 0 | } |
9719 | 643k | return iter->second; |
9720 | 643k | } |
9721 | | |
9722 | | Status CatalogManager::GetTabletLocations( |
9723 | 9.22k | const TabletId& tablet_id, TabletLocationsPB* locs_pb, IncludeInactive include_inactive) { |
9724 | 9.22k | scoped_refptr<TabletInfo> tablet_info; |
9725 | 9.22k | { |
9726 | 9.22k | SharedLock lock(mutex_); |
9727 | 9.22k | if (!FindCopy(*tablet_map_, tablet_id, &tablet_info)) { |
9728 | 0 | return STATUS_SUBSTITUTE(NotFound, "Unknown tablet $0", tablet_id); |
9729 | 0 | } |
9730 | 9.22k | } |
9731 | 9.22k | Status s = GetTabletLocations(tablet_info, locs_pb, include_inactive); |
9732 | | |
9733 | 9.22k | auto num_replicas = GetReplicationFactorForTablet(tablet_info); |
9734 | 9.22k | if (num_replicas.ok() && *num_replicas > 0 && |
9735 | 9.22k | implicit_cast<size_t>(locs_pb->replicas().size()) != *num_replicas) { |
9736 | 610 | YB_LOG_EVERY_N_SECS(WARNING, 1) |
9737 | 284 | << "Expected replicas " << num_replicas << " but found " |
9738 | 284 | << locs_pb->replicas().size() << " for tablet " << tablet_info->id() << ": " |
9739 | 284 | << locs_pb->ShortDebugString() << THROTTLE_MSG; |
9740 | 610 | } |
9741 | 9.22k | return s; |
9742 | 9.22k | } |
9743 | | |
9744 | | Status CatalogManager::GetTabletLocations( |
9745 | | scoped_refptr<TabletInfo> tablet_info, |
9746 | | TabletLocationsPB* locs_pb, |
9747 | 59.3k | IncludeInactive include_inactive) { |
9748 | 59.3k | DCHECK_EQ(locs_pb->replicas().size(), 0); |
9749 | 59.3k | locs_pb->mutable_replicas()->Clear(); |
9750 | 59.3k | return BuildLocationsForTablet(tablet_info, locs_pb, include_inactive); |
9751 | 59.3k | } |
9752 | | |
9753 | | Status CatalogManager::GetTableLocations( |
9754 | | const GetTableLocationsRequestPB* req, |
9755 | 167k | GetTableLocationsResponsePB* resp) { |
9756 | 18.4E | VLOG(4) << "GetTableLocations: " << req->ShortDebugString(); |
9757 | | |
9758 | | // If start-key is > end-key report an error instead of swap the two |
9759 | | // since probably there is something wrong app-side. |
9760 | 167k | if (req->has_partition_key_start() && req->has_partition_key_end() |
9761 | 1 | && req->partition_key_start() > req->partition_key_end()) { |
9762 | 1 | return STATUS(InvalidArgument, "start partition key is greater than the end partition key"); |
9763 | 1 | } |
9764 | | |
9765 | 167k | if (req->max_returned_locations() <= 0) { |
9766 | 0 | return STATUS(InvalidArgument, "max_returned_locations must be greater than 0"); |
9767 | 0 | } |
9768 | | |
9769 | 167k | scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table())); |
9770 | | |
9771 | 167k | if (table->IsCreateInProgress()) { |
9772 | 7.98k | resp->set_creating(true); |
9773 | 7.98k | } |
9774 | | |
9775 | 167k | auto l = table->LockForRead(); |
9776 | 167k | RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp)); |
9777 | | |
9778 | 167k | vector<scoped_refptr<TabletInfo>> tablets; |
9779 | 167k | table->GetTabletsInRange(req, &tablets); |
9780 | | |
9781 | 167k | IncludeInactive include_inactive(req->has_include_inactive() && req->include_inactive()); |
9782 | 167k | bool require_tablets_runnings = req->require_tablets_running(); |
9783 | | |
9784 | 167k | int expected_live_replicas = 0; |
9785 | 167k | int expected_read_replicas = 0; |
9786 | 167k | GetExpectedNumberOfReplicas(&expected_live_replicas, &expected_read_replicas); |
9787 | 267k | for (const scoped_refptr<TabletInfo>& tablet : tablets) { |
9788 | 267k | TabletLocationsPB* locs_pb = resp->add_tablet_locations(); |
9789 | 267k | locs_pb->set_expected_live_replicas(expected_live_replicas); |
9790 | 267k | locs_pb->set_expected_read_replicas(expected_read_replicas); |
9791 | 267k | auto status = BuildLocationsForTablet(tablet, locs_pb, include_inactive); |
9792 | 267k | if (!status.ok()) { |
9793 | | // Not running. |
9794 | 8.78k | if (require_tablets_runnings) { |
9795 | 8.67k | resp->mutable_tablet_locations()->Clear(); |
9796 | 8.67k | return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, status); |
9797 | 8.67k | } |
9798 | 113 | resp->mutable_tablet_locations()->RemoveLast(); |
9799 | 113 | } |
9800 | 267k | } |
9801 | | |
9802 | 158k | resp->set_table_type(l->pb.table_type()); |
9803 | 158k | resp->set_partition_list_version(l->pb.partition_list_version()); |
9804 | | |
9805 | 158k | return Status::OK(); |
9806 | 167k | } |
9807 | | |
9808 | 568k | Status CatalogManager::GetCurrentConfig(consensus::ConsensusStatePB* cpb) const { |
9809 | 568k | auto tablet_peer = sys_catalog_->tablet_peer(); |
9810 | 556k | auto consensus = tablet_peer ? tablet_peer->shared_consensus() : nullptr; |
9811 | 568k | if (!consensus) { |
9812 | 11.6k | std::string uuid = master_->fs_manager()->uuid(); |
9813 | 11.6k | return STATUS_FORMAT(IllegalState, "Node $0 peer not initialized.", uuid); |
9814 | 11.6k | } |
9815 | | |
9816 | 557k | *cpb = consensus->ConsensusState(CONSENSUS_CONFIG_COMMITTED); |
9817 | | |
9818 | 557k | return Status::OK(); |
9819 | 557k | } |
9820 | | |
9821 | 0 | void CatalogManager::DumpState(std::ostream* out, bool on_disk_dump) const { |
9822 | 0 | NamespaceInfoMap namespace_ids_copy; |
9823 | 0 | TableInfoMap ids_copy; |
9824 | 0 | TableInfoByNameMap names_copy; |
9825 | 0 | TabletInfoMap tablets_copy; |
9826 | | |
9827 | | // Copy the internal state so that, if the output stream blocks, |
9828 | | // we don't end up holding the lock for a long time. |
9829 | 0 | { |
9830 | 0 | SharedLock lock(mutex_); |
9831 | 0 | namespace_ids_copy = namespace_ids_map_; |
9832 | 0 | ids_copy = *table_ids_map_; |
9833 | 0 | names_copy = table_names_map_; |
9834 | 0 | tablets_copy = *tablet_map_; |
9835 | 0 | } |
9836 | |
|
9837 | 0 | *out << "Dumping current state of master.\nNamespaces:\n"; |
9838 | 0 | for (const NamespaceInfoMap::value_type& e : namespace_ids_copy) { |
9839 | 0 | NamespaceInfo* t = e.second.get(); |
9840 | 0 | auto l = t->LockForRead(); |
9841 | 0 | const NamespaceName& name = l->name(); |
9842 | |
|
9843 | 0 | *out << t->id() << ":\n"; |
9844 | 0 | *out << " name: \"" << strings::CHexEscape(name) << "\"\n"; |
9845 | 0 | *out << " metadata: " << l->pb.ShortDebugString() << "\n"; |
9846 | 0 | } |
9847 | |
|
9848 | 0 | *out << "Tables:\n"; |
9849 | 0 | for (const TableInfoMap::value_type& e : ids_copy) { |
9850 | 0 | TableInfo* t = e.second.get(); |
9851 | 0 | TabletInfos table_tablets; |
9852 | 0 | { |
9853 | 0 | auto l = t->LockForRead(); |
9854 | 0 | const TableName& name = l->name(); |
9855 | 0 | const NamespaceId& namespace_id = l->namespace_id(); |
9856 | | // Find namespace by its ID. |
9857 | 0 | scoped_refptr<NamespaceInfo> ns = FindPtrOrNull(namespace_ids_copy, namespace_id); |
9858 | |
|
9859 | 0 | *out << t->id() << ":\n"; |
9860 | 0 | *out << " namespace id: \"" << strings::CHexEscape(namespace_id) << "\"\n"; |
9861 | |
|
9862 | 0 | if (ns != nullptr) { |
9863 | 0 | *out << " namespace name: \"" << strings::CHexEscape(ns->name()) << "\"\n"; |
9864 | 0 | } |
9865 | |
|
9866 | 0 | *out << " name: \"" << strings::CHexEscape(name) << "\"\n"; |
9867 | | // Erase from the map, so later we can check that we don't have |
9868 | | // any orphaned tables in the by-name map that aren't in the |
9869 | | // by-id map. |
9870 | 0 | if (names_copy.erase({namespace_id, name}) != 1) { |
9871 | 0 | *out << " [not present in by-name map]\n"; |
9872 | 0 | } |
9873 | 0 | *out << " metadata: " << l->pb.ShortDebugString() << "\n"; |
9874 | |
|
9875 | 0 | *out << " tablets:\n"; |
9876 | 0 | table_tablets = t->GetTablets(); |
9877 | 0 | } |
9878 | 0 | for (const scoped_refptr<TabletInfo>& tablet : table_tablets) { |
9879 | 0 | auto l_tablet = tablet->LockForRead(); |
9880 | 0 | *out << " " << tablet->tablet_id() << ": " |
9881 | 0 | << l_tablet->pb.ShortDebugString() << "\n"; |
9882 | |
|
9883 | 0 | if (tablets_copy.erase(tablet->tablet_id()) != 1) { |
9884 | 0 | *out << " [ERROR: not present in CM tablet map!]\n"; |
9885 | 0 | } |
9886 | 0 | } |
9887 | 0 | } |
9888 | |
|
9889 | 0 | if (!tablets_copy.empty()) { |
9890 | 0 | *out << "Orphaned tablets (not referenced by any table):\n"; |
9891 | 0 | for (const TabletInfoMap::value_type& entry : tablets_copy) { |
9892 | 0 | const scoped_refptr<TabletInfo>& tablet = entry.second; |
9893 | 0 | auto l_tablet = tablet->LockForRead(); |
9894 | 0 | *out << " " << tablet->tablet_id() << ": " |
9895 | 0 | << l_tablet->pb.ShortDebugString() << "\n"; |
9896 | 0 | } |
9897 | 0 | } |
9898 | |
|
9899 | 0 | if (!names_copy.empty()) { |
9900 | 0 | *out << "Orphaned tables (in by-name map, but not id map):\n"; |
9901 | 0 | for (const TableInfoByNameMap::value_type& e : names_copy) { |
9902 | 0 | *out << e.second->id() << ":\n"; |
9903 | 0 | *out << " namespace id: \"" << strings::CHexEscape(e.first.first) << "\"\n"; |
9904 | 0 | *out << " name: \"" << CHexEscape(e.first.second) << "\"\n"; |
9905 | 0 | } |
9906 | 0 | } |
9907 | |
|
9908 | 0 | master_->DumpMasterOptionsInfo(out); |
9909 | |
|
9910 | 0 | if (on_disk_dump) { |
9911 | 0 | consensus::ConsensusStatePB cur_consensus_state; |
9912 | | // TODO: proper error handling below. |
9913 | 0 | CHECK_OK(GetCurrentConfig(&cur_consensus_state)); |
9914 | 0 | *out << "Current raft config: " << cur_consensus_state.ShortDebugString() << "\n"; |
9915 | 0 | } |
9916 | 0 | } |
9917 | | |
9918 | | Status CatalogManager::PeerStateDump(const vector<RaftPeerPB>& peers, |
9919 | | const DumpMasterStateRequestPB* req, |
9920 | 0 | DumpMasterStateResponsePB* resp) { |
9921 | 0 | std::unique_ptr<MasterClusterProxy> peer_proxy; |
9922 | 0 | Endpoint sockaddr; |
9923 | 0 | MonoTime timeout = MonoTime::Now(); |
9924 | 0 | DumpMasterStateRequestPB peer_req; |
9925 | 0 | rpc::RpcController rpc; |
9926 | |
|
9927 | 0 | timeout.AddDelta(MonoDelta::FromMilliseconds(FLAGS_master_ts_rpc_timeout_ms)); |
9928 | 0 | rpc.set_deadline(timeout); |
9929 | 0 | peer_req.set_on_disk(req->on_disk()); |
9930 | 0 | peer_req.set_return_dump_as_string(req->return_dump_as_string()); |
9931 | 0 | string dump; |
9932 | |
|
9933 | 0 | for (const RaftPeerPB& peer : peers) { |
9934 | 0 | HostPort hostport = HostPortFromPB(DesiredHostPort(peer, master_->MakeCloudInfoPB())); |
9935 | 0 | peer_proxy = std::make_unique<MasterClusterProxy>(&master_->proxy_cache(), hostport); |
9936 | |
|
9937 | 0 | DumpMasterStateResponsePB peer_resp; |
9938 | 0 | rpc.Reset(); |
9939 | |
|
9940 | 0 | RETURN_NOT_OK(peer_proxy->DumpState(peer_req, &peer_resp, &rpc)); |
9941 | |
|
9942 | 0 | if (peer_resp.has_error()) { |
9943 | 0 | LOG(WARNING) << "Hit err " << peer_resp.ShortDebugString() << " during peer " |
9944 | 0 | << peer.ShortDebugString() << " state dump."; |
9945 | 0 | return StatusFromPB(peer_resp.error().status()); |
9946 | 0 | } else if (req->return_dump_as_string()) { |
9947 | 0 | dump += peer_resp.dump(); |
9948 | 0 | } |
9949 | 0 | } |
9950 | |
|
9951 | 0 | if (req->return_dump_as_string()) { |
9952 | 0 | resp->set_dump(resp->dump() + dump); |
9953 | 0 | } |
9954 | 0 | return Status::OK(); |
9955 | 0 | } |
9956 | | |
9957 | 90.0k | void CatalogManager::ReportMetrics() { |
9958 | | // Report metrics on how many tservers are alive. |
9959 | 90.0k | TSDescriptorVector ts_descs; |
9960 | 90.0k | master_->ts_manager()->GetAllLiveDescriptors(&ts_descs); |
9961 | 90.0k | const auto num_live_servers = ts_descs.size(); |
9962 | 90.0k | metric_num_tablet_servers_live_->set_value(narrow_cast<uint32_t>(num_live_servers)); |
9963 | | |
9964 | 90.0k | master_->ts_manager()->GetAllDescriptors(&ts_descs); |
9965 | 90.0k | metric_num_tablet_servers_dead_->set_value( |
9966 | 90.0k | narrow_cast<uint32_t>(ts_descs.size() - num_live_servers)); |
9967 | 90.0k | } |
9968 | | |
9969 | 150k | void CatalogManager::ResetMetrics() { |
9970 | 150k | metric_num_tablet_servers_live_->set_value(0); |
9971 | 150k | metric_num_tablet_servers_dead_->set_value(0); |
9972 | 150k | } |
9973 | | |
9974 | | |
9975 | 260k | std::string CatalogManager::LogPrefix() const { |
9976 | 260k | if (tablet_peer()) { |
9977 | 260k | return consensus::MakeTabletLogPrefix( |
9978 | 260k | tablet_peer()->tablet_id(), tablet_peer()->permanent_uuid()); |
9979 | 41 | } else { |
9980 | 41 | return consensus::MakeTabletLogPrefix( |
9981 | 41 | kSysCatalogTabletId, master_->fs_manager()->uuid()); |
9982 | 41 | } |
9983 | 260k | } |
9984 | | |
9985 | 0 | void CatalogManager::SetLoadBalancerEnabled(bool is_enabled) { |
9986 | 0 | load_balance_policy_->SetLoadBalancerEnabled(is_enabled); |
9987 | 0 | } |
9988 | | |
9989 | 1 | bool CatalogManager::IsLoadBalancerEnabled() { |
9990 | 1 | return load_balance_policy_->IsLoadBalancerEnabled(); |
9991 | 1 | } |
9992 | | |
9993 | 78.7k | MonoDelta CatalogManager::TimeSinceElectedLeader() { |
9994 | 78.7k | return MonoTime::Now() - time_elected_leader_; |
9995 | 78.7k | } |
9996 | | |
9997 | 26 | Status CatalogManager::GoIntoShellMode() { |
9998 | 26 | if (master_->IsShellMode()) { |
9999 | 0 | return STATUS(IllegalState, "Master is already in shell mode."); |
10000 | 0 | } |
10001 | | |
10002 | 26 | LOG(INFO) << "Starting going into shell mode."; |
10003 | 26 | master_->SetShellMode(true); |
10004 | | |
10005 | 26 | { |
10006 | 26 | LockGuard lock(mutex_); |
10007 | 26 | RETURN_NOT_OK(sys_catalog_->GoIntoShellMode()); |
10008 | 26 | background_tasks_->Shutdown(); |
10009 | 26 | background_tasks_.reset(); |
10010 | 26 | } |
10011 | 26 | { |
10012 | 26 | std::lock_guard<std::mutex> l(remote_bootstrap_mtx_); |
10013 | 26 | tablet_exists_ = false; |
10014 | 26 | } |
10015 | | |
10016 | 26 | LOG(INFO) << "Done going into shell mode."; |
10017 | | |
10018 | 26 | return Status::OK(); |
10019 | 26 | } |
10020 | | |
10021 | 206 | Status CatalogManager::GetClusterConfig(GetMasterClusterConfigResponsePB* resp) { |
10022 | 206 | return GetClusterConfig(resp->mutable_cluster_config()); |
10023 | 206 | } |
10024 | | |
10025 | 566k | Status CatalogManager::GetClusterConfig(SysClusterConfigEntryPB* config) { |
10026 | 56 | DCHECK(cluster_config_) << "Missing cluster config for master!"; |
10027 | 566k | auto l = cluster_config_->LockForRead(); |
10028 | 566k | *config = l->pb; |
10029 | 566k | return Status::OK(); |
10030 | 566k | } |
10031 | | |
10032 | | Status CatalogManager::SetClusterConfig( |
10033 | 111 | const ChangeMasterClusterConfigRequestPB* req, ChangeMasterClusterConfigResponsePB* resp) { |
10034 | 111 | SysClusterConfigEntryPB config(req->cluster_config()); |
10035 | | |
10036 | 111 | if (config.has_server_blacklist()) { |
10037 | 20 | config.mutable_server_blacklist()->set_initial_replica_load(narrow_cast<int32_t>( |
10038 | 20 | GetNumRelevantReplicas(config.server_blacklist(), false /* leaders_only */))); |
10039 | 20 | LOG(INFO) << Format("Set blacklist of total tservers: $0, with initial load: $1", |
10040 | 20 | config.server_blacklist().hosts().size(), |
10041 | 20 | config.server_blacklist().initial_replica_load()); |
10042 | 20 | } |
10043 | 111 | if (config.has_leader_blacklist()) { |
10044 | 14 | config.mutable_leader_blacklist()->set_initial_leader_load(narrow_cast<int32_t>( |
10045 | 14 | GetNumRelevantReplicas(config.leader_blacklist(), true /* leaders_only */))); |
10046 | 14 | LOG(INFO) << Format("Set leader blacklist of total tservers: $0, with initial load: $1", |
10047 | 14 | config.leader_blacklist().hosts().size(), |
10048 | 14 | config.leader_blacklist().initial_leader_load()); |
10049 | 14 | } |
10050 | | |
10051 | 111 | auto l = cluster_config_->LockForWrite(); |
10052 | | // We should only set the config, if the caller provided us with a valid update to the |
10053 | | // existing config. |
10054 | 111 | if (l->pb.version() != config.version()) { |
10055 | 0 | Status s = STATUS_SUBSTITUTE(IllegalState, |
10056 | 0 | "Config version does not match, got $0, but most recent one is $1. Should call Get again", |
10057 | 0 | config.version(), l->pb.version()); |
10058 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::CONFIG_VERSION_MISMATCH, s); |
10059 | 0 | } |
10060 | | |
10061 | 111 | if (config.cluster_uuid() != l->pb.cluster_uuid()) { |
10062 | 1 | Status s = STATUS(InvalidArgument, "Config cluster UUID cannot be updated"); |
10063 | 1 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s); |
10064 | 1 | } |
10065 | | |
10066 | | // TODO(bogdan): should this live here? |
10067 | 110 | const ReplicationInfoPB& replication_info = config.replication_info(); |
10068 | 118 | for (int i = 0; i < replication_info.read_replicas_size(); i++) { |
10069 | 8 | if (!replication_info.read_replicas(i).has_placement_uuid()) { |
10070 | 0 | Status s = STATUS(IllegalState, |
10071 | 0 | "All read-only clusters must have a placement uuid specified"); |
10072 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s); |
10073 | 0 | } |
10074 | 8 | } |
10075 | | |
10076 | | // Validate placement information according to rules defined. |
10077 | 110 | if (replication_info.has_live_replicas()) { |
10078 | 81 | Status s = CatalogManagerUtil::IsPlacementInfoValid(replication_info.live_replicas()); |
10079 | 81 | if (!s.ok()) { |
10080 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s); |
10081 | 0 | } |
10082 | 110 | } |
10083 | | |
10084 | 110 | l.mutable_data()->pb.CopyFrom(config); |
10085 | | // Bump the config version, to indicate an update. |
10086 | 110 | l.mutable_data()->pb.set_version(config.version() + 1); |
10087 | | |
10088 | 110 | LOG(INFO) << "Updating cluster config to " << config.version() + 1; |
10089 | | |
10090 | 110 | RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), cluster_config_)); |
10091 | | |
10092 | 110 | l.Commit(); |
10093 | | |
10094 | 110 | return Status::OK(); |
10095 | 110 | } |
10096 | | |
10097 | | Status CatalogManager::ValidateReplicationInfo( |
10098 | 32.3k | const ValidateReplicationInfoRequestPB* req, ValidateReplicationInfoResponsePB* resp) { |
10099 | 32.3k | TSDescriptorVector all_ts_descs; |
10100 | 32.3k | { |
10101 | 32.3k | BlacklistSet blacklist = BlacklistSetFromPB(); |
10102 | 32.3k | master_->ts_manager()->GetAllLiveDescriptors(&all_ts_descs, blacklist); |
10103 | 32.3k | } |
10104 | | // We don't need any validation checks for read replica placements |
10105 | | // because they aren't a part of any raft quorum underneath. |
10106 | | // Technically, it is ok to have even 0 read replica nodes for them upfront. |
10107 | | // We only need it for the primary cluster replicas. |
10108 | 32.3k | TSDescriptorVector ts_descs; |
10109 | 32.3k | GetTsDescsFromPlacementInfo(req->replication_info().live_replicas(), all_ts_descs, &ts_descs); |
10110 | 32.3k | Status s = CheckValidPlacementInfo(req->replication_info().live_replicas(), all_ts_descs, resp); |
10111 | 32.3k | if (!s.ok()) { |
10112 | 6 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_TABLE_REPLICATION_INFO, s); |
10113 | 6 | } |
10114 | 32.3k | return Status::OK(); |
10115 | 32.3k | } |
10116 | | |
10117 | | Status CatalogManager::SetPreferredZones( |
10118 | 3 | const SetPreferredZonesRequestPB* req, SetPreferredZonesResponsePB* resp) { |
10119 | 3 | auto l = cluster_config_->LockForWrite(); |
10120 | 3 | auto replication_info = l.mutable_data()->pb.mutable_replication_info(); |
10121 | 3 | replication_info->clear_affinitized_leaders(); |
10122 | | |
10123 | 5 | for (const auto& cloud_info : req->preferred_zones()) { |
10124 | 5 | const auto& placement_info = replication_info->live_replicas(); |
10125 | 5 | if (!CatalogManagerUtil::DoesPlacementInfoContainCloudInfo(placement_info, cloud_info)) { |
10126 | 0 | Status s = STATUS_FORMAT(InvalidArgument, "Placement info $0 does not contain cloud info $1", |
10127 | 0 | placement_info, TSDescriptor::generate_placement_id(cloud_info)); |
10128 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s); |
10129 | 0 | } |
10130 | 5 | *replication_info->add_affinitized_leaders() = cloud_info; |
10131 | 5 | } |
10132 | | |
10133 | 3 | l.mutable_data()->pb.set_version(l.mutable_data()->pb.version() + 1); |
10134 | | |
10135 | 3 | LOG(INFO) << "Updating cluster config to " << l.mutable_data()->pb.version(); |
10136 | | |
10137 | 3 | Status s = sys_catalog_->Upsert(leader_ready_term(), cluster_config_); |
10138 | 3 | if (!s.ok()) { |
10139 | 0 | return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s); |
10140 | 0 | } |
10141 | | |
10142 | 3 | l.Commit(); |
10143 | | |
10144 | 3 | return Status::OK(); |
10145 | 3 | } |
10146 | | |
10147 | 56.0k | Result<size_t> CatalogManager::GetReplicationFactor() { |
10148 | 2 | DCHECK(cluster_config_) << "Missing cluster config for master!"; |
10149 | 56.0k | auto l = cluster_config_->LockForRead(); |
10150 | 56.0k | const ReplicationInfoPB& replication_info = l->pb.replication_info(); |
10151 | 56.0k | return GetNumReplicasFromPlacementInfo(replication_info.live_replicas()); |
10152 | 56.0k | } |
10153 | | |
10154 | | Result<size_t> CatalogManager::GetReplicationFactorForTablet( |
10155 | 9.22k | const scoped_refptr<TabletInfo>& tablet) { |
10156 | | // For system tables, the set of replicas is always the set of masters. |
10157 | 9.22k | if (system_tablets_.find(tablet->id()) != system_tablets_.end()) { |
10158 | 90 | consensus::ConsensusStatePB master_consensus; |
10159 | 90 | RETURN_NOT_OK(GetCurrentConfig(&master_consensus)); |
10160 | 90 | return master_consensus.config().peers().size(); |
10161 | 9.13k | } |
10162 | 9.13k | int num_live_replicas = 0, num_read_replicas = 0; |
10163 | 9.13k | GetExpectedNumberOfReplicas(&num_live_replicas, &num_read_replicas); |
10164 | 9.13k | return num_live_replicas + num_read_replicas; |
10165 | 9.13k | } |
10166 | | |
10167 | 185k | void CatalogManager::GetExpectedNumberOfReplicas(int* num_live_replicas, int* num_read_replicas) { |
10168 | 185k | auto l = cluster_config_->LockForRead(); |
10169 | 185k | const ReplicationInfoPB& replication_info = l->pb.replication_info(); |
10170 | 185k | *num_live_replicas = narrow_cast<int>(GetNumReplicasFromPlacementInfo( |
10171 | 185k | replication_info.live_replicas())); |
10172 | 711 | for (const auto& read_replica_placement_info : replication_info.read_replicas()) { |
10173 | 711 | *num_read_replicas += read_replica_placement_info.num_replicas(); |
10174 | 711 | } |
10175 | 185k | } |
10176 | | |
10177 | 2.95k | string CatalogManager::placement_uuid() const { |
10178 | 0 | DCHECK(cluster_config_) << "Missing cluster config for master!"; |
10179 | 2.95k | auto l = cluster_config_->LockForRead(); |
10180 | 2.95k | const ReplicationInfoPB& replication_info = l->pb.replication_info(); |
10181 | 2.95k | return replication_info.live_replicas().placement_uuid(); |
10182 | 2.95k | } |
10183 | | |
10184 | | Status CatalogManager::IsLoadBalanced(const IsLoadBalancedRequestPB* req, |
10185 | 203 | IsLoadBalancedResponsePB* resp) { |
10186 | 203 | if (req->has_expected_num_servers()) { |
10187 | 202 | TSDescriptorVector ts_descs; |
10188 | 202 | master_->ts_manager()->GetAllLiveDescriptors(&ts_descs); |
10189 | | |
10190 | 202 | if (implicit_cast<size_t>(req->expected_num_servers()) > ts_descs.size()) { |
10191 | 9 | Status s = STATUS_SUBSTITUTE(IllegalState, |
10192 | 9 | "Found $0, which is below the expected number of servers $1.", |
10193 | 9 | ts_descs.size(), req->expected_num_servers()); |
10194 | 9 | return SetupError(resp->mutable_error(), MasterErrorPB::CAN_RETRY_LOAD_BALANCE_CHECK, s); |
10195 | 9 | } |
10196 | 194 | } |
10197 | | |
10198 | 194 | Status s = load_balance_policy_->IsIdle(); |
10199 | 194 | if (!s.ok()) { |
10200 | 174 | return SetupError(resp->mutable_error(), MasterErrorPB::CAN_RETRY_LOAD_BALANCE_CHECK, s); |
10201 | 174 | } |
10202 | | |
10203 | 20 | return Status::OK(); |
10204 | 20 | } |
10205 | | |
10206 | | Status CatalogManager::IsLoadBalancerIdle(const IsLoadBalancerIdleRequestPB* req, |
10207 | 2.25k | IsLoadBalancerIdleResponsePB* resp) { |
10208 | 2.25k | Status s = load_balance_policy_->IsIdle(); |
10209 | 2.25k | if (!s.ok()) { |
10210 | 1.72k | return SetupError(resp->mutable_error(), MasterErrorPB::LOAD_BALANCER_RECENTLY_ACTIVE, s); |
10211 | 1.72k | } |
10212 | | |
10213 | 531 | return Status::OK(); |
10214 | 531 | } |
10215 | | |
10216 | | Status CatalogManager::AreLeadersOnPreferredOnly(const AreLeadersOnPreferredOnlyRequestPB* req, |
10217 | 153 | AreLeadersOnPreferredOnlyResponsePB* resp) { |
10218 | | // If we have cluster replication info, then only fetch live tservers (ignore read replicas). |
10219 | 153 | TSDescriptorVector ts_descs; |
10220 | 153 | string live_replicas_placement_uuid = ""; |
10221 | 153 | { |
10222 | 153 | auto l = cluster_config_->LockForRead(); |
10223 | 153 | const ReplicationInfoPB& cluster_replication_info = l->pb.replication_info(); |
10224 | 153 | if (cluster_replication_info.has_live_replicas()) { |
10225 | 116 | live_replicas_placement_uuid = cluster_replication_info.live_replicas().placement_uuid(); |
10226 | 116 | } |
10227 | 153 | } |
10228 | | |
10229 | 153 | { |
10230 | 153 | BlacklistSet blacklist = BlacklistSetFromPB(); |
10231 | 153 | if (live_replicas_placement_uuid.empty()) { |
10232 | 152 | master_->ts_manager()->GetAllLiveDescriptors(&ts_descs, blacklist); |
10233 | 1 | } else { |
10234 | 1 | master_->ts_manager()->GetAllLiveDescriptorsInCluster( |
10235 | 1 | &ts_descs, live_replicas_placement_uuid, |
10236 | 1 | blacklist); |
10237 | 1 | } |
10238 | 153 | } |
10239 | | |
10240 | | // Only need to fetch if txn tables are not using preferred zones. |
10241 | 153 | vector<TableInfoPtr> tables; |
10242 | 153 | if (!FLAGS_transaction_tables_use_preferred_zones) { |
10243 | 153 | tables = master_->catalog_manager()->GetTables(GetTablesMode::kRunning); |
10244 | 153 | } |
10245 | | |
10246 | 153 | auto l = cluster_config_->LockForRead(); |
10247 | 153 | Status s = CatalogManagerUtil::AreLeadersOnPreferredOnly( |
10248 | 153 | ts_descs, l->pb.replication_info(), tables); |
10249 | 153 | if (!s.ok()) { |
10250 | 138 | return SetupError( |
10251 | 138 | resp->mutable_error(), MasterErrorPB::CAN_RETRY_ARE_LEADERS_ON_PREFERRED_ONLY_CHECK, s); |
10252 | 138 | } |
10253 | | |
10254 | 15 | return Status::OK(); |
10255 | 15 | } |
10256 | | |
10257 | 1.23k | int64_t CatalogManager::GetNumRelevantReplicas(const BlacklistPB& blacklist, bool leaders_only) { |
10258 | 1.23k | int64_t res = 0; |
10259 | 1.23k | SharedLock lock(mutex_); |
10260 | 43.0k | for (const TabletInfoMap::value_type& entry : *tablet_map_) { |
10261 | 43.0k | scoped_refptr<TabletInfo> tablet = entry.second; |
10262 | 43.0k | auto l = tablet->LockForRead(); |
10263 | | // Not checking being created on purpose as we do not want initial load to be under accounted. |
10264 | 43.0k | if (!tablet->table() || |
10265 | 43.0k | PREDICT_FALSE(l->is_deleted())) { |
10266 | 0 | continue; |
10267 | 0 | } |
10268 | | |
10269 | 43.0k | auto locs = tablet->GetReplicaLocations(); |
10270 | 66.2k | for (const auto& replica : *locs) { |
10271 | 66.2k | if (leaders_only && replica.second.role != PeerRole::LEADER) { |
10272 | 7.38k | continue; |
10273 | 7.38k | } |
10274 | 180k | for (int i = 0; i < blacklist.hosts_size(); i++) { |
10275 | 144k | if (replica.second.ts_desc->IsRunningOn(blacklist.hosts(i))) { |
10276 | 23.4k | ++res; |
10277 | 23.4k | break; |
10278 | 23.4k | } |
10279 | 144k | } |
10280 | 58.9k | } |
10281 | 43.0k | } |
10282 | | |
10283 | 1.23k | return res; |
10284 | 1.23k | } |
10285 | | |
10286 | | Status CatalogManager::FillHeartbeatResponse(const TSHeartbeatRequestPB* req, |
10287 | 0 | TSHeartbeatResponsePB* resp) { |
10288 | 0 | return Status::OK(); |
10289 | 0 | } |
10290 | | |
10291 | 1.00k | Status CatalogManager::GetLoadMoveCompletionPercent(GetLoadMovePercentResponsePB* resp) { |
10292 | 1.00k | return GetLoadMoveCompletionPercent(resp, false); |
10293 | 1.00k | } |
10294 | | |
10295 | 194 | Status CatalogManager::GetLeaderBlacklistCompletionPercent(GetLoadMovePercentResponsePB* resp) { |
10296 | 194 | return GetLoadMoveCompletionPercent(resp, true); |
10297 | 194 | } |
10298 | | |
10299 | | Status CatalogManager::GetLoadMoveCompletionPercent(GetLoadMovePercentResponsePB* resp, |
10300 | 1.20k | bool blacklist_leader) { |
10301 | 1.20k | auto l = cluster_config_->LockForRead(); |
10302 | | |
10303 | | // Fine to pass in empty defaults if server_blacklist or leader_blacklist is not filled. |
10304 | 1.00k | const BlacklistPB& state = blacklist_leader ? l->pb.leader_blacklist() : l->pb.server_blacklist(); |
10305 | 1.20k | int64_t blacklist_replicas = GetNumRelevantReplicas(state, blacklist_leader); |
10306 | 1.20k | int64_t initial_load = (blacklist_leader) ? |
10307 | 1.00k | state.initial_leader_load(): state.initial_replica_load(); |
10308 | | // If we are starting up and don't find any load on the tservers, return progress as 0. |
10309 | | // We expect that by blacklist_progress_initial_delay_secs time, this should go away and if the |
10310 | | // load is reported as 0 on the blacklisted tservers after this time then it means that |
10311 | | // the transfer is successfully complete. |
10312 | 1.20k | if (blacklist_replicas == 0 && |
10313 | 580 | TimeSinceElectedLeader() <= MonoDelta::FromSeconds(FLAGS_blacklist_progress_initial_delay_secs)) { |
10314 | 466 | LOG(INFO) << "Master leadership has changed. Reporting progress as 0 until the catalog " << |
10315 | 466 | "manager gets the correct estimates of the remaining load on the blacklisted" << |
10316 | 466 | "tservers."; |
10317 | 466 | resp->set_percent(0); |
10318 | 466 | resp->set_total(initial_load); |
10319 | 466 | resp->set_remaining(initial_load); |
10320 | 466 | return Status::OK(); |
10321 | 466 | } |
10322 | | |
10323 | | // On change of master leader, initial_load_ information may be lost temporarily. Reset to |
10324 | | // current value to avoid reporting progress percent as 100. Note that doing so will report |
10325 | | // progress percent as 0 instead. |
10326 | | // TODO(Sanket): This might be no longer relevant after we persist and load the initial load |
10327 | | // on failover. Need to investigate. |
10328 | 737 | if (initial_load < blacklist_replicas) { |
10329 | 0 | LOG(INFO) << Format("Initial load: $0, current load: $1." |
10330 | 0 | " Initial load is less than the current load. Probably a master leader change." |
10331 | 0 | " Reporting progress as 0", state.initial_replica_load(), |
10332 | 0 | blacklist_replicas); |
10333 | 0 | initial_load = blacklist_replicas; |
10334 | 0 | } |
10335 | | |
10336 | 737 | LOG(INFO) << "Blacklisted count " << blacklist_replicas |
10337 | 737 | << " across " << state.hosts_size() |
10338 | 737 | << " servers, with initial load " << initial_load; |
10339 | | |
10340 | | // Case when a blacklisted servers did not have any starting load. |
10341 | 737 | if (initial_load == 0) { |
10342 | 32 | resp->set_percent(100); |
10343 | 32 | return Status::OK(); |
10344 | 32 | } |
10345 | | |
10346 | 705 | resp->set_percent( |
10347 | 705 | 100 - (static_cast<double>(blacklist_replicas) * 100 / initial_load)); |
10348 | 705 | resp->set_remaining(blacklist_replicas); |
10349 | 705 | resp->set_total(initial_load); |
10350 | | |
10351 | 705 | return Status::OK(); |
10352 | 705 | } |
10353 | | |
10354 | 2.10k | void CatalogManager::AbortAndWaitForAllTasks(const vector<scoped_refptr<TableInfo>>& tables) { |
10355 | 1.85k | for (const auto& t : tables) { |
10356 | 0 | VLOG(1) << "Aborting tasks for table " << t->ToString(); |
10357 | 1.85k | t->AbortTasksAndClose(); |
10358 | 1.85k | } |
10359 | 1.85k | for (const auto& t : tables) { |
10360 | 0 | VLOG(1) << "Waiting on Aborting tasks for table " << t->ToString(); |
10361 | 1.85k | t->WaitTasksCompletion(); |
10362 | 1.85k | } |
10363 | 0 | VLOG(1) << "Waiting on Aborting tasks done"; |
10364 | 2.10k | } |
10365 | | |
10366 | 242k | void CatalogManager::HandleNewTableId(const TableId& table_id) { |
10367 | 242k | if (table_id == kPgProcTableId) { |
10368 | | // Needed to track whether initdb has started running. |
10369 | 363 | pg_proc_exists_.store(true, std::memory_order_release); |
10370 | 363 | } |
10371 | 242k | } |
10372 | | |
10373 | 243k | scoped_refptr<TableInfo> CatalogManager::NewTableInfo(TableId id) { |
10374 | 243k | return make_scoped_refptr<TableInfo>(id, tasks_tracker_); |
10375 | 243k | } |
10376 | | |
10377 | 212k | Status CatalogManager::ScheduleTask(std::shared_ptr<RetryingTSRpcTask> task) { |
10378 | 212k | Status s = async_task_pool_->SubmitFunc([task]() { |
10379 | 212k | WARN_NOT_OK(task->Run(), "Failed task"); |
10380 | 212k | }); |
10381 | | // If we are not able to enqueue, abort the task. |
10382 | 212k | if (!s.ok()) { |
10383 | 0 | task->AbortAndReturnPrevState(s); |
10384 | 0 | } |
10385 | 212k | return s; |
10386 | 212k | } |
10387 | | |
10388 | | Status CatalogManager::CollectTable( |
10389 | | const TableDescription& table_description, |
10390 | | CollectFlags flags, |
10391 | | std::vector<TableDescription>* all_tables, |
10392 | 7 | std::unordered_set<NamespaceId>* parent_colocated_table_ids) { |
10393 | 7 | auto lock = table_description.table_info->LockForRead(); |
10394 | 7 | if (lock->started_hiding()) { |
10395 | 0 | VLOG_WITH_PREFIX_AND_FUNC(4) |
10396 | 0 | << "Rejected hidden table: " << AsString(table_description.table_info); |
10397 | 0 | return Status::OK(); |
10398 | 0 | } |
10399 | 7 | if (lock->started_deleting()) { |
10400 | 0 | VLOG_WITH_PREFIX_AND_FUNC(4) |
10401 | 0 | << "Rejected deleted table: " << AsString(table_description.table_info); |
10402 | 0 | return Status::OK(); |
10403 | 0 | } |
10404 | 7 | if (flags.Test(CollectFlag::kIncludeParentColocatedTable) && lock->pb.colocated()) { |
10405 | | // If a table is colocated, add its parent colocated table as well. |
10406 | 0 | const auto parent_table_id = |
10407 | 0 | table_description.namespace_info->id() + kColocatedParentTableIdSuffix; |
10408 | 0 | auto result = parent_colocated_table_ids->insert(parent_table_id); |
10409 | 0 | if (result.second) { |
10410 | | // We have not processed this parent table id yet, so do that now. |
10411 | 0 | TableIdentifierPB parent_table_pb; |
10412 | 0 | parent_table_pb.set_table_id(parent_table_id); |
10413 | 0 | parent_table_pb.mutable_namespace_()->set_id(table_description.namespace_info->id()); |
10414 | 0 | all_tables->push_back(VERIFY_RESULT(DescribeTable( |
10415 | 0 | parent_table_pb, flags.Test(CollectFlag::kSucceedIfCreateInProgress)))); |
10416 | 0 | } |
10417 | 0 | } |
10418 | 7 | all_tables->push_back(table_description); |
10419 | | |
10420 | 7 | if (flags.Test(CollectFlag::kAddIndexes)) { |
10421 | 0 | TRACE(Substitute("Locking object with id $0", table_description.table_info->id())); |
10422 | |
|
10423 | 0 | if (lock->is_index()) { |
10424 | 0 | return STATUS(InvalidArgument, "Expected table, but found index", |
10425 | 0 | table_description.table_info->id(), |
10426 | 0 | MasterError(MasterErrorPB::INVALID_TABLE_TYPE)); |
10427 | 0 | } |
10428 | | |
10429 | 0 | if (lock->table_type() == PGSQL_TABLE_TYPE) { |
10430 | 0 | return STATUS(InvalidArgument, "Getting indexes for YSQL table is not supported", |
10431 | 0 | table_description.table_info->id(), |
10432 | 0 | MasterError(MasterErrorPB::INVALID_TABLE_TYPE)); |
10433 | 0 | } |
10434 | | |
10435 | 0 | auto collect_index_flags = flags; |
10436 | | // Don't need to collect indexes for index. |
10437 | 0 | collect_index_flags.Reset(CollectFlag::kAddIndexes); |
10438 | 0 | for (const auto& index_info : lock->pb.indexes()) { |
10439 | 0 | LOG_IF(DFATAL, table_description.table_info->id() != index_info.indexed_table_id()) |
10440 | 0 | << "Wrong indexed table id in index descriptor"; |
10441 | 0 | TableIdentifierPB index_id_pb; |
10442 | 0 | index_id_pb.set_table_id(index_info.table_id()); |
10443 | 0 | index_id_pb.mutable_namespace_()->set_id(table_description.namespace_info->id()); |
10444 | 0 | auto index_description = VERIFY_RESULT(DescribeTable( |
10445 | 0 | index_id_pb, flags.Test(CollectFlag::kSucceedIfCreateInProgress))); |
10446 | 0 | RETURN_NOT_OK(CollectTable( |
10447 | 0 | index_description, collect_index_flags, all_tables, parent_colocated_table_ids)); |
10448 | 0 | } |
10449 | 0 | } |
10450 | | |
10451 | 7 | return Status::OK(); |
10452 | 7 | } |
10453 | | |
10454 | | Result<vector<TableDescription>> CatalogManager::CollectTables( |
10455 | | const google::protobuf::RepeatedPtrField<TableIdentifierPB>& table_identifiers, |
10456 | | CollectFlags flags, |
10457 | 7 | std::unordered_set<NamespaceId>* namespaces) { |
10458 | 7 | std::vector<std::pair<TableInfoPtr, CollectFlags>> table_with_flags; |
10459 | | |
10460 | 7 | { |
10461 | 7 | SharedLock lock(mutex_); |
10462 | 7 | for (const auto& table_id_pb : table_identifiers) { |
10463 | 7 | if (table_id_pb.table_name().empty() && table_id_pb.table_id().empty() && |
10464 | 0 | table_id_pb.has_namespace_()) { |
10465 | 0 | auto namespace_info = FindNamespaceUnlocked(table_id_pb.namespace_()); |
10466 | 0 | if (!namespace_info.ok()) { |
10467 | 0 | if (namespace_info.status().IsNotFound()) { |
10468 | 0 | continue; |
10469 | 0 | } |
10470 | 0 | return namespace_info.status(); |
10471 | 0 | } |
10472 | 0 | if (namespaces) { |
10473 | 0 | namespaces->insert((**namespace_info).id()); |
10474 | 0 | } |
10475 | | |
10476 | |
|
10477 | 0 | auto ns_collect_flags = flags; |
10478 | | // Don't collect indexes, since they should be in the same namespace and will be collected |
10479 | | // as regular tables. |
10480 | | // It is necessary because we don't support kAddIndexes for YSQL tables. |
10481 | 0 | ns_collect_flags.Reset(CollectFlag::kAddIndexes); |
10482 | 0 | VLOG_WITH_PREFIX_AND_FUNC(1) |
10483 | 0 | << "Collecting all tables from: " << (**namespace_info).ToString() << ", specified as: " |
10484 | 0 | << table_id_pb.namespace_().ShortDebugString(); |
10485 | 0 | for (const auto& id_and_table : *table_ids_map_) { |
10486 | 0 | if (id_and_table.second->is_system()) { |
10487 | 0 | VLOG_WITH_PREFIX_AND_FUNC(4) << "Rejected system table: " << AsString(id_and_table); |
10488 | 0 | continue; |
10489 | 0 | } |
10490 | 0 | auto lock = id_and_table.second->LockForRead(); |
10491 | 0 | if (lock->namespace_id() != (**namespace_info).id()) { |
10492 | 0 | VLOG_WITH_PREFIX_AND_FUNC(4) |
10493 | 0 | << "Rejected table from other namespace: " << AsString(id_and_table); |
10494 | 0 | continue; |
10495 | 0 | } |
10496 | 0 | VLOG_WITH_PREFIX_AND_FUNC(4) << "Accepted: " << AsString(id_and_table); |
10497 | 0 | table_with_flags.emplace_back(id_and_table.second, ns_collect_flags); |
10498 | 0 | } |
10499 | 7 | } else { |
10500 | 7 | auto table = VERIFY_RESULT(FindTableUnlocked(table_id_pb)); |
10501 | 0 | VLOG_WITH_PREFIX_AND_FUNC(1) << "Collecting table: " << table->ToString(); |
10502 | 7 | table_with_flags.emplace_back(table, flags); |
10503 | 7 | } |
10504 | 7 | } |
10505 | 7 | } |
10506 | | |
10507 | 7 | std::sort(table_with_flags.begin(), table_with_flags.end(), [](const auto& p1, const auto& p2) { |
10508 | 0 | return p1.first->id() < p2.first->id(); |
10509 | 0 | }); |
10510 | 7 | std::vector<TableDescription> all_tables; |
10511 | 7 | std::unordered_set<NamespaceId> parent_colocated_table_ids; |
10512 | 7 | const TableId* table_id = nullptr; |
10513 | 7 | for (auto& table_and_flags : table_with_flags) { |
10514 | 7 | if (table_id && *table_id == table_and_flags.first->id()) { |
10515 | 0 | return STATUS_FORMAT(InternalError, "Table collected twice $0", *table_id); |
10516 | 0 | } |
10517 | 7 | auto description = VERIFY_RESULT(DescribeTable( |
10518 | 7 | table_and_flags.first, |
10519 | 7 | table_and_flags.second.Test(CollectFlag::kSucceedIfCreateInProgress))); |
10520 | 7 | RETURN_NOT_OK(CollectTable( |
10521 | 7 | description, table_and_flags.second, &all_tables, &parent_colocated_table_ids)); |
10522 | 7 | table_id = &table_and_flags.first->id(); |
10523 | 7 | } |
10524 | | |
10525 | 7 | return all_tables; |
10526 | 7 | } |
10527 | | |
10528 | | Result<std::vector<TableDescription>> CatalogManager::CollectTables( |
10529 | | const google::protobuf::RepeatedPtrField<TableIdentifierPB>& table_identifiers, |
10530 | | bool add_indexes, |
10531 | 7 | bool include_parent_colocated_table) { |
10532 | 7 | CollectFlags flags; |
10533 | 7 | flags.SetIf(CollectFlag::kAddIndexes, add_indexes); |
10534 | 7 | flags.SetIf(CollectFlag::kIncludeParentColocatedTable, include_parent_colocated_table); |
10535 | 7 | return CollectTables(table_identifiers, flags); |
10536 | 7 | } |
10537 | | |
10538 | 2.00k | Status CatalogManager::GetYQLPartitionsVTable(std::shared_ptr<SystemTablet>* tablet) { |
10539 | 2.00k | scoped_refptr<TableInfo> table = FindPtrOrNull(table_names_map_, |
10540 | 2.00k | std::make_pair(kSystemNamespaceId, kSystemPartitionsTableName)); |
10541 | 2.00k | SCHECK(table != nullptr, NotFound, "YQL system.partitions table not found"); |
10542 | | |
10543 | 2.00k | auto tablets = table->GetTablets(); |
10544 | 2.00k | SCHECK(tablets.size() == 1, NotFound, "YQL system.partitions tablet not found"); |
10545 | 2.00k | *tablet = std::dynamic_pointer_cast<SystemTablet>( |
10546 | 2.00k | VERIFY_RESULT(GetSystemTablet(tablets[0]->tablet_id()))); |
10547 | 2.00k | return Status::OK(); |
10548 | 2.00k | } |
10549 | | |
10550 | 15.9k | void CatalogManager::RebuildYQLSystemPartitions() { |
10551 | 15.9k | if (YQLPartitionsVTable::GeneratePartitionsVTableWithBgTask() || |
10552 | 15.9k | YQLPartitionsVTable::GeneratePartitionsVTableOnChanges()) { |
10553 | 15.9k | SCOPED_LEADER_SHARED_LOCK(l, this); |
10554 | 15.9k | if (l.catalog_status().ok() && l.leader_status().ok()) { |
10555 | 5.03k | if (system_partitions_tablet_ != nullptr) { |
10556 | 5.03k | Status s; |
10557 | 5.03k | if (YQLPartitionsVTable::GeneratePartitionsVTableWithBgTask()) { |
10558 | | // If we are not generating the vtable on changes, then we need to do a full refresh. |
10559 | 2 | s = ResultToStatus(GetYqlPartitionsVtable().GenerateAndCacheData()); |
10560 | 5.02k | } else { |
10561 | | // Otherwise, we can simply update the cached vtable with the internal map. |
10562 | 5.02k | s = GetYqlPartitionsVtable().UpdateCache(); |
10563 | 5.02k | } |
10564 | 5.03k | if (!s.ok()) { |
10565 | 0 | LOG(ERROR) << "Error rebuilding system.partitions: " << s.ToString(); |
10566 | 0 | } |
10567 | 0 | } else { |
10568 | 0 | LOG(ERROR) << "Error finding system.partitions vtable."; |
10569 | 0 | } |
10570 | 5.03k | } |
10571 | 15.9k | } |
10572 | | |
10573 | 15.9k | auto wait_time = FLAGS_partitions_vtable_cache_refresh_secs * 1s; |
10574 | 15.9k | if (wait_time <= 0s) { |
10575 | 15.8k | wait_time = kDefaultYQLPartitionsRefreshBgTaskSleep; |
10576 | 15.8k | } |
10577 | 10.6k | refresh_yql_partitions_task_.Schedule([this](const Status& status) { |
10578 | 10.6k | WARN_NOT_OK( |
10579 | 10.6k | background_tasks_thread_pool_->SubmitFunc([this]() { RebuildYQLSystemPartitions(); }), |
10580 | 10.6k | "Failed to schedule: RebuildYQLSystemPartitions"); |
10581 | 10.6k | }, wait_time); |
10582 | 15.9k | } |
10583 | | |
10584 | 90.0k | Status CatalogManager::SysCatalogRespectLeaderAffinity() { |
10585 | 90.0k | auto l = cluster_config_->LockForRead(); |
10586 | | |
10587 | 90.0k | const auto& affinitized_leaders = l->pb.replication_info().affinitized_leaders(); |
10588 | 90.0k | if (affinitized_leaders.empty()) { |
10589 | 89.8k | return Status::OK(); |
10590 | 89.8k | } |
10591 | | |
10592 | 192 | for (const CloudInfoPB& cloud_info : affinitized_leaders) { |
10593 | | // Do nothing if already in an affinitized zone. |
10594 | 192 | if (CatalogManagerUtil::IsCloudInfoEqual(cloud_info, server_registration_.cloud_info())) { |
10595 | 86 | return Status::OK(); |
10596 | 86 | } |
10597 | 192 | } |
10598 | | |
10599 | | // Not in affinitized zone, try finding a master to send a step down request to. |
10600 | 80 | std::vector<ServerEntryPB> masters; |
10601 | 80 | RETURN_NOT_OK(master_->ListMasters(&masters)); |
10602 | | |
10603 | 142 | for (const ServerEntryPB& master : masters) { |
10604 | 142 | auto master_cloud_info = master.registration().cloud_info(); |
10605 | | |
10606 | 180 | for (const CloudInfoPB& config_cloud_info : affinitized_leaders) { |
10607 | 180 | if (CatalogManagerUtil::IsCloudInfoEqual(config_cloud_info, master_cloud_info)) { |
10608 | 0 | if (PREDICT_FALSE( |
10609 | 0 | GetAtomicFlag(&FLAGS_TEST_crash_server_on_sys_catalog_leader_affinity_move))) { |
10610 | 0 | LOG_WITH_PREFIX(FATAL) << "For test: Crashing the server instead of performing sys " |
10611 | 0 | "catalog leader affinity move."; |
10612 | 0 | } |
10613 | 0 | YB_LOG_WITH_PREFIX_EVERY_N_SECS(INFO, 10) |
10614 | 0 | << "Sys catalog tablet is not in an affinitized zone, " |
10615 | 0 | << "sending step down request to master uuid " |
10616 | 0 | << master.instance_id().permanent_uuid() |
10617 | 0 | << " in zone " |
10618 | 0 | << TSDescriptor::generate_placement_id(master_cloud_info); |
10619 | 0 | std::shared_ptr<TabletPeer> tablet_peer; |
10620 | 0 | RETURN_NOT_OK(GetTabletPeer(sys_catalog_->tablet_id(), &tablet_peer)); |
10621 | |
|
10622 | 0 | consensus::LeaderStepDownRequestPB req; |
10623 | 0 | req.set_tablet_id(sys_catalog_->tablet_id()); |
10624 | 0 | req.set_dest_uuid(sys_catalog_->tablet_peer()->permanent_uuid()); |
10625 | 0 | req.set_new_leader_uuid(master.instance_id().permanent_uuid()); |
10626 | |
|
10627 | 0 | consensus::LeaderStepDownResponsePB resp; |
10628 | 0 | RETURN_NOT_OK(tablet_peer->consensus()->StepDown(&req, &resp)); |
10629 | 0 | if (resp.has_error()) { |
10630 | 0 | YB_LOG_WITH_PREFIX_EVERY_N_SECS(INFO, 10) << "Step down failed: " |
10631 | 0 | << resp.error().status().message(); |
10632 | 0 | break; |
10633 | 0 | } |
10634 | 0 | LOG_WITH_PREFIX(INFO) << "Successfully stepped down to new master"; |
10635 | 0 | return Status::OK(); |
10636 | 0 | } |
10637 | 180 | } |
10638 | 142 | } |
10639 | | |
10640 | 80 | return STATUS(NotFound, "Couldn't step down to a master in an affinitized zone"); |
10641 | 80 | } |
10642 | | |
10643 | 149k | BlacklistSet CatalogManager::BlacklistSetFromPB() const { |
10644 | 149k | auto l = cluster_config_->LockForRead(); |
10645 | | |
10646 | 149k | const auto& blacklist_pb = l->pb.server_blacklist(); |
10647 | 149k | BlacklistSet blacklist_set; |
10648 | 149k | for (int i = 0; i < blacklist_pb.hosts_size(); i++) { |
10649 | 104 | blacklist_set.insert(HostPortFromPB(blacklist_pb.hosts(i))); |
10650 | 104 | } |
10651 | | |
10652 | 149k | return blacklist_set; |
10653 | 149k | } |
10654 | | |
10655 | | void CatalogManager::ProcessTabletStorageMetadata( |
10656 | | const std::string& ts_uuid, |
10657 | 285k | const TabletDriveStorageMetadataPB& storage_metadata) { |
10658 | 285k | const string& tablet_id = storage_metadata.tablet_id(); |
10659 | 285k | scoped_refptr<TabletInfo> tablet; |
10660 | 285k | { |
10661 | 285k | SharedLock lock(mutex_); |
10662 | 285k | tablet = FindPtrOrNull(*tablet_map_, tablet_id); |
10663 | 285k | } |
10664 | 285k | if (!tablet) { |
10665 | 0 | VLOG(1) << Format("Tablet $0 not found on ts $1", tablet_id, ts_uuid); |
10666 | 0 | return; |
10667 | 0 | } |
10668 | 285k | TabletReplicaDriveInfo drive_info{ |
10669 | 285k | storage_metadata.sst_file_size(), |
10670 | 285k | storage_metadata.wal_file_size(), |
10671 | 285k | storage_metadata.uncompressed_sst_file_size(), |
10672 | 285k | storage_metadata.may_have_orphaned_post_split_data()}; |
10673 | 285k | tablet->UpdateReplicaDriveInfo(ts_uuid, drive_info); |
10674 | 285k | } |
10675 | | |
10676 | 76.8k | void CatalogManager::CheckTableDeleted(const TableInfoPtr& table) { |
10677 | 76.8k | if (!FLAGS_master_drop_table_after_task_response) { |
10678 | 0 | return; |
10679 | 0 | } |
10680 | | // Since this is called after every successful async DeleteTablet, it's possible if all tasks |
10681 | | // complete, for us to mark the table as DELETED/HIDDEN asap. This is desirable as clients will |
10682 | | // wait for this before returning success to the user. |
10683 | | // |
10684 | | // However, if tasks fail, timeout, or are aborted, we still have the background thread as a |
10685 | | // catch all. |
10686 | 76.8k | auto lock = MaybeTransitionTableToDeleted(table); |
10687 | 76.8k | if (!lock.locked()) { |
10688 | 74.1k | return; |
10689 | 74.1k | } |
10690 | 2.76k | Status s = sys_catalog_->Upsert(leader_ready_term(), table); |
10691 | 2.76k | if (!s.ok()) { |
10692 | 0 | LOG_WITH_PREFIX(WARNING) |
10693 | 0 | << "Error marking table as " |
10694 | 0 | << (table->LockForRead()->started_deleting() ? "DELETED" : "HIDDEN") << ": " << s; |
10695 | 0 | return; |
10696 | 0 | } |
10697 | 2.76k | lock.Commit(); |
10698 | 2.76k | } |
10699 | | |
10700 | 288k | const YQLPartitionsVTable& CatalogManager::GetYqlPartitionsVtable() const { |
10701 | 288k | return down_cast<const YQLPartitionsVTable&>(system_partitions_tablet_->QLStorage()); |
10702 | 288k | } |
10703 | | |
10704 | | void CatalogManager::InitializeTableLoadState( |
10705 | 12.6k | const TableId& table_id, TSDescriptorVector ts_descs, CMPerTableLoadState* state) { |
10706 | 37.1k | for (const auto& ts : ts_descs) { |
10707 | | // Touch every tserver with 0 load. |
10708 | 37.1k | state->per_ts_load_[ts->permanent_uuid()]; |
10709 | | // Insert into the sorted list. |
10710 | 37.1k | state->sorted_load_.emplace_back(ts->permanent_uuid()); |
10711 | 37.1k | } |
10712 | | |
10713 | 12.6k | auto table_info = GetTableInfo(table_id); |
10714 | | |
10715 | 12.6k | if (!table_info) { |
10716 | 0 | return; |
10717 | 0 | } |
10718 | 12.6k | CatalogManagerUtil::FillTableLoadState(table_info, state); |
10719 | 12.6k | } |
10720 | | |
10721 | | void CatalogManager::InitializeGlobalLoadState( |
10722 | 11.8k | TSDescriptorVector ts_descs, CMGlobalLoadState* state) { |
10723 | 34.6k | for (const auto& ts : ts_descs) { |
10724 | | // Touch every tserver with 0 load. |
10725 | 34.6k | state->per_ts_load_[ts->permanent_uuid()]; |
10726 | 34.6k | } |
10727 | | |
10728 | 11.8k | SharedLock l(mutex_); |
10729 | 2.71M | for (const auto& id_and_info : *table_ids_map_) { |
10730 | | // Ignore system, colocated and deleting/deleted tables. |
10731 | 2.71M | { |
10732 | 2.71M | auto l = id_and_info.second->LockForRead(); |
10733 | 2.71M | if (IsSystemTable(*(id_and_info.second)) || |
10734 | 90.2k | id_and_info.second->IsColocatedUserTable() || |
10735 | 2.67M | l->started_deleting()) { |
10736 | 2.67M | continue; |
10737 | 2.67M | } |
10738 | 42.0k | } |
10739 | 42.0k | CatalogManagerUtil::FillTableLoadState(id_and_info.second, state); |
10740 | 42.0k | } |
10741 | 11.8k | } |
10742 | | |
10743 | | } // namespace master |
10744 | | } // namespace yb |