YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/master/catalog_manager.cc
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
//
// The following only applies to changes made to this file as part of YugaByte development.
//
// Portions Copyright (c) YugaByte, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
// in compliance with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied.  See the License for the specific language governing permissions and limitations
// under the License.
//
// ================================================================================================
//
// The catalog manager handles the current list of tables
// and tablets in the cluster, as well as their current locations.
// Since most operations in the master go through these data
// structures, locking is carefully managed here to prevent unnecessary
// contention and deadlocks:
//
// - each structure has an internal spinlock used for operations that
//   are purely in-memory (eg the current status of replicas)
// - data that is persisted on disk is stored in separate PersistentTable(t)Info
//   structs. These are managed using copy-on-write so that writers may block
//   writing them back to disk while not impacting concurrent readers.
//
// Usage rules:
// - You may obtain READ locks in any order. READ locks should never block,
//   since they only conflict with COMMIT which is a purely in-memory operation.
//   Thus they are deadlock-free.
// - If you need a WRITE lock on both a table and one or more of its tablets,
//   acquire the lock on the table first. This strict ordering prevents deadlocks.
//
// ================================================================================================

#include "yb/master/catalog_manager.h"

#include <stdlib.h>

#include <algorithm>
#include <atomic>
#include <bitset>
#include <functional>
#include <memory>
#include <mutex>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>

#include <boost/optional.hpp>
#include <glog/logging.h>

#include "yb/client/client-internal.h"
#include "yb/client/client.h"
#include "yb/client/schema.h"
#include "yb/client/universe_key_client.h"

#include "yb/common/common.pb.h"
#include "yb/common/common_flags.h"
#include "yb/common/constants.h"
#include "yb/common/key_encoder.h"
#include "yb/common/partial_row.h"
#include "yb/common/partition.h"
#include "yb/common/ql_type.h"
#include "yb/common/roles_permissions.h"
#include "yb/common/schema.h"
#include "yb/common/wire_protocol.h"

#include "yb/consensus/consensus.h"
#include "yb/consensus/consensus.pb.h"
#include "yb/consensus/consensus_util.h"
#include "yb/consensus/metadata.pb.h"
#include "yb/consensus/opid_util.h"
#include "yb/consensus/quorum_util.h"

#include "yb/docdb/doc_key.h"

#include "yb/gutil/atomicops.h"
#include "yb/gutil/bind.h"
#include "yb/gutil/casts.h"
#include "yb/gutil/map-util.h"
#include "yb/gutil/mathlimits.h"
#include "yb/gutil/stl_util.h"
#include "yb/gutil/strings/escaping.h"
#include "yb/gutil/strings/join.h"
#include "yb/gutil/strings/substitute.h"
#include "yb/gutil/sysinfo.h"
#include "yb/gutil/walltime.h"

#include "yb/master/master_fwd.h"
#include "yb/master/async_rpc_tasks.h"
#include "yb/master/backfill_index.h"
#include "yb/master/catalog_entity_info.h"
#include "yb/master/catalog_loaders.h"
#include "yb/master/catalog_manager-internal.h"
#include "yb/master/catalog_manager_bg_tasks.h"
#include "yb/master/catalog_manager_util.h"
#include "yb/master/cluster_balance.h"
#include "yb/master/encryption_manager.h"
#include "yb/master/master.h"
#include "yb/master/master_admin.pb.h"
#include "yb/master/master_client.pb.h"
#include "yb/master/master_cluster.proxy.h"
#include "yb/master/master_dcl.pb.h"
#include "yb/master/master_ddl.pb.h"
#include "yb/master/master_encryption.pb.h"
#include "yb/master/master_error.h"
#include "yb/master/master_heartbeat.pb.h"
#include "yb/master/master_replication.pb.h"
#include "yb/master/master_util.h"
#include "yb/master/permissions_manager.h"
#include "yb/master/scoped_leader_shared_lock-internal.h"
#include "yb/master/sys_catalog.h"
#include "yb/master/sys_catalog_constants.h"
#include "yb/master/ts_descriptor.h"
#include "yb/master/yql_aggregates_vtable.h"
#include "yb/master/yql_auth_resource_role_permissions_index.h"
#include "yb/master/yql_auth_role_permissions_vtable.h"
#include "yb/master/yql_auth_roles_vtable.h"
#include "yb/master/yql_columns_vtable.h"
#include "yb/master/yql_empty_vtable.h"
#include "yb/master/yql_functions_vtable.h"
#include "yb/master/yql_indexes_vtable.h"
#include "yb/master/yql_keyspaces_vtable.h"
#include "yb/master/yql_local_vtable.h"
#include "yb/master/yql_partitions_vtable.h"
#include "yb/master/yql_peers_vtable.h"
#include "yb/master/yql_size_estimates_vtable.h"
#include "yb/master/yql_tables_vtable.h"
#include "yb/master/yql_triggers_vtable.h"
#include "yb/master/yql_types_vtable.h"
#include "yb/master/yql_views_vtable.h"
#include "yb/master/ysql_transaction_ddl.h"

#include "yb/rpc/messenger.h"
#include "yb/rpc/rpc_controller.h"

#include "yb/tablet/operations/change_metadata_operation.h"
#include "yb/tablet/tablet.h"
#include "yb/tablet/tablet_metadata.h"
#include "yb/tablet/tablet_peer.h"
#include "yb/tablet/tablet_retention_policy.h"

#include "yb/tserver/remote_bootstrap_client.h"
#include "yb/tserver/ts_tablet_manager.h"
#include "yb/tserver/tserver_error.h"

#include "yb/util/atomic.h"
#include "yb/util/countdown_latch.h"
#include "yb/util/debug-util.h"
#include "yb/util/debug/trace_event.h"
#include "yb/util/flag_tags.h"
#include "yb/util/format.h"
#include "yb/util/hash_util.h"
#include "yb/util/locks.h"
#include "yb/util/math_util.h"
#include "yb/util/metrics.h"
#include "yb/util/monotime.h"
#include "yb/util/net/net_util.h"
#include "yb/util/oid_generator.h"
#include "yb/util/random_util.h"
#include "yb/util/rw_mutex.h"
#include "yb/util/semaphore.h"
#include "yb/util/shared_lock.h"
#include "yb/util/size_literals.h"
#include "yb/util/status.h"
#include "yb/util/status_format.h"
#include "yb/util/status_log.h"
#include "yb/util/stopwatch.h"
#include "yb/util/string_util.h"
#include "yb/util/sync_point.h"
#include "yb/util/thread.h"
#include "yb/util/threadpool.h"
#include "yb/util/trace.h"
#include "yb/util/tsan_util.h"
#include "yb/util/uuid.h"

#include "yb/yql/pgwrapper/pg_wrapper.h"
#include "yb/yql/redis/redisserver/redis_constants.h"

using namespace std::literals;
using namespace yb::size_literals;

DEFINE_int32(master_ts_rpc_timeout_ms, 30 * 1000,  // 30 sec
             "Timeout used for the Master->TS async rpc calls.");
TAG_FLAG(master_ts_rpc_timeout_ms, advanced);

DEFINE_int32(tablet_creation_timeout_ms, 30 * 1000,  // 30 sec
             "Timeout used by the master when attempting to create tablet "
             "replicas during table creation.");
TAG_FLAG(tablet_creation_timeout_ms, advanced);

DEFINE_test_flag(bool, disable_tablet_deletion, false,
                 "Whether catalog manager should disable tablet deletion.");

DEFINE_bool(catalog_manager_wait_for_new_tablets_to_elect_leader, true,
            "Whether the catalog manager should wait for a newly created tablet to "
            "elect a leader before considering it successfully created. "
            "This is disabled in some tests where we explicitly manage leader "
            "election.");
TAG_FLAG(catalog_manager_wait_for_new_tablets_to_elect_leader, hidden);

DEFINE_int32(catalog_manager_inject_latency_in_delete_table_ms, 0,
             "Number of milliseconds that the master will sleep in DeleteTable.");
TAG_FLAG(catalog_manager_inject_latency_in_delete_table_ms, hidden);

DECLARE_int32(catalog_manager_bg_task_wait_ms);

DEFINE_int32(replication_factor, 3,
             "Default number of replicas for tables that do not have the num_replicas set.");
TAG_FLAG(replication_factor, advanced);

DEFINE_int32(max_create_tablets_per_ts, 50,
             "The number of tablets per TS that can be requested for a new table.");
TAG_FLAG(max_create_tablets_per_ts, advanced);

DEFINE_int32(catalog_manager_report_batch_size, 1,
             "The max number of tablets evaluated in the heartbeat as a single SysCatalog update.");
TAG_FLAG(catalog_manager_report_batch_size, advanced);

DEFINE_int32(master_failover_catchup_timeout_ms, 30 * 1000 * yb::kTimeMultiplier,  // 30 sec
             "Amount of time to give a newly-elected leader master to load"
             " the previous master's metadata and become active. If this time"
             " is exceeded, the node crashes.");
TAG_FLAG(master_failover_catchup_timeout_ms, advanced);
TAG_FLAG(master_failover_catchup_timeout_ms, experimental);

DEFINE_bool(master_tombstone_evicted_tablet_replicas, true,
            "Whether the Master should tombstone (delete) tablet replicas that "
            "are no longer part of the latest reported raft config.");
TAG_FLAG(master_tombstone_evicted_tablet_replicas, hidden);
DECLARE_bool(master_ignore_deleted_on_load);

// Temporary.  Can be removed after long-run testing.
DEFINE_bool(master_ignore_stale_cstate, true,
            "Whether Master processes the raft config when the version is lower.");
TAG_FLAG(master_ignore_stale_cstate, hidden);

DEFINE_bool(catalog_manager_check_ts_count_for_create_table, true,
            "Whether the master should ensure that there are enough live tablet "
            "servers to satisfy the provided replication count before allowing "
            "a table to be created.");
TAG_FLAG(catalog_manager_check_ts_count_for_create_table, hidden);

DEFINE_test_flag(bool, catalog_manager_check_yql_partitions_exist_for_is_create_table_done, true,
                 "Whether the master should ensure that all of a table's tablets are "
                 "in the YQL system.partitions vtable during the IsCreateTableDone check.");

METRIC_DEFINE_gauge_uint32(cluster, num_tablet_servers_live,
                           "Number of live tservers in the cluster", yb::MetricUnit::kUnits,
                           "The number of tablet servers that have responded or done a heartbeat "
                           "in the time interval defined by the gflag "
                           "FLAGS_tserver_unresponsive_timeout_ms.");

METRIC_DEFINE_gauge_uint32(cluster, num_tablet_servers_dead,
                           "Number of dead tservers in the cluster", yb::MetricUnit::kUnits,
                           "The number of tablet servers that have not responded or done a "
                           "heartbeat in the time interval defined by the gflag "
                           "FLAGS_tserver_unresponsive_timeout_ms.");

DEFINE_test_flag(uint64, inject_latency_during_remote_bootstrap_secs, 0,
                 "Number of seconds to sleep during a remote bootstrap.");

DEFINE_test_flag(uint64, inject_latency_during_tablet_report_ms, 0,
                 "Number of milliseconds to sleep during the processing of a tablet batch.");

DEFINE_test_flag(bool, catalog_manager_simulate_system_table_create_failure, false,
                 "This is only used in tests to simulate a failure where the table information is "
                 "persisted in syscatalog, but the tablet information is not yet persisted and "
                 "there is a failure.");

DEFINE_string(cluster_uuid, "", "Cluster UUID to be used by this cluster");
TAG_FLAG(cluster_uuid, hidden);

DECLARE_int32(yb_num_shards_per_tserver);

DEFINE_int32(transaction_table_num_tablets, 0,
             "Number of tablets to use when creating the transaction status table. "
             "0 to use transaction_table_num_tablets_per_tserver.");

DEFINE_int32(transaction_table_num_tablets_per_tserver, kAutoDetectNumShardsPerTServer,
    "The default number of tablets per tablet server for transaction status table. If the value is "
    "-1, the system automatically determines an appropriate value based on number of CPU cores.");

DEFINE_bool(auto_create_local_transaction_tables, true,
            "Whether or not to create local transaction status tables automatically on table "
            "creation with a tablespace with placement specified.");

DEFINE_test_flag(bool, name_transaction_tables_with_tablespace_id, false,
                 "This is only used in tests to make associating automatically created transaction "
                 "tables with their tablespaces easier, and causes transaction tables created "
                 "automatically for tablespaces to include the tablespace oid in their names.");

DEFINE_bool(master_enable_metrics_snapshotter, false, "Should metrics snapshotter be enabled");

DEFINE_int32(metrics_snapshots_table_num_tablets, 0,
             "Number of tablets to use when creating the metrics snapshots table. "
             "0 to use the same default num tablets as for regular tables.");

DEFINE_bool(disable_index_backfill, false,
    "A kill switch to disable multi-stage backfill for YCQL indexes.");
TAG_FLAG(disable_index_backfill, runtime);
TAG_FLAG(disable_index_backfill, hidden);

DEFINE_bool(disable_index_backfill_for_non_txn_tables, true,
    "A kill switch to disable multi-stage backfill for user enforced YCQL indexes. "
    "Note that enabling this feature may cause the create index flow to be slow. "
    "This is needed to ensure the safety of the index backfill process. See also "
    "index_backfill_upperbound_for_user_enforced_txn_duration_ms.");
TAG_FLAG(disable_index_backfill_for_non_txn_tables, runtime);
TAG_FLAG(disable_index_backfill_for_non_txn_tables, hidden);

DEFINE_bool(enable_transactional_ddl_gc, true,
    "A kill switch for transactional DDL GC. Temporary safety measure.");
TAG_FLAG(enable_transactional_ddl_gc, runtime);
TAG_FLAG(enable_transactional_ddl_gc, hidden);

DEFINE_bool(
    hide_pg_catalog_table_creation_logs, false,
    "Whether to hide detailed log messages for PostgreSQL catalog table creation. "
    "This cuts down test logs significantly.");
TAG_FLAG(hide_pg_catalog_table_creation_logs, hidden);

DEFINE_test_flag(int32, simulate_slow_table_create_secs, 0,
    "Simulates a slow table creation by sleeping after the table has been added to memory.");

DEFINE_test_flag(int32, simulate_slow_system_tablet_bootstrap_secs, 0,
    "Simulates a slow tablet bootstrap by adding a sleep before system tablet init.");

DEFINE_test_flag(bool, return_error_if_namespace_not_found, false,
    "Return an error from ListTables if a namespace id is not found in the map.");

DEFINE_test_flag(bool, hang_on_namespace_transition, false,
    "Used in tests to simulate a lapse between issuing a namespace op and final processing.");

DEFINE_test_flag(bool, simulate_crash_after_table_marked_deleting, false,
    "Crash yb-master after table's state is set to DELETING. This skips tablets deletion.");

DEFINE_bool(master_drop_table_after_task_response, true,
            "Mark a table as DELETED as soon as we get all the responses from all the TS.");
TAG_FLAG(master_drop_table_after_task_response, advanced);
TAG_FLAG(master_drop_table_after_task_response, runtime);

DEFINE_test_flag(bool, tablegroup_master_only, false,
                 "This is only for MasterTest to be able to test tablegroups without the"
                 " transaction status table being created.");

DEFINE_bool(enable_register_ts_from_raft, true, "Whether to register a tserver from the consensus "
                                                "information of a reported tablet.");

DECLARE_int32(tserver_unresponsive_timeout_ms);

DEFINE_bool(use_create_table_leader_hint, true,
            "Whether the Master should hint which replica for each tablet should "
            "be leader initially on tablet creation.");
TAG_FLAG(use_create_table_leader_hint, runtime);

DEFINE_test_flag(bool, create_table_leader_hint_min_lexicographic, false,
                 "Whether the Master should hint the replica with the smallest lexicographic rank "
                 "for each tablet as leader initially on tablet creation.");

DEFINE_double(heartbeat_safe_deadline_ratio, .20,
              "When the heartbeat deadline has this percentage of time remaining, "
              "the master should halt tablet report processing so it can respond in time.");
DECLARE_int32(heartbeat_rpc_timeout_ms);
DECLARE_CAPABILITY(TabletReportLimit);

DEFINE_int32(partitions_vtable_cache_refresh_secs, 0,
             "Amount of time to wait before refreshing the system.partitions cached vtable. "
             "If generate_partitions_vtable_on_changes is set, then this background task will "
             "update the cache using the internal map, but won't do any generating of the vtable.");

DEFINE_int32(txn_table_wait_min_ts_count, 1,
             "Minimum number of TS to wait for before creating the transaction status table."
             " Default value is 1. We wait for at least --replication_factor if this value"
             " is smaller than that.");
TAG_FLAG(txn_table_wait_min_ts_count, advanced);

DEFINE_bool(enable_ysql_tablespaces_for_placement, true,
            "If set, tablespaces will be used for placement of YSQL tables.");
TAG_FLAG(enable_ysql_tablespaces_for_placement, runtime);

DEFINE_int32(ysql_tablespace_info_refresh_secs, 30,
             "Frequency at which the table-to-tablespace information will be updated in master "
             "from pg catalog tables. A value of -1 disables the refresh task.");
TAG_FLAG(ysql_tablespace_info_refresh_secs, runtime);

DEFINE_int64(tablet_split_size_threshold_bytes, 0,
             "DEPRECATED -- Threshold on tablet size after which tablet should be split. Automated "
             "splitting is disabled if this value is set to 0.");
TAG_FLAG(tablet_split_size_threshold_bytes, hidden);

DEFINE_int64(tablet_split_low_phase_shard_count_per_node, 8,
             "The per-node tablet count until which a table is split at the phase 1 threshold, "
             "as defined by tablet_split_low_phase_size_threshold_bytes.");
DEFINE_int64(tablet_split_high_phase_shard_count_per_node, 24,
             "The per-node tablet count until which a table is split at the phase 2 threshold, "
             "as defined by tablet_split_high_phase_size_threshold_bytes.");

DEFINE_int64(tablet_split_low_phase_size_threshold_bytes, 512_MB,
             "The tablet size threshold at which to split tablets in phase 1. "
             "See tablet_split_low_phase_shard_count_per_node.");
DEFINE_int64(tablet_split_high_phase_size_threshold_bytes, 10_GB,
             "The tablet size threshold at which to split tablets in phase 2. "
             "See tablet_split_high_phase_shard_count_per_node.");
DEFINE_int64(tablet_force_split_threshold_bytes, 100_GB,
             "The tablet size threshold at which to split tablets regardless of how many tablets "
             "exist in the table already. This should be configured to prevent runaway whale "
             "tablets from forming in your cluster even if both automatic splitting phases have "
             "been finished.");

DEFINE_test_flag(bool, crash_server_on_sys_catalog_leader_affinity_move, false,
                 "When set, crash the master process if it performs a sys catalog leader affinity "
                 "move.");
DEFINE_int32(blacklist_progress_initial_delay_secs, yb::master::kDelayAfterFailoverSecs,
             "When a master leader fails over, the time until which the progress of load movement "
             "off the blacklisted tservers is reported as 0. This initial delay "
             "gives sufficient time for heartbeats so that we don't report"
             " a premature incorrect completion.");
TAG_FLAG(blacklist_progress_initial_delay_secs, runtime);

DEFINE_test_flag(bool, validate_all_tablet_candidates, false,
                 "When set to true, consider any tablet a valid candidate for splitting. "
                 "Specifically this flag ensures that ValidateSplitCandidateTable and "
                 "ValidateSplitCandidateTablet always return OK and all tablets are considered "
                 "valid candidates for splitting.");

DEFINE_test_flag(bool, skip_placement_validation_createtable_api, false,
                 "When set, it skips checking that all the tablets of a table have enough tservers"
                 " conforming to the table placement policy during the CreateTable API call.");
TAG_FLAG(TEST_skip_placement_validation_createtable_api, runtime);

DEFINE_test_flag(int32, slowdown_alter_table_rpcs_ms, 0,
                 "Slows down the alter table rpc's send and response handler so that the TServer "
                 "has a heartbeat delay and triggers tablet leader change.");

DEFINE_test_flag(bool, reject_delete_not_serving_tablet_rpc, false,
                 "Whether to reject DeleteNotServingTablet RPC.");

DEFINE_test_flag(double, crash_after_creating_single_split_tablet, 0.0,
                 "Crash inside CatalogManager::RegisterNewTabletForSplit after calling Upsert.");

DEFINE_bool(enable_delete_truncate_xcluster_replicated_table, false,
            "When set, enables deleting/truncating tables currently in xCluster replication.");
TAG_FLAG(enable_delete_truncate_xcluster_replicated_table, runtime);

DEFINE_test_flag(bool, sequential_colocation_ids, false,
                 "When set, colocation IDs will be assigned sequentially (starting from 20001) "
                 "rather than at random. This is especially useful for making pg_regress "
                 "tests output consistent and predictable.");

namespace yb {
namespace master {

using std::atomic;
using std::shared_ptr;
using std::string;
using std::unique_ptr;
using std::vector;

using namespace std::placeholders;

using base::subtle::NoBarrier_Load;
using base::subtle::NoBarrier_CompareAndSwap;
using consensus::kMinimumTerm;
using consensus::CONSENSUS_CONFIG_COMMITTED;
using consensus::CONSENSUS_CONFIG_ACTIVE;
using consensus::COMMITTED_OPID;
using consensus::Consensus;
using consensus::ConsensusMetadata;
using consensus::ConsensusServiceProxy;
using consensus::ConsensusStatePB;
using consensus::GetConsensusRole;
using consensus::PeerMemberType;
using consensus::RaftPeerPB;
using consensus::StartRemoteBootstrapRequestPB;
using rpc::RpcContext;
using server::MonitoredTask;
using strings::Substitute;
using tablet::TABLET_DATA_COPYING;
using tablet::TABLET_DATA_DELETED;
using tablet::TABLET_DATA_READY;
using tablet::TABLET_DATA_TOMBSTONED;
using tablet::TabletDataState;
using tablet::RaftGroupMetadata;
using tablet::RaftGroupMetadataPtr;
using tablet::TabletPeer;
using tablet::RaftGroupStatePB;
using tablet::TabletStatusListener;
using tablet::TabletStatusPB;
using tserver::HandleReplacingStaleTablet;
using tserver::TabletServerErrorPB;
using yb::pgwrapper::PgWrapper;
using yb::server::MasterAddressesToString;

using yb::client::YBClient;
using yb::client::YBClientBuilder;
using yb::client::YBColumnSchema;
using yb::client::YBSchema;
using yb::client::YBSchemaBuilder;
using yb::client::YBTable;
using yb::client::YBTableName;

namespace {

// Macros to access index information in CATALOG.
//
// NOTES from file master.proto for SysTablesEntryPB.
// - For index table: [to be deprecated and replaced by "index_info"]
//     optional bytes indexed_table_id = 13; // Indexed table id of this index.
//     optional bool is_local_index = 14 [ default = false ];  // Whether this is a local index.
//     optional bool is_unique_index = 15 [ default = false ]; // Whether this is a unique index.
// - During the transition period, we have to consider both fields, and the following macros help
//   avoid duplicate protobuf version checks throughout our code.

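// Returns the id of the table that this index is built on, preferring the new index_info field
// and falling back to the deprecated top-level indexed_table_id field.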
const std::string& GetIndexedTableId(const SysTablesEntryPB& pb) {
  return pb.has_index_info() ? pb.index_info().indexed_table_id() : pb.indexed_table_id();
}

#define PROTO_GET_IS_LOCAL(tabpb) \
  (tabpb.has_index_info() ? tabpb.index_info().is_local() \
                          : tabpb.is_local_index())

#define PROTO_GET_IS_UNIQUE(tabpb) \
  (tabpb.has_index_info() ? tabpb.index_info().is_unique() \
                          : tabpb.is_unique_index())

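// A table is an index if it has index metadata in either the new or the deprecated field.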
template <class PB>
bool IsIndex(const PB& pb) {
  return pb.has_index_info() || !pb.indexed_table_id().empty();
}

bool IsTable(const SysTablesEntryPB& pb) {
  return !IsIndex(pb);
}

#define PROTO_PTR_IS_INDEX(tabpb) \
  (tabpb->has_index_info() || !tabpb->indexed_table_id().empty())

#define PROTO_PTR_IS_TABLE(tabpb) \
  (!tabpb->has_index_info() && tabpb->indexed_table_id().empty())

#if (0)
// Once the deprecated fields are obsolete, the above macros should be defined as the following.
#define GetIndexedTableId(tabpb) (tabpb.index_info().indexed_table_id())
#define PROTO_GET_IS_LOCAL(tabpb) (tabpb.index_info().is_local())
#define PROTO_GET_IS_UNIQUE(tabpb) (tabpb.index_info().is_unique())
#define PROTO_IS_INDEX(tabpb) (tabpb.has_index_info())
#define PROTO_IS_TABLE(tabpb) (!tabpb.has_index_info())
#define PROTO_PTR_IS_INDEX(tabpb) (tabpb->has_index_info())
#define PROTO_PTR_IS_TABLE(tabpb) (!tabpb->has_index_info())

#endif

class IndexInfoBuilder {
 public:
  explicit IndexInfoBuilder(IndexInfoPB* index_info) : index_info_(*index_info) {
    DVLOG(3) << " After " << __PRETTY_FUNCTION__ << " index_info_ is " << yb::ToString(index_info_);
  }

  void ApplyProperties(const TableId& indexed_table_id, bool is_local, bool is_unique) {
    index_info_.set_indexed_table_id(indexed_table_id);
    index_info_.set_version(0);
    index_info_.set_is_local(is_local);
    index_info_.set_is_unique(is_unique);
    DVLOG(3) << " After " << __PRETTY_FUNCTION__ << " index_info_ is " << yb::ToString(index_info_);
  }

  CHECKED_STATUS ApplyColumnMapping(const Schema& indexed_schema, const Schema& index_schema) {
    for (size_t i = 0; i < index_schema.num_columns(); i++) {
      const auto& col_name = index_schema.column(i).name();
      const auto indexed_col_idx = indexed_schema.find_column(col_name);
      if (PREDICT_FALSE(indexed_col_idx == Schema::kColumnNotFound)) {
        return STATUS(NotFound, "The indexed table column does not exist", col_name);
      }
      auto* col = index_info_.add_columns();
      col->set_column_id(index_schema.column_id(i));
      col->set_indexed_column_id(indexed_schema.column_id(indexed_col_idx));
    }
    index_info_.set_hash_column_count(narrow_cast<uint32_t>(index_schema.num_hash_key_columns()));
    index_info_.set_range_column_count(narrow_cast<uint32_t>(index_schema.num_range_key_columns()));

    for (size_t i = 0; i < indexed_schema.num_hash_key_columns(); i++) {
      index_info_.add_indexed_hash_column_ids(indexed_schema.column_id(i));
    }
    for (size_t i = indexed_schema.num_hash_key_columns(); i < indexed_schema.num_key_columns();
        i++) {
      index_info_.add_indexed_range_column_ids(indexed_schema.column_id(i));
    }
    DVLOG(3) << " After " << __PRETTY_FUNCTION__ << " index_info_ is " << yb::ToString(index_info_);
    return Status::OK();
  }

 private:
  IndexInfoPB& index_info_;
};

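// Returns NotFound if the table has started (or finished) deletion, and ServiceUnavailable if
// the table is not yet in a state visible to clients.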
template<class Lock>
Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock) {
  // This covers both in progress and fully deleted objects.
  if (lock->started_deleting()) {
    return STATUS_EC_FORMAT(
        NotFound, MasterError(MasterErrorPB::OBJECT_NOT_FOUND),
        "The object '$0.$1' does not exist", lock->namespace_id(), lock->name());
  }
  if (!lock->visible_to_client()) {
    return STATUS_EC_FORMAT(
        ServiceUnavailable, MasterError(MasterErrorPB::OBJECT_NOT_FOUND),
        "The object '$0.$1' is not running", lock->namespace_id(), lock->name());
  }
  return Status::OK();
}

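// As above, but also copies a non-OK status into the RPC response's error field.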
template<class Lock, class RespClass>
Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock, RespClass* resp) {
  auto status = CheckIfTableDeletedOrNotVisibleToClient(lock);
  if (!status.ok()) {
    return SetupError(resp->mutable_error(), status);
  }
  return Status::OK();
}

#define VERIFY_NAMESPACE_FOUND(expr, resp) RESULT_CHECKER_HELPER( \
    expr, \
    if (!__result.ok()) { \
        return SetupError((resp)->mutable_error(), __result.status()); \
    });

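// Maps a namespace state to the master error code reported to clients: transitional states are
// retryable, while terminal or unexpected states surface as internal errors.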
MasterErrorPB_Code NamespaceMasterError(SysNamespaceEntryPB_State state) {
  switch (state) {
    case SysNamespaceEntryPB::PREPARING: FALLTHROUGH_INTENDED;
    case SysNamespaceEntryPB::DELETING:
      return MasterErrorPB::IN_TRANSITION_CAN_RETRY;
    case SysNamespaceEntryPB::DELETED: FALLTHROUGH_INTENDED;
    case SysNamespaceEntryPB::FAILED: FALLTHROUGH_INTENDED;
    case SysNamespaceEntryPB::RUNNING:
      return MasterErrorPB::INTERNAL_ERROR;
    default:
      FATAL_INVALID_ENUM_VALUE(SysNamespaceEntryPB_State, state);
  }
}

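// Returns the index into NamespaceNameMapper::typed_maps_ for the given database type; each
// YQL database type keeps its own namespace-by-name map.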
size_t GetNameMapperIndex(YQLDatabase db_type) {
  switch (db_type) {
    case YQL_DATABASE_UNKNOWN: break;
    case YQL_DATABASE_CQL: return 1;
    case YQL_DATABASE_PGSQL: return 2;
    case YQL_DATABASE_REDIS: return 3;
  }
  CHECK(false) << "Unexpected db type " << db_type;
  return 0;
}

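// Returns true if multi-stage index backfill should be used for the given table type, honoring
// the kill-switch flags defined above.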
bool IsIndexBackfillEnabled(TableType table_type, bool is_transactional) {
  // Fetch the runtime flag to prevent any issues from the updates to flag while processing.
  const bool disabled =
      (table_type == PGSQL_TABLE_TYPE
          ? GetAtomicFlag(&FLAGS_ysql_disable_index_backfill)
          : GetAtomicFlag(&FLAGS_disable_index_backfill) ||
      (!is_transactional && GetAtomicFlag(&FLAGS_disable_index_backfill_for_non_txn_tables)));
  return !disabled;
}

constexpr auto kDefaultYQLPartitionsRefreshBgTaskSleep = 10s;

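// Collects the ids of all snapshot schedules that retain the given table into
// retained_by_snapshot_schedules.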
void FillRetainedBySnapshotSchedules(
      const SnapshotSchedulesToObjectIdsMap& schedules_to_tables_map,
      const TableId& table_id,
      RepeatedBytes* retained_by_snapshot_schedules) {
  for (const auto& entry : schedules_to_tables_map) {
    if (std::binary_search(entry.second.begin(), entry.second.end(), table_id)) {
      retained_by_snapshot_schedules->Add()->assign(
          entry.first.AsSlice().cdata(), entry.first.size());
    }
  }
}

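// Picks the default number of transaction table shards per tserver: fewer shards under TSAN or
// on hosts with very few CPU cores.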
int GetTransactionTableNumShardsPerTServer() {
  int value = 8;
  if (IsTsan()) {
    value = 2;
  } else if (base::NumCPUs() <= 2) {
    value = 4;
  }
  return value;
}

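// Resolves flags whose default value is "auto-detect" when the master starts up.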
void InitMasterFlags() {
  yb::InitCommonFlags();
  if (GetAtomicFlag(&FLAGS_transaction_table_num_tablets_per_tserver) ==
      kAutoDetectNumShardsPerTServer) {
    const auto value = GetTransactionTableNumShardsPerTServer();
    VLOG(1) << "Auto setting FLAGS_transaction_table_num_tablets_per_tserver to " << value;
    SetAtomicFlag(value, &FLAGS_transaction_table_num_tablets_per_tserver);
  }
}

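// Translates a table lookup result into a bool: OBJECT_NOT_FOUND means the table does not
// exist; any other error is propagated to the caller.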
Result<bool> DoesTableExist(const Result<TableInfoPtr>& result) {
  if (result.ok()) {
    return true;
  }
  if (result.status().IsNotFound()
      && MasterError(result.status()) == MasterErrorPB::OBJECT_NOT_FOUND) {
    return false;
  }
  return result.status();
}

}  // anonymous namespace

////////////////////////////////////////////////////////////
// CatalogManager
////////////////////////////////////////////////////////////

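// Per-database-type namespace-by-name lookup; see GetNameMapperIndex above.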
CatalogManager::NamespaceInfoMap& CatalogManager::NamespaceNameMapper::operator[](
    YQLDatabase db_type) {
  return typed_maps_[GetNameMapperIndex(db_type)];
}

const CatalogManager::NamespaceInfoMap& CatalogManager::NamespaceNameMapper::operator[](
    YQLDatabase db_type) const {
  return typed_maps_[GetNameMapperIndex(db_type)];
}

void CatalogManager::NamespaceNameMapper::clear() {
  for (auto& m : typed_maps_) {
    m.clear();
  }
}

CatalogManager::CatalogManager(Master* master)
    : master_(master),
      tablet_exists_(false),
      state_(kConstructed),
      leader_ready_term_(-1),
      leader_lock_(RWMutex::Priority::PREFER_WRITING),
      load_balance_policy_(std::make_unique<ClusterLoadBalancer>(this)),
      permissions_manager_(std::make_unique<PermissionsManager>(this)),
      tasks_tracker_(new TasksTracker(IsUserInitiated::kFalse)),
      jobs_tracker_(new TasksTracker(IsUserInitiated::kTrue)),
      encryption_manager_(new EncryptionManager()),
      tablespace_manager_(std::make_shared<YsqlTablespaceManager>(nullptr, nullptr)),
      tablespace_bg_task_running_(false),
      tablet_split_manager_(this, this, this) {
  InitMasterFlags();
  CHECK_OK(ThreadPoolBuilder("leader-initialization")
           .set_max_threads(1)
           .Build(&leader_initialization_pool_));
  CHECK_OK(ThreadPoolBuilder("CatalogManagerBGTasks").Build(&background_tasks_thread_pool_));
  CHECK_OK(ThreadPoolBuilder("async-tasks").Build(&async_task_pool_));

  if (master_) {
    sys_catalog_.reset(new SysCatalogTable(
        master_, master_->metric_registry(),
        Bind(&CatalogManager::ElectedAsLeaderCb, Unretained(this))));
  }
}

CatalogManager::~CatalogManager() {
  if (StartShutdown()) {
    CompleteShutdown();
  }
}

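// Moves the catalog manager from kConstructed to kRunning: initializes metrics, starts the sys
// catalog tablet, and (unless running in shell mode) waits for it to run and enables background
// tasks.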
Status CatalogManager::Init() {
  {
    std::lock_guard<simple_spinlock> l(state_lock_);
    CHECK_EQ(kConstructed, state_);
    state_ = kStarting;
  }

  if (master_) {
    ysql_transaction_ = std::make_unique<YsqlTransactionDdl>(
        sys_catalog_.get(), master_->async_client_initializer().get_client_future(),
        background_tasks_thread_pool_.get());
  }

  // Initialize the metrics emitted by the catalog manager.
  metric_num_tablet_servers_live_ =
    METRIC_num_tablet_servers_live.Instantiate(master_->metric_entity_cluster(), 0);

  metric_num_tablet_servers_dead_ =
    METRIC_num_tablet_servers_dead.Instantiate(master_->metric_entity_cluster(), 0);

  RETURN_NOT_OK_PREPEND(InitSysCatalogAsync(),
                        "Failed to initialize sys tables async");

  if (PREDICT_FALSE(FLAGS_TEST_simulate_slow_system_tablet_bootstrap_secs > 0)) {
    LOG_WITH_PREFIX(INFO) << "Simulating slow system tablet bootstrap";
    SleepFor(MonoDelta::FromSeconds(FLAGS_TEST_simulate_slow_system_tablet_bootstrap_secs));
  }

  // WaitUntilRunning() must run outside of the lock so as to prevent
  // deadlock. This is safe as WaitUntilRunning waits for another
  // thread to finish its work and doesn't itself depend on any state
  // within CatalogManager. Need not start sys catalog or background tasks
  // when we are started in shell mode.
  if (!master_->opts().IsShellMode()) {
    RETURN_NOT_OK_PREPEND(sys_catalog_->WaitUntilRunning(),
                          "Failed waiting for the catalog tablet to run");
    std::vector<consensus::RaftPeerPB> masters_raft;
    RETURN_NOT_OK(master_->ListRaftConfigMasters(&masters_raft));
    std::vector<HostPort> hps;
    for (const auto& peer : masters_raft) {
      if (NodeInstance().permanent_uuid() == peer.permanent_uuid()) {
        continue;
      }
      HostPort hp = HostPortFromPB(DesiredHostPort(peer, master_->MakeCloudInfoPB()));
      hps.push_back(hp);
    }
    universe_key_client_ = std::make_unique<client::UniverseKeyClient>(
        hps, &master_->proxy_cache(), [&] (const encryption::UniverseKeysPB& universe_keys) {
          encryption_manager_->PopulateUniverseKeys(universe_keys);
        });
    universe_key_client_->GetUniverseKeyRegistryAsync();
    RETURN_NOT_OK(EnableBgTasks());
  }

  // Cache the server registration even for shell mode masters. See
  // https://github.com/yugabyte/yugabyte-db/issues/8065.
  RETURN_NOT_OK(GetRegistration(&server_registration_));

  {
    std::lock_guard<simple_spinlock> l(state_lock_);
    CHECK_EQ(kStarting, state_);
    state_ = kRunning;
  }

  Started();

  return Status::OK();
}

Status CatalogManager::ChangeEncryptionInfo(const ChangeEncryptionInfoRequestPB* req,
                                            ChangeEncryptionInfoResponsePB* resp) {
  return STATUS(InvalidCommand, "Command only supported in enterprise build.");
}

Status CatalogManager::ElectedAsLeaderCb() {
  time_elected_leader_.store(MonoTime::Now());
  return leader_initialization_pool_->SubmitClosure(
      Bind(&CatalogManager::LoadSysCatalogDataTask, Unretained(this)));
}

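// Verifies that this node is still the consensus leader, waits for all pending operations to
// finish, and then waits for a valid leader lease, all bounded by the given timeout.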
Status CatalogManager::WaitUntilCaughtUpAsLeader(const MonoDelta& timeout) {
  string uuid = master_->fs_manager()->uuid();
  Consensus* consensus = tablet_peer()->consensus();
  ConsensusStatePB cstate = consensus->ConsensusState(CONSENSUS_CONFIG_ACTIVE);
  if (!cstate.has_leader_uuid() || cstate.leader_uuid() != uuid) {
    return STATUS_SUBSTITUTE(IllegalState,
        "Node $0 not leader. Consensus state: $1", uuid, cstate.ShortDebugString());
  }

  // Wait for all transactions to be committed.
  const CoarseTimePoint deadline = CoarseMonoClock::now() + timeout;
  {
    tablet::HistoryCutoffPropagationDisabler disabler(tablet_peer()->tablet()->RetentionPolicy());
    RETURN_NOT_OK(tablet_peer()->operation_tracker()->WaitForAllToFinish(timeout));
  }

  RETURN_NOT_OK(tablet_peer()->consensus()->WaitForLeaderLeaseImprecise(deadline));
  return Status::OK();
}

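// Background task scheduled on leader election: catches up as leader, then reloads all sys
// catalog metadata into memory for the new term.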
void CatalogManager::LoadSysCatalogDataTask() {
  auto consensus = tablet_peer()->shared_consensus();
  const int64_t term = consensus->ConsensusState(CONSENSUS_CONFIG_ACTIVE).current_term();
  Status s = WaitUntilCaughtUpAsLeader(
      MonoDelta::FromMilliseconds(FLAGS_master_failover_catchup_timeout_ms));

  int64_t term_after_wait = consensus->ConsensusState(CONSENSUS_CONFIG_ACTIVE).current_term();
  if (term_after_wait != term) {
    // If we got elected leader again while waiting to catch up then we will get another callback to
    // update state from sys_catalog, so bail now.
    //
    // If we failed when waiting, i.e. could not acquire a leader lease, this could be due to us
    // becoming a follower. If we're not partitioned away, we'll know about a new term soon.
    LOG_WITH_PREFIX(INFO)
        << "Term change from " << term << " to " << term_after_wait
        << " while waiting for master leader catchup. Not loading sys catalog metadata. "
        << "Status of waiting: " << s;
    return;
  }

  if (!s.ok()) {
    // This could happen e.g. if we are a partitioned-away leader that failed to acquire a leader
    // lease.
    //
    // TODO: handle this cleanly by transitioning to a follower without crashing.
    LOG_WITH_PREFIX(WARNING) << "Failed waiting for node to catch up after master election: " << s;

    if (s.IsTimedOut()) {
      LOG_WITH_PREFIX(FATAL) << "Shutting down due to unavailability of other masters after"
                             << " election. TODO: Abdicate instead.";
    }
    return;
  }

  LOG_WITH_PREFIX(INFO) << "Loading table and tablet metadata into memory for term " << term;
  LOG_SLOW_EXECUTION(WARNING, 1000, LogPrefix() + "Loading metadata into memory") {
    Status status = VisitSysCatalog(term);
    if (!status.ok()) {
      {
        std::lock_guard<simple_spinlock> l(state_lock_);
        if (state_ == kClosing) {
          LOG_WITH_PREFIX(INFO)
              << "Error loading sys catalog because shutdown is in progress. term " << term
              << " status : " << status;
          return;
        }
      }
      auto new_term = consensus->ConsensusState(CONSENSUS_CONFIG_ACTIVE).current_term();
      if (new_term != term) {
        LOG_WITH_PREFIX(INFO)
            << "Error loading sys catalog; but that's OK as term was changed from " << term
            << " to " << new_term << ": " << status;
        return;
      }
      LOG_WITH_PREFIX(FATAL) << "Failed to load sys catalog: " << status;
    }
  }

  {
    std::lock_guard<simple_spinlock> l(state_lock_);
    leader_ready_term_ = term;
    LOG_WITH_PREFIX(INFO) << "Completed load of sys catalog in term " << term;
  }
  SysCatalogLoaded(term);
  // Once we have loaded the SysCatalog, reset and regenerate the yql partitions table in order to
  // regenerate entries for previous tables.
  GetYqlPartitionsVtable().ResetAndRegenerateCache();
}

CHECKED_STATUS CatalogManager::WaitForWorkerPoolTests(const MonoDelta& timeout) const {
  if (!async_task_pool_->WaitFor(timeout)) {
    return STATUS(TimedOut, "Worker Pool hasn't finished processing tasks");
  }
  return Status::OK();
}

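// Reloads the sys catalog into in-memory state for the given term: blocks new catalog
// operations, aborts outstanding table tasks, reruns the loaders, and prepares the default
// namespaces, system tables, roles, and cluster config.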
961
3.01k
Status CatalogManager::VisitSysCatalog(int64_t term) {
962
  // Block new catalog operations, and wait for existing operations to finish.
963
3.01k
  LOG_WITH_PREFIX_AND_FUNC(INFO)
964
3.01k
      << "Wait on leader_lock_ for any existing operations to finish. Term: " << term;
965
3.01k
  auto start = std::chrono::steady_clock::now();
966
3.01k
  std::lock_guard<RWMutex> leader_lock_guard(leader_lock_);
967
3.01k
  auto finish = std::chrono::steady_clock::now();
968
969
3.01k
  static const auto kLongLockAcquisitionLimit = RegularBuildVsSanitizers(100ms, 750ms);
970
3.01k
  if (finish > start + kLongLockAcquisitionLimit) {
971
0
    LOG_WITH_PREFIX(WARNING) << "Long wait on leader_lock_: " << yb::ToString(finish - start);
972
0
  }
973
974
3.01k
  LOG_WITH_PREFIX(INFO)
975
3.01k
      << __func__ << ": Acquire catalog manager lock_ before loading sys catalog.";
976
3.01k
  LockGuard lock(mutex_);
977
3.01k
  
VLOG_WITH_FUNC0
(3) << "Acquired the catalog manager lock"0
;
978
979
  // Abort any outstanding tasks. All TableInfos are orphaned below, so
980
  // it's important to end their tasks now; otherwise Shutdown() will
981
  // destroy master state used by these tasks.
982
3.01k
  std::vector<scoped_refptr<TableInfo>> tables;
983
3.01k
  AppendValuesFromMap(*table_ids_map_, &tables);
984
3.01k
  AbortAndWaitForAllTasks(tables);
985
986
  // Clear internal maps and run data loaders.
987
3.01k
  RETURN_NOT_OK(RunLoaders(term));
988
989
  // Prepare various default system configurations.
990
3.01k
  RETURN_NOT_OK(PrepareDefaultSysConfig(term));
991
992
3.01k
  if ((FLAGS_use_initial_sys_catalog_snapshot || 
FLAGS_enable_ysql3.01k
) &&
993
3.01k
      
!FLAGS_initial_sys_catalog_snapshot_path.empty()981
&&
994
3.01k
      
!FLAGS_create_initial_sys_catalog_snapshot767
) {
995
765
    if (!namespace_ids_map_.empty() || 
!system_tablets_.empty()746
) {
996
19
      LOG_WITH_PREFIX(INFO)
997
19
          << "This is an existing cluster, not initializing from a sys catalog snapshot.";
998
746
    } else {
999
746
      Result<bool> dir_exists =
1000
746
          Env::Default()->DoesDirectoryExist(FLAGS_initial_sys_catalog_snapshot_path);
1001
746
      if (dir_exists.ok() && *dir_exists) {
1002
746
        bool initdb_was_already_done = false;
1003
746
        {
1004
746
          auto l = ysql_catalog_config_->LockForRead();
1005
746
          initdb_was_already_done = l->pb.ysql_catalog_config().initdb_done();
1006
746
        }
1007
746
        if (initdb_was_already_done) {
1008
0
          LOG_WITH_PREFIX(INFO)
1009
0
              << "initdb has been run before, no need to restore sys catalog from "
1010
0
              << "the initial snapshot";
1011
746
        } else {
1012
746
          LOG_WITH_PREFIX(INFO) << "Restoring snapshot in sys catalog";
1013
746
          Status restore_status = RestoreInitialSysCatalogSnapshot(
1014
746
              FLAGS_initial_sys_catalog_snapshot_path,
1015
746
              sys_catalog_->tablet_peer().get(),
1016
746
              term);
1017
746
          if (!restore_status.ok()) {
1018
0
            LOG_WITH_PREFIX(ERROR) << "Failed restoring snapshot in sys catalog";
1019
0
            return restore_status;
1020
0
          }
1021
1022
746
          LOG_WITH_PREFIX(INFO) << "Re-initializing cluster config";
1023
746
          {
1024
746
            std::lock_guard<decltype(config_mutex_)> lock(config_mutex_);
1025
746
            cluster_config_.reset();
1026
746
          }
1027
746
          RETURN_NOT_OK(PrepareDefaultClusterConfig(term));
1028
1029
746
          LOG_WITH_PREFIX(INFO) << "Restoring snapshot completed, considering initdb finished";
1030
746
          RETURN_NOT_OK(InitDbFinished(Status::OK(), term));
1031
746
          RETURN_NOT_OK(RunLoaders(term));
1032
746
        }
1033
746
      } else {
1034
0
        LOG_WITH_PREFIX(WARNING)
1035
0
            << "Initial sys catalog snapshot directory does not exist: "
1036
0
            << FLAGS_initial_sys_catalog_snapshot_path
1037
0
            << (dir_exists.ok() ? "" : ", status: " + dir_exists.status().ToString());
1038
0
      }
1039
746
    }
1040
765
  }
1041
1042
  // Create the system namespaces (created only if they don't already exist).
1043
3.01k
  RETURN_NOT_OK(PrepareDefaultNamespaces(term));
1044
1045
  // Create the system tables (created only if they don't already exist).
1046
3.01k
  RETURN_NOT_OK(PrepareSystemTables(term));
1047
1048
  // Create the default Cassandra roles (created only if they don't already exist).
1049
3.01k
  RETURN_NOT_OK(permissions_manager_->PrepareDefaultRoles(term));
1050
1051
  // If this is the first time we start up, we have no config information yet, so we write an
1052
  // empty version 0.
1053
3.00k
  RETURN_NOT_OK(PrepareDefaultClusterConfig(term));
1054
1055
3.00k
  permissions_manager_->BuildRecursiveRoles();
1056
1057
3.00k
  if (FLAGS_enable_ysql) {
1058
    // Number of TS to wait for before creating the txn table.
1059
977
    auto wait_ts_count = std::max(FLAGS_txn_table_wait_min_ts_count, FLAGS_replication_factor);
1060
1061
977
    LOG_WITH_PREFIX(INFO)
1062
977
        << "YSQL is enabled, will create the transaction status table when "
1063
977
        << wait_ts_count << " tablet servers are online";
1064
977
    master_->ts_manager()->SetTSCountCallback(wait_ts_count, [this, wait_ts_count] {
1065
901
      LOG_WITH_PREFIX(INFO)
1066
901
          << wait_ts_count
1067
901
          << " tablet servers registered, creating the transaction status table";
1068
      // Retry table creation until it succeeds. It might fail initially because placement UUID
1069
      // of live replicas is set through an RPC from YugaWare, and we won't be able to calculate
1070
      // the number of primary (non-read-replica) tablet servers until that happens.
1071
923
      while (true) {
1072
903
        const auto s = CreateGlobalTransactionStatusTableIfNeeded(/* rpc */ nullptr);
1073
903
        if (s.ok()) {
1074
881
          break;
1075
881
        }
1076
22
        LOG_WITH_PREFIX(WARNING) << "Failed creating transaction status table, waiting: " << s;
1077
22
        if (s.IsShutdownInProgress()) {
1078
0
          return;
1079
0
        }
1080
22
        auto role = Role();
1081
22
        if (role != PeerRole::LEADER) {
1082
0
          LOG_WITH_PREFIX(WARNING)
1083
0
              << "Cancel creating transaction because of role: " << PeerRole_Name(role);
1084
0
          return;
1085
0
        }
1086
22
        SleepFor(MonoDelta::FromSeconds(1));
1087
22
      }
1088
901
      LOG_WITH_PREFIX(INFO) << "Finished creating transaction status table asynchronously";
1089
901
    });
1090
977
  }
1091
1092
3.00k
  if (!StartRunningInitDbIfNeeded(term)) {
1093
    // If we are not running initdb, this is an existing cluster, and we need to check whether we
1094
    // need to do a one-time migration to make YSQL system catalog tables transactional.
1095
3.00k
    RETURN_NOT_OK(MakeYsqlSysCatalogTablesTransactional(
1096
3.00k
      table_ids_map_.CheckOut().get_ptr(), sys_catalog_.get(), ysql_catalog_config_.get(), term));
1097
3.00k
  }
1098
1099
3.00k
  return Status::OK();
1100
3.00k
}
1101
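
VisitSysCatalog above times how long it waits on leader_lock_ and warns when acquisition exceeds a sanitizer-adjusted budget. The following is a minimal standalone sketch of that measure-the-wait idiom; the names and the 100ms budget are illustrative, not YugabyteDB APIs.

#include <chrono>
#include <iostream>
#include <mutex>
#include <shared_mutex>

using namespace std::chrono_literals;

std::shared_mutex leader_lock;  // stand-in for the RWMutex leader_lock_

void VisitWithLockTiming() {
  auto start = std::chrono::steady_clock::now();
  std::lock_guard<std::shared_mutex> guard(leader_lock);  // exclusive acquisition
  auto finish = std::chrono::steady_clock::now();

  // Warn when the wait exceeded the budget; the real code widens it for sanitizer builds.
  constexpr auto kLongLockAcquisitionLimit = 100ms;
  if (finish - start > kLongLockAcquisitionLimit) {
    std::cerr << "Long wait on leader_lock: "
              << std::chrono::duration_cast<std::chrono::milliseconds>(finish - start).count()
              << " ms" << std::endl;
  }
}

int main() { VisitWithLockTiming(); }
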
1102
template <class Loader>
1103
30.0k
Status CatalogManager::Load(const std::string& title, const int64_t term) {
1104
30.0k
  LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory.";
1105
30.0k
  std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term);
1106
30.0k
  RETURN_NOT_OK_PREPEND(
1107
30.0k
      sys_catalog_->Visit(loader.get()),
1108
30.0k
      "Failed while visiting " + title + " in sys catalog");
1109
30.0k
  return Status::OK();
1110
30.0k
}
yb::Status yb::master::CatalogManager::Load<yb::master::RoleLoader>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1103
3.75k
Status CatalogManager::Load(const std::string& title, const int64_t term) {
1104
3.75k
  LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory.";
1105
3.75k
  std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term);
1106
3.75k
  RETURN_NOT_OK_PREPEND(
1107
3.75k
      sys_catalog_->Visit(loader.get()),
1108
3.75k
      "Failed while visiting " + title + " in sys catalog");
1109
3.75k
  return Status::OK();
1110
3.75k
}
yb::Status yb::master::CatalogManager::Load<yb::master::SysConfigLoader>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1103
3.75k
Status CatalogManager::Load(const std::string& title, const int64_t term) {
1104
3.75k
  LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory.";
1105
3.75k
  std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term);
1106
3.75k
  RETURN_NOT_OK_PREPEND(
1107
3.75k
      sys_catalog_->Visit(loader.get()),
1108
3.75k
      "Failed while visiting " + title + " in sys catalog");
1109
3.75k
  return Status::OK();
1110
3.75k
}
yb::Status yb::master::CatalogManager::Load<yb::master::TableLoader>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1103
3.75k
Status CatalogManager::Load(const std::string& title, const int64_t term) {
1104
3.75k
  LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory.";
1105
3.75k
  std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term);
1106
3.75k
  RETURN_NOT_OK_PREPEND(
1107
3.75k
      sys_catalog_->Visit(loader.get()),
1108
3.75k
      "Failed while visiting " + title + " in sys catalog");
1109
3.75k
  return Status::OK();
1110
3.75k
}
yb::Status yb::master::CatalogManager::Load<yb::master::TabletLoader>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1103
3.75k
Status CatalogManager::Load(const std::string& title, const int64_t term) {
1104
3.75k
  LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory.";
1105
3.75k
  std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term);
1106
3.75k
  RETURN_NOT_OK_PREPEND(
1107
3.75k
      sys_catalog_->Visit(loader.get()),
1108
3.75k
      "Failed while visiting " + title + " in sys catalog");
1109
3.75k
  return Status::OK();
1110
3.75k
}
yb::Status yb::master::CatalogManager::Load<yb::master::NamespaceLoader>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1103
3.75k
Status CatalogManager::Load(const std::string& title, const int64_t term) {
1104
3.75k
  LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory.";
1105
3.75k
  std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term);
1106
3.75k
  RETURN_NOT_OK_PREPEND(
1107
3.75k
      sys_catalog_->Visit(loader.get()),
1108
3.75k
      "Failed while visiting " + title + " in sys catalog");
1109
3.75k
  return Status::OK();
1110
3.75k
}
yb::Status yb::master::CatalogManager::Load<yb::master::UDTypeLoader>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1103
3.75k
Status CatalogManager::Load(const std::string& title, const int64_t term) {
1104
3.75k
  LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory.";
1105
3.75k
  std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term);
1106
3.75k
  RETURN_NOT_OK_PREPEND(
1107
3.75k
      sys_catalog_->Visit(loader.get()),
1108
3.75k
      "Failed while visiting " + title + " in sys catalog");
1109
3.75k
  return Status::OK();
1110
3.75k
}
yb::Status yb::master::CatalogManager::Load<yb::master::ClusterConfigLoader>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1103
3.75k
Status CatalogManager::Load(const std::string& title, const int64_t term) {
1104
3.75k
  LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory.";
1105
3.75k
  std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term);
1106
3.75k
  RETURN_NOT_OK_PREPEND(
1107
3.75k
      sys_catalog_->Visit(loader.get()),
1108
3.75k
      "Failed while visiting " + title + " in sys catalog");
1109
3.75k
  return Status::OK();
1110
3.75k
}
yb::Status yb::master::CatalogManager::Load<yb::master::RedisConfigLoader>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1103
3.75k
Status CatalogManager::Load(const std::string& title, const int64_t term) {
1104
3.75k
  LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory.";
1105
3.75k
  std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term);
1106
3.75k
  RETURN_NOT_OK_PREPEND(
1107
3.75k
      sys_catalog_->Visit(loader.get()),
1108
3.75k
      "Failed while visiting " + title + " in sys catalog");
1109
3.75k
  return Status::OK();
1110
3.75k
}
1111
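
Load<Loader> above is instantiated once per loader type, which is why the report repeats its body for RoleLoader, SysConfigLoader, TableLoader, and so on. Below is a simplified sketch of the visit-based loading pattern; Record, Visitor, SysCatalog, and TableLoader are hypothetical stand-ins for the real sys catalog types.

#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct Record { std::string id; };  // stand-in for a persisted sys catalog entry

struct Visitor {
  virtual ~Visitor() = default;
  virtual void Visit(const Record& r) = 0;
};

struct SysCatalog {
  std::vector<Record> records{{"table-1"}, {"namespace-1"}};
  void Visit(Visitor* v) {
    for (const auto& r : records) v->Visit(r);  // replay every persisted record
  }
};

struct TableLoader : Visitor {
  void Visit(const Record& r) override { std::cout << "loaded " << r.id << "\n"; }
};

// One template, many instantiations: each loader type gets its own copy of this
// body, which is why the coverage report lists it once per loader above.
template <class Loader>
void Load(SysCatalog* catalog, const std::string& title) {
  std::cout << "Loading " << title << " into memory.\n";
  auto loader = std::make_unique<Loader>();
  catalog->Visit(loader.get());
}

int main() {
  SysCatalog catalog;
  Load<TableLoader>(&catalog, "tables");
}
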
1112
3.75k
Status CatalogManager::RunLoaders(int64_t term) {
1113
  // Clear the table and tablet state.
1114
3.75k
  table_names_map_.clear();
1115
3.75k
  transaction_table_ids_set_.clear();
1116
3.75k
  auto table_ids_map_checkout = table_ids_map_.CheckOut();
1117
3.75k
  table_ids_map_checkout->clear();
1118
1119
3.75k
  auto tablet_map_checkout = tablet_map_.CheckOut();
1120
3.75k
  tablet_map_checkout->clear();
1121
1122
  // Clear the namespace mappings.
1123
3.75k
  namespace_ids_map_.clear();
1124
3.75k
  namespace_names_mapper_.clear();
1125
1126
  // Clear the type mappings.
1127
3.75k
  udtype_ids_map_.clear();
1128
3.75k
  udtype_names_map_.clear();
1129
1130
  // Clear the current cluster config.
1131
3.75k
  {
1132
3.75k
    std::lock_guard<decltype(config_mutex_)> lock(config_mutex_);
1133
3.75k
    cluster_config_.reset();
1134
3.75k
  }
1135
1136
  // Clear redis config mapping.
1137
3.75k
  redis_config_map_.clear();
1138
1139
  // Clear ysql catalog config.
1140
3.75k
  ysql_catalog_config_.reset();
1141
1142
  // Clear transaction tables config.
1143
3.75k
  transaction_tables_config_.reset();
1144
1145
  // Clear recent tasks.
1146
3.75k
  tasks_tracker_->Reset();
1147
1148
  // Clear recent jobs.
1149
3.75k
  jobs_tracker_->Reset();
1150
1151
3.75k
  std::vector<std::shared_ptr<TSDescriptor>> descs;
1152
3.75k
  master_->ts_manager()->GetAllDescriptors(&descs);
1153
3.75k
  for (const auto& ts_desc : descs) {
1154
52
    ts_desc->set_has_tablet_report(false);
1155
52
  }
1156
1157
3.75k
  {
1158
3.75k
    LockGuard lock(permissions_manager()->mutex());
1159
1160
    // Clear the roles mapping.
1161
3.75k
    permissions_manager()->ClearRolesUnlocked();
1162
3.75k
    RETURN_NOT_OK(Load<RoleLoader>("roles", term));
1163
3.75k
    RETURN_NOT_OK(Load<SysConfigLoader>("sys config", term));
1164
3.75k
  }
1165
  // Clear the hidden tablets vector.
1166
3.75k
  hidden_tablets_.clear();
1167
1168
3.75k
  RETURN_NOT_OK(Load<TableLoader>("tables", term));
1169
3.75k
  RETURN_NOT_OK(Load<TabletLoader>("tablets", term));
1170
3.75k
  RETURN_NOT_OK(Load<NamespaceLoader>("namespaces", term));
1171
3.75k
  RETURN_NOT_OK(Load<UDTypeLoader>("user-defined types", term));
1172
3.75k
  RETURN_NOT_OK(Load<ClusterConfigLoader>("cluster configuration", term));
1173
3.75k
  RETURN_NOT_OK(Load<RedisConfigLoader>("Redis config", term));
1174
1175
3.75k
  if (!transaction_tables_config_) {
1176
2.91k
    RETURN_NOT_OK(InitializeTransactionTablesConfig(term));
1177
2.91k
  }
1178
1179
3.75k
  return Status::OK();
1180
3.75k
}
1181
1182
Status CatalogManager::CheckResource(
1183
    const GrantRevokePermissionRequestPB* req,
1184
721
    GrantRevokePermissionResponsePB* resp) {
1185
721
  scoped_refptr<TableInfo> table;
1186
1187
  // Check whether the resources exist.
1188
721
  if (req->resource_type() == ResourceType::TABLE ||
1189
721
      req->resource_type() == ResourceType::KEYSPACE) {
1190
    // We can't match Apache Cassandra's error because when a namespace is not provided, the error
1191
    // is detected by the semantic analysis in PTQualifiedName::AnalyzeName.
1192
435
    DCHECK(req->has_namespace_());
1193
435
    const auto& namespace_info = req->namespace_();
1194
435
    auto ns = FindNamespace(namespace_info);
1195
1196
435
    if (req->resource_type() == ResourceType::KEYSPACE) {
1197
236
      if (!ns.ok()) {
1198
        // Matches Apache Cassandra's error.
1199
0
        Status s = STATUS_SUBSTITUTE(
1200
0
            NotFound, "Resource <keyspace $0> doesn't exist", namespace_info.name());
1201
0
        return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s);
1202
0
      }
1203
236
    } else {
1204
199
      if (ns.ok()) {
1205
199
        CatalogManager::SharedLock l(mutex_);
1206
199
        table = FindPtrOrNull(table_names_map_, {(**ns).id(), req->resource_name()});
1207
199
      }
1208
199
      if (table == nullptr) {
1209
        // Matches Apache Cassandra's error.
1210
0
        Status s = STATUS_SUBSTITUTE(
1211
0
            NotFound, "Resource <object '$0.$1'> doesn't exist",
1212
0
            namespace_info.name(), req->resource_name());
1213
0
        return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s);
1214
0
      }
1215
199
    }
1216
435
  }
1217
721
  return Status::OK();
1218
721
}
1219
1220
3.74k
Status CatalogManager::PrepareDefaultClusterConfig(int64_t term) {
1221
3.74k
  std::lock_guard<decltype(config_mutex_)> lock(config_mutex_);
1222
3.74k
  if (cluster_config_) {
1223
845
    LOG_WITH_PREFIX(INFO)
1224
845
        << "Cluster configuration has already been set up, skipping re-initialization.";
1225
845
    return Status::OK();
1226
845
  }
1227
1228
  // Create default.
1229
2.90k
  SysClusterConfigEntryPB config;
1230
2.90k
  config.set_version(0);
1231
1232
2.90k
  std::string cluster_uuid_source;
1233
2.90k
  if (!FLAGS_cluster_uuid.empty()) {
1234
1
    RETURN_NOT_OK(Uuid::FromString(FLAGS_cluster_uuid));
1235
0
    config.set_cluster_uuid(FLAGS_cluster_uuid);
1236
0
    cluster_uuid_source = "from the --cluster_uuid flag";
1237
2.90k
  } else {
1238
2.90k
    auto uuid = Uuid::Generate();
1239
2.90k
    config.set_cluster_uuid(uuid.ToString());
1240
2.90k
    cluster_uuid_source = "(randomly generated)";
1241
2.90k
  }
1242
2.90k
  LOG_WITH_PREFIX(INFO)
1243
2.90k
      << "Setting cluster UUID to " << config.cluster_uuid() << " " << cluster_uuid_source;
1244
1245
  // Create the in-memory object.
1246
2.90k
  cluster_config_ = std::make_shared<ClusterConfigInfo>();
1247
1248
  // Prepare write.
1249
2.90k
  auto l = cluster_config_->LockForWrite();
1250
2.90k
  l.mutable_data()->pb = std::move(config);
1251
1252
  // Write to sys_catalog and in memory.
1253
2.90k
  RETURN_NOT_OK(sys_catalog_->Upsert(term, cluster_config_.get()));
1254
2.90k
  l.Commit();
1255
1256
2.90k
  return Status::OK();
1257
2.90k
}
1258
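
PrepareDefaultClusterConfig follows the copy-on-write discipline described in the header comment of this file: LockForWrite stages a mutable copy, the change is persisted via sys_catalog_->Upsert, and only then does Commit publish it to readers. Below is a toy version of that lock-mutate-persist-commit flow; every name in it is hypothetical.

#include <iostream>
#include <mutex>
#include <string>

// Toy copy-on-write info object: readers see `committed`, writers stage `dirty`.
struct CowInfo {
  std::string committed;
  std::string dirty;
  std::mutex write_mutex;

  struct WriteLock {
    CowInfo* info;
    explicit WriteLock(CowInfo* i) : info(i) {
      info->write_mutex.lock();
      info->dirty = info->committed;  // start from the committed state
    }
    std::string& mutable_data() { return info->dirty; }
    void Commit() { info->committed = info->dirty; }  // publish to readers
    ~WriteLock() { info->write_mutex.unlock(); }
  };
  WriteLock LockForWrite() { return WriteLock(this); }
};

bool PersistToSysCatalog(const std::string& data) {
  // Stand-in for sys_catalog_->Upsert(term, ...): pretend the disk write succeeded.
  std::cout << "persisted: " << data << "\n";
  return true;
}

int main() {
  CowInfo cluster_config;
  auto l = cluster_config.LockForWrite();
  l.mutable_data() = "cluster_uuid=1234";   // mutate the staged copy only
  if (PersistToSysCatalog(l.mutable_data())) {
    l.Commit();                             // readers now observe the new value
  }
  std::cout << "committed: " << cluster_config.committed << "\n";
}
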
1259
29.4k
std::vector<std::string> CatalogManager::GetMasterAddresses() {
1260
29.4k
  std::vector<std::string> result;
1261
29.4k
  consensus::ConsensusStatePB state;
1262
29.4k
  auto status = GetCurrentConfig(&state);
1263
29.4k
  if (!status.ok()) {
1264
17.6k
    LOG(WARNING) << "Failed to get current config: " << status;
1265
17.6k
    return result;
1266
17.6k
  }
1267
32.5k
  for (const auto& peer : state.config().peers()) {
1268
32.5k
    std::vector<std::string> peer_addresses;
1269
65.0k
    for (const auto& list : {peer.last_known_private_addr(), peer.last_known_broadcast_addr()}) {
1270
65.0k
      for (const auto& entry : list) {
1271
32.8k
        peer_addresses.push_back(HostPort::FromPB(entry).ToString());
1272
32.8k
      }
1273
65.0k
    }
1274
32.5k
    if (!peer_addresses.empty()) {
1275
32.5k
      result.push_back(JoinStrings(peer_addresses, ","));
1276
32.5k
    }
1277
32.5k
  }
1278
11.7k
  return result;
1279
29.4k
}
1280
1281
3.01k
Status CatalogManager::PrepareDefaultSysConfig(int64_t term) {
1282
3.01k
  {
1283
3.01k
    LockGuard lock(permissions_manager()->mutex());
1284
3.01k
    RETURN_NOT_OK(permissions_manager()->PrepareDefaultSecurityConfigUnlocked(term));
1285
3.01k
  }
1286
1287
3.01k
  if (!ysql_catalog_config_) {
1288
2.90k
    SysYSQLCatalogConfigEntryPB ysql_catalog_config;
1289
2.90k
    ysql_catalog_config.set_version(0);
1290
1291
    // Create the in-memory objects.
1292
2.90k
    ysql_catalog_config_ = new SysConfigInfo(kYsqlCatalogConfigType);
1293
1294
    // Prepare write.
1295
2.90k
    auto l = ysql_catalog_config_->LockForWrite();
1296
2.90k
    *l.mutable_data()->pb.mutable_ysql_catalog_config() = std::move(ysql_catalog_config);
1297
1298
    // Write to sys_catalog and in memory.
1299
2.90k
    RETURN_NOT_OK(sys_catalog_->Upsert(term, ysql_catalog_config_));
1300
2.90k
    l.Commit();
1301
2.90k
  }
1302
1303
3.01k
  if (!transaction_tables_config_) {
1304
0
    RETURN_NOT_OK(InitializeTransactionTablesConfig(term));
1305
0
  }
1306
1307
3.01k
  return Status::OK();
1308
3.01k
}
1309
1310
3.00k
bool CatalogManager::StartRunningInitDbIfNeeded(int64_t term) {
1311
3.00k
  if (!ShouldAutoRunInitDb(ysql_catalog_config_.get(), pg_proc_exists_)) {
1312
3.00k
    return false;
1313
3.00k
  }
1314
1315
2
  string master_addresses_str = MasterAddressesToString(
1316
2
      *master_->opts().GetMasterAddresses());
1317
1318
2
  initdb_future_ = std::async(std::launch::async, [this, master_addresses_str, term] {
1319
2
    if (FLAGS_create_initial_sys_catalog_snapshot) {
1320
2
      initial_snapshot_writer_.emplace();
1321
2
    }
1322
1323
2
    Status status = PgWrapper::InitDbForYSQL(
1324
2
        master_addresses_str, "/tmp", master_->GetSharedMemoryFd());
1325
1326
2
    if (FLAGS_create_initial_sys_catalog_snapshot && status.ok()) {
1327
2
      Status write_snapshot_status = initial_snapshot_writer_->WriteSnapshot(
1328
2
          sys_catalog_->tablet_peer()->tablet(),
1329
2
          FLAGS_initial_sys_catalog_snapshot_path);
1330
2
      if (!write_snapshot_status.ok()) {
1331
0
        status = write_snapshot_status;
1332
0
      }
1333
2
    }
1334
2
    Status finish_status = InitDbFinished(status, term);
1335
2
    if (!finish_status.ok()) {
1336
0
      if (status.ok()) {
1337
0
        status = finish_status;
1338
0
      }
1339
0
      LOG_WITH_PREFIX(WARNING)
1340
0
          << "Failed to set initdb as finished in sys catalog: " << finish_status;
1341
0
    }
1342
2
    return status;
1343
2
  });
1344
2
  return true;
1345
3.00k
}
1346
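
StartRunningInitDbIfNeeded launches initdb on a background thread via std::async and stashes the future in initdb_future_, which CompleteShutdown later polls and waits on. A compact sketch of that store-the-future pattern, with simplified stand-in names:

#include <chrono>
#include <future>
#include <iostream>
#include <optional>
#include <thread>

using namespace std::chrono_literals;

std::optional<std::future<bool>> initdb_future;  // stored so shutdown can wait on it

// Kick off a long-running one-time job (initdb in the real code) without
// blocking the caller; the future is kept so we can join it later.
bool StartRunningInitDbIfNeeded(bool needed) {
  if (!needed) return false;
  initdb_future = std::async(std::launch::async, [] {
    std::this_thread::sleep_for(50ms);  // pretend to do expensive work
    return true;
  });
  return true;
}

void CompleteShutdown() {
  // Mirror of the shutdown path: if the job is still running, wait for it.
  if (initdb_future && initdb_future->wait_for(0s) != std::future_status::ready) {
    std::cout << "initdb still running, waiting...\n";
  }
  if (initdb_future) {
    std::cout << "initdb result: " << initdb_future->get() << "\n";
  }
}

int main() {
  StartRunningInitDbIfNeeded(/* needed = */ true);
  CompleteShutdown();
}
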
1347
3.01k
Status CatalogManager::PrepareDefaultNamespaces(int64_t term) {
1348
3.01k
  RETURN_NOT_OK(PrepareNamespace(
1349
3.01k
      YQL_DATABASE_CQL, kSystemNamespaceName, kSystemNamespaceId, term));
1350
3.01k
  RETURN_NOT_OK(PrepareNamespace(
1351
3.01k
      YQL_DATABASE_CQL, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term));
1352
3.01k
  RETURN_NOT_OK(PrepareNamespace(
1353
3.01k
      YQL_DATABASE_CQL, kSystemAuthNamespaceName, kSystemAuthNamespaceId, term));
1354
3.01k
  return Status::OK();
1355
3.01k
}
1356
1357
3.00k
Status CatalogManager::PrepareSystemTables(int64_t term) {
1358
  // Prepare sys catalog table.
1359
3.00k
  RETURN_NOT_OK(PrepareSysCatalogTable(term));
1360
1361
  // Create the required system tables here.
1362
3.00k
  RETURN_NOT_OK((PrepareSystemTableTemplate<PeersVTable>(
1363
3.00k
      kSystemPeersTableName, kSystemNamespaceName, kSystemNamespaceId, term)));
1364
3.00k
  RETURN_NOT_OK((PrepareSystemTableTemplate<LocalVTable>(
1365
3.00k
      kSystemLocalTableName, kSystemNamespaceName, kSystemNamespaceId, term)));
1366
3.00k
  RETURN_NOT_OK((PrepareSystemTableTemplate<YQLKeyspacesVTable>(
1367
3.00k
      kSystemSchemaKeyspacesTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId,
1368
3.00k
      term)));
1369
3.00k
  RETURN_NOT_OK((PrepareSystemTableTemplate<YQLTablesVTable>(
1370
3.00k
      kSystemSchemaTablesTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term)));
1371
3.00k
  RETURN_NOT_OK((PrepareSystemTableTemplate<YQLColumnsVTable>(
1372
3.00k
      kSystemSchemaColumnsTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term)));
1373
3.00k
  RETURN_NOT_OK((PrepareSystemTableTemplate<YQLSizeEstimatesVTable>(
1374
3.00k
      kSystemSizeEstimatesTableName, kSystemNamespaceName, kSystemNamespaceId, term)));
1375
1376
  // Empty tables.
1377
3.00k
  RETURN_NOT_OK((PrepareSystemTableTemplate<YQLAggregatesVTable>(
1378
3.00k
      kSystemSchemaAggregatesTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId,
1379
3.00k
      term)));
1380
3.00k
  RETURN_NOT_OK((PrepareSystemTableTemplate<YQLFunctionsVTable>(
1381
3.00k
      kSystemSchemaFunctionsTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId,
1382
3.00k
      term)));
1383
3.00k
  RETURN_NOT_OK((PrepareSystemTableTemplate<YQLIndexesVTable>(
1384
3.00k
      kSystemSchemaIndexesTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term)));
1385
3.00k
  RETURN_NOT_OK((PrepareSystemTableTemplate<YQLTriggersVTable>(
1386
3.00k
      kSystemSchemaTriggersTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term)));
1387
3.00k
  RETURN_NOT_OK((PrepareSystemTableTemplate<YQLViewsVTable>(
1388
3.00k
      kSystemSchemaViewsTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term)));
1389
3.00k
  RETURN_NOT_OK((PrepareSystemTableTemplate<QLTypesVTable>(
1390
3.00k
      kSystemSchemaTypesTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term)));
1391
3.00k
  RETURN_NOT_OK((PrepareSystemTableTemplate<YQLPartitionsVTable>(
1392
3.00k
      kSystemPartitionsTableName, kSystemNamespaceName, kSystemNamespaceId, term)));
1393
1394
  // System auth tables.
1395
3.00k
  RETURN_NOT_OK((PrepareSystemTableTemplate<YQLAuthRolesVTable>(
1396
3.00k
      kSystemAuthRolesTableName, kSystemAuthNamespaceName, kSystemAuthNamespaceId, term)));
1397
3.00k
  RETURN_NOT_OK((PrepareSystemTableTemplate<YQLAuthRolePermissionsVTable>(
1398
3.00k
      kSystemAuthRolePermissionsTableName, kSystemAuthNamespaceName, kSystemAuthNamespaceId,
1399
3.00k
      term)));
1400
3.00k
  RETURN_NOT_OK((PrepareSystemTableTemplate<YQLAuthResourceRolePermissionsIndexVTable>(
1401
3.00k
      kSystemAuthResourceRolePermissionsIndexTableName, kSystemAuthNamespaceName,
1402
3.00k
      kSystemAuthNamespaceId, term)));
1403
1404
  // Ensure kNumSystemTables is in-sync with the system tables created.
1405
3.00k
  LOG_IF(DFATAL, system_tablets_.size() != kNumSystemTables)
1406
1
      << "kNumSystemTables is " << kNumSystemTables << " but " << system_tablets_.size()
1407
1
      << " tables were created";
1408
1409
  // Cache the system.partitions tablet so we can access it in RebuildYQLSystemPartitions.
1410
3.00k
  RETURN_NOT_OK(GetYQLPartitionsVTable(&system_partitions_tablet_));
1411
1412
3.00k
  return Status::OK();
1413
3.00k
}
1414
1415
3.00k
Status CatalogManager::PrepareSysCatalogTable(int64_t term) {
1416
  // Prepare sys catalog table info.
1417
3.00k
  auto sys_catalog_table_iter = table_ids_map_->find(kSysCatalogTableId);
1418
3.00k
  if (sys_catalog_table_iter == table_ids_map_->end()) {
1419
2.16k
    scoped_refptr<TableInfo> table = NewTableInfo(kSysCatalogTableId);
1420
2.16k
    table->mutable_metadata()->StartMutation();
1421
2.16k
    SysTablesEntryPB& metadata = table->mutable_metadata()->mutable_dirty()->pb;
1422
2.16k
    metadata.set_state(SysTablesEntryPB::RUNNING);
1423
2.16k
    metadata.set_namespace_id(kSystemSchemaNamespaceId);
1424
2.16k
    metadata.set_name(kSysCatalogTableName);
1425
2.16k
    metadata.set_table_type(TableType::YQL_TABLE_TYPE);
1426
2.16k
    SchemaToPB(*sys_catalog_->schema_, metadata.mutable_schema());
1427
2.16k
    metadata.set_version(0);
1428
1429
2.16k
    auto table_ids_map_checkout = table_ids_map_.CheckOut();
1430
2.16k
    sys_catalog_table_iter = table_ids_map_checkout->emplace(table->id(), table).first;
1431
2.16k
    table_names_map_[{kSystemSchemaNamespaceId, kSysCatalogTableName}] = table;
1432
2.16k
    table->set_is_system();
1433
1434
2.16k
    RETURN_NOT_OK(sys_catalog_->Upsert(term, table));
1435
2.16k
    table->mutable_metadata()->CommitMutation();
1436
2.16k
  }
1437
1438
  // Prepare sys catalog tablet info.
1439
3.00k
  if (tablet_map_->count(kSysCatalogTabletId) == 0) {
1440
2.16k
    scoped_refptr<TableInfo> table = sys_catalog_table_iter->second;
1441
2.16k
    scoped_refptr<TabletInfo> tablet(new TabletInfo(table, kSysCatalogTabletId));
1442
2.16k
    tablet->mutable_metadata()->StartMutation();
1443
2.16k
    SysTabletsEntryPB& metadata = tablet->mutable_metadata()->mutable_dirty()->pb;
1444
2.16k
    metadata.set_state(SysTabletsEntryPB::RUNNING);
1445
1446
2.16k
    auto l = table->LockForRead();
1447
2.16k
    PartitionSchema partition_schema;
1448
2.16k
    RETURN_NOT_OK(PartitionSchema::FromPB(l->pb.partition_schema(),
1449
2.16k
                                          *sys_catalog_->schema_,
1450
2.16k
                                          &partition_schema));
1451
2.16k
    vector<Partition> partitions;
1452
2.16k
    RETURN_NOT_OK(partition_schema.CreatePartitions(1, &partitions));
1453
2.16k
    partitions[0].ToPB(metadata.mutable_partition());
1454
2.16k
    metadata.set_table_id(table->id());
1455
2.16k
    metadata.add_table_ids(table->id());
1456
1457
2.16k
    table->set_is_system();
1458
2.16k
    table->AddTablet(tablet.get());
1459
1460
2.16k
    auto tablet_map_checkout = tablet_map_.CheckOut();
1461
2.16k
    (*tablet_map_checkout)[tablet->tablet_id()] = tablet;
1462
1463
2.16k
    RETURN_NOT_OK(sys_catalog_->Upsert(term, tablet));
1464
2.16k
    tablet->mutable_metadata()->CommitMutation();
1465
2.16k
  }
1466
1467
3.00k
  system_tablets_[kSysCatalogTabletId] = sys_catalog_->tablet_peer_->shared_tablet();
1468
1469
3.00k
  return Status::OK();
1470
3.00k
}
1471
1472
template <class T>
1473
Status CatalogManager::PrepareSystemTableTemplate(const TableName& table_name,
1474
                                                  const NamespaceName& namespace_name,
1475
                                                  const NamespaceId& namespace_id,
1476
48.1k
                                                  int64_t term) {
1477
48.1k
  YQLVirtualTable* vtable = new T(table_name, namespace_name, master_);
1478
48.1k
  return PrepareSystemTable(
1479
48.1k
      table_name, namespace_name, namespace_id, vtable->schema(), term, vtable);
1480
48.1k
}
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::PeersVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1476
3.00k
                                                  int64_t term) {
1477
3.00k
  YQLVirtualTable* vtable = new T(table_name, namespace_name, master_);
1478
3.00k
  return PrepareSystemTable(
1479
3.00k
      table_name, namespace_name, namespace_id, vtable->schema(), term, vtable);
1480
3.00k
}
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::LocalVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1476
3.00k
                                                  int64_t term) {
1477
3.00k
  YQLVirtualTable* vtable = new T(table_name, namespace_name, master_);
1478
3.00k
  return PrepareSystemTable(
1479
3.00k
      table_name, namespace_name, namespace_id, vtable->schema(), term, vtable);
1480
3.00k
}
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLKeyspacesVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1476
3.00k
                                                  int64_t term) {
1477
3.00k
  YQLVirtualTable* vtable = new T(table_name, namespace_name, master_);
1478
3.00k
  return PrepareSystemTable(
1479
3.00k
      table_name, namespace_name, namespace_id, vtable->schema(), term, vtable);
1480
3.00k
}
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLTablesVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1476
3.00k
                                                  int64_t term) {
1477
3.00k
  YQLVirtualTable* vtable = new T(table_name, namespace_name, master_);
1478
3.00k
  return PrepareSystemTable(
1479
3.00k
      table_name, namespace_name, namespace_id, vtable->schema(), term, vtable);
1480
3.00k
}
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLColumnsVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1476
3.00k
                                                  int64_t term) {
1477
3.00k
  YQLVirtualTable* vtable = new T(table_name, namespace_name, master_);
1478
3.00k
  return PrepareSystemTable(
1479
3.00k
      table_name, namespace_name, namespace_id, vtable->schema(), term, vtable);
1480
3.00k
}
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLSizeEstimatesVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1476
3.00k
                                                  int64_t term) {
1477
3.00k
  YQLVirtualTable* vtable = new T(table_name, namespace_name, master_);
1478
3.00k
  return PrepareSystemTable(
1479
3.00k
      table_name, namespace_name, namespace_id, vtable->schema(), term, vtable);
1480
3.00k
}
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLAggregatesVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1476
3.00k
                                                  int64_t term) {
1477
3.00k
  YQLVirtualTable* vtable = new T(table_name, namespace_name, master_);
1478
3.00k
  return PrepareSystemTable(
1479
3.00k
      table_name, namespace_name, namespace_id, vtable->schema(), term, vtable);
1480
3.00k
}
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLFunctionsVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1476
3.00k
                                                  int64_t term) {
1477
3.00k
  YQLVirtualTable* vtable = new T(table_name, namespace_name, master_);
1478
3.00k
  return PrepareSystemTable(
1479
3.00k
      table_name, namespace_name, namespace_id, vtable->schema(), term, vtable);
1480
3.00k
}
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLIndexesVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1476
3.00k
                                                  int64_t term) {
1477
3.00k
  YQLVirtualTable* vtable = new T(table_name, namespace_name, master_);
1478
3.00k
  return PrepareSystemTable(
1479
3.00k
      table_name, namespace_name, namespace_id, vtable->schema(), term, vtable);
1480
3.00k
}
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLTriggersVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1476
3.00k
                                                  int64_t term) {
1477
3.00k
  YQLVirtualTable* vtable = new T(table_name, namespace_name, master_);
1478
3.00k
  return PrepareSystemTable(
1479
3.00k
      table_name, namespace_name, namespace_id, vtable->schema(), term, vtable);
1480
3.00k
}
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLViewsVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1476
3.00k
                                                  int64_t term) {
1477
3.00k
  YQLVirtualTable* vtable = new T(table_name, namespace_name, master_);
1478
3.00k
  return PrepareSystemTable(
1479
3.00k
      table_name, namespace_name, namespace_id, vtable->schema(), term, vtable);
1480
3.00k
}
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::QLTypesVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1476
3.00k
                                                  int64_t term) {
1477
3.00k
  YQLVirtualTable* vtable = new T(table_name, namespace_name, master_);
1478
3.00k
  return PrepareSystemTable(
1479
3.00k
      table_name, namespace_name, namespace_id, vtable->schema(), term, vtable);
1480
3.00k
}
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLPartitionsVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1476
3.00k
                                                  int64_t term) {
1477
3.00k
  YQLVirtualTable* vtable = new T(table_name, namespace_name, master_);
1478
3.00k
  return PrepareSystemTable(
1479
3.00k
      table_name, namespace_name, namespace_id, vtable->schema(), term, vtable);
1480
3.00k
}
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLAuthRolesVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1476
3.00k
                                                  int64_t term) {
1477
3.00k
  YQLVirtualTable* vtable = new T(table_name, namespace_name, master_);
1478
3.00k
  return PrepareSystemTable(
1479
3.00k
      table_name, namespace_name, namespace_id, vtable->schema(), term, vtable);
1480
3.00k
}
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLAuthRolePermissionsVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1476
3.00k
                                                  int64_t term) {
1477
3.00k
  YQLVirtualTable* vtable = new T(table_name, namespace_name, master_);
1478
3.00k
  return PrepareSystemTable(
1479
3.00k
      table_name, namespace_name, namespace_id, vtable->schema(), term, vtable);
1480
3.00k
}
yb::Status yb::master::CatalogManager::PrepareSystemTableTemplate<yb::master::YQLAuthResourceRolePermissionsIndexVTable>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, long long)
Line
Count
Source
1476
3.00k
                                                  int64_t term) {
1477
3.00k
  YQLVirtualTable* vtable = new T(table_name, namespace_name, master_);
1478
3.00k
  return PrepareSystemTable(
1479
3.00k
      table_name, namespace_name, namespace_id, vtable->schema(), term, vtable);
1480
3.00k
}
1481
1482
Status CatalogManager::PrepareSystemTable(const TableName& table_name,
1483
                                          const NamespaceName& namespace_name,
1484
                                          const NamespaceId& namespace_id,
1485
                                          const Schema& schema,
1486
                                          int64_t term,
1487
48.1k
                                          YQLVirtualTable* vtable) {
1488
48.1k
  std::unique_ptr<YQLVirtualTable> yql_storage(vtable);
1489
1490
48.1k
  scoped_refptr<TableInfo> table = FindPtrOrNull(table_names_map_,
1491
48.1k
                                                 std::make_pair(namespace_id, table_name));
1492
48.1k
  bool create_table = true;
1493
48.1k
  if (table != nullptr) {
1494
13.5k
    LOG_WITH_PREFIX(INFO) << "Table " << namespace_name << "." << table_name << " already created";
1495
1496
    // Mark the table as a system table.
1497
13.5k
    table->set_is_system();
1498
1499
13.5k
    Schema persisted_schema;
1500
13.5k
    RETURN_NOT_OK(table->GetSchema(&persisted_schema));
1501
13.5k
    if (!persisted_schema.Equals(schema)) {
1502
6
      LOG_WITH_PREFIX(INFO)
1503
6
          << "Updating schema of " << namespace_name << "." << table_name << " ...";
1504
6
      auto l = table->LockForWrite();
1505
6
      SchemaToPB(schema, l.mutable_data()->pb.mutable_schema());
1506
6
      l.mutable_data()->pb.set_version(l->pb.version() + 1);
1507
6
      l.mutable_data()->pb.set_updates_only_index_permissions(false);
1508
1509
      // Update sys-catalog with the new table schema.
1510
6
      RETURN_NOT_OK(sys_catalog_->Upsert(term, table));
1511
6
      l.Commit();
1512
6
    }
1513
1514
    // There might have been a failure after writing the table but before writing the tablets. As
1515
    // a result, if we don't find any tablets, we try to create only the tablets again.
1516
13.5k
    auto tablets = table->GetTablets();
1517
13.5k
    if (!tablets.empty()) {
1518
      // Initialize the appropriate system tablet.
1519
13.5k
      DCHECK_EQ(1, tablets.size());
1520
13.5k
      auto tablet = tablets[0];
1521
13.5k
      system_tablets_[tablet->tablet_id()] =
1522
13.5k
          std::make_shared<SystemTablet>(schema, std::move(yql_storage), tablet->tablet_id());
1523
13.5k
      return Status::OK();
1524
13.5k
    } else {
1525
      // Table is already created, only need to create tablets now.
1526
1
      LOG_WITH_PREFIX(INFO)
1527
1
          << "Creating tablets for " << namespace_name << "." << table_name << " ...";
1528
1
      create_table = false;
1529
1
    }
1530
13.5k
  }
1531
1532
  // Create partitions.
1533
34.5k
  vector<Partition> partitions;
1534
34.5k
  PartitionSchemaPB partition_schema_pb;
1535
34.5k
  partition_schema_pb.set_hash_schema(PartitionSchemaPB::MULTI_COLUMN_HASH_SCHEMA);
1536
34.5k
  PartitionSchema partition_schema;
1537
34.5k
  RETURN_NOT_OK(PartitionSchema::FromPB(partition_schema_pb, schema, &partition_schema));
1538
34.5k
  RETURN_NOT_OK(partition_schema.CreatePartitions(1, &partitions));
1539
1540
34.5k
  TabletInfos tablets;
1541
1542
34.5k
  if (create_table) {
1543
    // Fill in details for the system table.
1544
34.5k
    CreateTableRequestPB req;
1545
34.5k
    req.set_name(table_name);
1546
34.5k
    req.set_table_type(TableType::YQL_TABLE_TYPE);
1547
1548
34.5k
    RETURN_NOT_OK(CreateTableInMemory(
1549
34.5k
        req, schema, partition_schema, namespace_id, namespace_name,
1550
34.5k
        partitions, nullptr, &tablets, nullptr, &table));
1551
    // Mark the table as a system table.
1552
34.5k
    LOG_WITH_PREFIX(INFO) << "Inserted new " << namespace_name << "." << table_name
1553
34.5k
                          << " table info into CatalogManager maps";
1554
    // Update the on-disk table state to "running".
1555
34.5k
    table->mutable_metadata()->mutable_dirty()->pb.set_state(SysTablesEntryPB::RUNNING);
1556
34.5k
    RETURN_NOT_OK(sys_catalog_->Upsert(term, table));
1557
34.5k
    LOG_WITH_PREFIX(INFO) << "Wrote table to system catalog: " << ToString(table) << ", tablets: "
1558
34.5k
                          << ToString(tablets);
1559
34.5k
  } else {
1560
    // Still need to create the tablets.
1561
1
    tablets = VERIFY_RESULT(CreateTabletsFromTable(partitions, table));
1562
1
  }
1563
1564
34.5k
  DCHECK_EQ(1, tablets.size());
1565
  // We use LOG_ASSERT here since this is expected to crash in some unit tests.
1566
34.5k
  LOG_ASSERT(!FLAGS_TEST_catalog_manager_simulate_system_table_create_failure);
1567
1568
  // Write tablets to sys-tablets (in "running" state, since we don't want the load balancer to
1569
  // assign these tablets, as this table is virtual).
1570
34.5k
  for (const auto& tablet : tablets) {
1571
34.5k
    tablet->mutable_metadata()->mutable_dirty()->pb.set_state(SysTabletsEntryPB::RUNNING);
1572
34.5k
  }
1573
34.5k
  RETURN_NOT_OK(sys_catalog_->Upsert(term, tablets));
1574
34.5k
  LOG_WITH_PREFIX(INFO) << "Wrote tablets to system catalog: " << ToString(tablets);
1575
1576
  // Commit the in-memory state.
1577
34.5k
  if (create_table) {
1578
34.5k
    table->mutable_metadata()->CommitMutation();
1579
34.5k
  }
1580
1581
34.5k
  for (const auto& tablet : tablets) {
1582
34.5k
    tablet->mutable_metadata()->CommitMutation();
1583
34.5k
  }
1584
  // Mark the table as a system table.
1585
34.5k
  table->set_is_system();
1586
1587
  // Finally create the appropriate tablet object.
1588
34.5k
  auto tablet = tablets[0];
1589
34.5k
  system_tablets_[tablet->tablet_id()] =
1590
34.5k
      std::make_shared<SystemTablet>(schema, std::move(yql_storage), tablet->tablet_id());
1591
34.5k
  return Status::OK();
1592
34.5k
}
1593
1594
55.8k
bool IsYcqlNamespace(const NamespaceInfo& ns) {
1595
55.8k
  return ns.database_type() == YQLDatabase::YQL_DATABASE_CQL;
1596
55.8k
}
1597
1598
1.28M
bool IsYcqlTable(const TableInfo& table) {
1599
1.28M
  return table.GetTableType() == TableType::YQL_TABLE_TYPE && table.id() != kSysCatalogTableId;
1600
1.28M
}
1601
1602
Status CatalogManager::PrepareNamespace(
1603
9.02k
    YQLDatabase db_type, const NamespaceName& name, const NamespaceId& id, int64_t term) {
1604
1605
9.02k
  scoped_refptr<NamespaceInfo> ns = FindPtrOrNull(namespace_ids_map_, id);
1606
9.02k
  if (ns != nullptr) {
1607
2.53k
    LOG_WITH_PREFIX(INFO)
1608
2.53k
        << "Keyspace " << ns->ToString() << " already created, skipping initialization";
1609
2.53k
    return Status::OK();
1610
2.53k
  }
1611
1612
  // Create entry.
1613
6.48k
  SysNamespaceEntryPB ns_entry;
1614
6.48k
  ns_entry.set_name(name);
1615
6.48k
  ns_entry.set_database_type(db_type);
1616
6.48k
  ns_entry.set_state(SysNamespaceEntryPB::RUNNING);
1617
1618
  // Create the in-memory object.
1619
6.48k
  ns = new NamespaceInfo(id);
1620
1621
  // Prepare write.
1622
6.48k
  auto l = ns->LockForWrite();
1623
6.48k
  l.mutable_data()->pb = std::move(ns_entry);
1624
1625
6.48k
  namespace_ids_map_[id] = ns;
1626
6.48k
  namespace_names_mapper_[db_type][l.mutable_data()->pb.name()] = ns;
1627
1628
  // Write to sys_catalog and in memory.
1629
6.48k
  RETURN_NOT_OK(sys_catalog_->Upsert(term, ns));
1630
6.48k
  l.Commit();
1631
1632
6.48k
  LOG_WITH_PREFIX(INFO) << "Created default keyspace: " << ns->ToString();
1633
6.48k
  return Status::OK();
1634
6.48k
}
1635
1636
7.94k
Status CatalogManager::CheckLocalHostInMasterAddresses() {
1637
7.94k
  auto local_hostport = master_->first_rpc_address();
1638
7.94k
  std::vector<IpAddress> local_addrs;
1639
1640
7.94k
  if (local_hostport.address().is_unspecified()) {
1641
0
    auto status = GetLocalAddresses(&local_addrs, AddressFilter::ANY);
1642
0
    if (!status.ok() || local_addrs.empty()) {
1643
0
      LOG(WARNING) << "Could not enumerate network interfaces due to " << status << ", found "
1644
0
                   << local_addrs.size() << " local addresses.";
1645
0
      return Status::OK();
1646
0
    }
1647
7.94k
  } else {
1648
7.94k
    for (auto const &addr : master_->rpc_addresses()) {
1649
7.94k
      local_addrs.push_back(addr.address());
1650
7.94k
    }
1651
7.94k
  }
1652
1653
7.94k
  auto resolved_addresses = VERIFY_RESULT(server::ResolveMasterAddresses(
1654
7.94k
      *master_->opts().GetMasterAddresses()));
1655
1656
14.1k
  for (auto const &addr : resolved_addresses) {
1657
14.1k
    if (addr.address().is_unspecified() ||
1658
14.1k
        std::find(local_addrs.begin(), local_addrs.end(), addr.address()) !=
1659
14.1k
            local_addrs.end()) {
1660
7.94k
      return Status::OK();
1661
7.94k
    }
1662
14.1k
  }
1663
0
  return STATUS_SUBSTITUTE(IllegalState,
1664
7.94k
      "None of the local addresses are present in master_addresses $0.",
1665
7.94k
      master_->opts().master_addresses_flag);
1666
7.94k
}
1667
1668
8.03k
Status CatalogManager::InitSysCatalogAsync() {
1669
8.03k
  LockGuard lock(mutex_);
1670
1671
  // Optimistically try to load data from disk.
1672
8.03k
  Status s = sys_catalog_->Load(master_->fs_manager());
1673
1674
8.03k
  if (!s.ok() && s.IsNotFound()) {
1675
    // We have yet to initialize the sys catalog metadata, so we need to create the metadata file.
1676
8.00k
    LOG(INFO) << "Did not find previous SysCatalogTable data on disk. " << s;
1677
1678
8.00k
    if (!master_->opts().AreMasterAddressesProvided()) {
1679
58
      master_->SetShellMode(true);
1680
58
      LOG(INFO) << "Starting master in shell mode.";
1681
58
      return Status::OK();
1682
58
    }
1683
1684
7.94k
    RETURN_NOT_OK(CheckLocalHostInMasterAddresses());
1685
7.94k
    RETURN_NOT_OK_PREPEND(sys_catalog_->CreateNew(master_->fs_manager()),
1686
7.94k
        Substitute("Encountered errors during system catalog initialization:"
1687
7.94k
                   "\n\tError on Load: $0\n\tError on CreateNew: ", s.ToString()));
1688
1689
7.93k
    return Status::OK();
1690
7.94k
  }
1691
1692
31
  return s;
1693
8.03k
}
1694
1695
31.7M
bool CatalogManager::IsInitialized() const {
1696
31.7M
  std::lock_guard<simple_spinlock> l(state_lock_);
1697
31.7M
  return state_ == kRunning;
1698
31.7M
}
1699
1700
// TODO - delete this API after HandleReportedTablet() usage is removed.
1701
458k
Status CatalogManager::CheckIsLeaderAndReady() const {
1702
458k
  std::lock_guard<simple_spinlock> l(state_lock_);
1703
458k
  if (PREDICT_FALSE(state_ != kRunning)) {
1704
39
    return STATUS_SUBSTITUTE(ServiceUnavailable,
1705
39
        "Catalog manager is shutting down. State: $0", state_);
1706
39
  }
1707
458k
  string uuid = master_->fs_manager()->uuid();
1708
458k
  if (master_->opts().IsShellMode()) {
1709
    // Consensus and other internal fields should not be checked when in shell mode.
1710
0
    return STATUS_SUBSTITUTE(IllegalState,
1711
0
        "Catalog manager of $0 is in shell mode, not the leader", uuid);
1712
0
  }
1713
458k
  Consensus* consensus = tablet_peer()->consensus();
1714
458k
  if (consensus == nullptr) {
1715
0
    return STATUS(IllegalState, "Consensus has not been initialized yet");
1716
0
  }
1717
458k
  ConsensusStatePB cstate = consensus->ConsensusState(CONSENSUS_CONFIG_COMMITTED);
1718
458k
  if (PREDICT_FALSE(!cstate.has_leader_uuid() || cstate.leader_uuid() != uuid)) {
1719
6
    return STATUS_SUBSTITUTE(IllegalState,
1720
6
        "Not the leader. Local UUID: $0, Consensus state: $1", uuid, cstate.ShortDebugString());
1721
6
  }
1722
458k
  if (PREDICT_FALSE(leader_ready_term_ != cstate.current_term())) {
1723
0
    return STATUS_SUBSTITUTE(ServiceUnavailable,
1724
0
        "Leader not yet ready to serve requests: ready term $0 vs cstate term $1",
1725
0
        leader_ready_term_, cstate.current_term());
1726
0
  }
1727
458k
  return Status::OK();
1728
458k
}
1729
1730
34.5M
std::shared_ptr<tablet::TabletPeer> CatalogManager::tablet_peer() const {
1731
34.5M
  return sys_catalog_->tablet_peer();
1732
34.5M
}
1733
1734
28.8M
PeerRole CatalogManager::Role() const {
1735
28.8M
  if (!IsInitialized() || master_->opts().IsShellMode()) {
1736
317
    return PeerRole::NON_PARTICIPANT;
1737
317
  }
1738
1739
28.8M
  return tablet_peer()->consensus()->role();
1740
28.8M
}
1741
1742
278
bool CatalogManager::StartShutdown() {
1743
278
  {
1744
278
    std::lock_guard<simple_spinlock> l(state_lock_);
1745
278
    if (state_ == kClosing) {
1746
178
      VLOG(2) << "CatalogManager already shut down";
1747
178
      return false;
1748
178
    }
1749
100
    state_ = kClosing;
1750
100
  }
1751
1752
0
  refresh_yql_partitions_task_.StartShutdown();
1753
1754
100
  refresh_ysql_tablespace_info_task_.StartShutdown();
1755
1756
100
  if (sys_catalog_) {
1757
100
    sys_catalog_->StartShutdown();
1758
100
  }
1759
1760
100
  return true;
1761
278
}
1762
1763
94
void CatalogManager::CompleteShutdown() {
1764
  // Shutdown the Catalog Manager background thread (load balancing).
1765
94
  refresh_yql_partitions_task_.CompleteShutdown();
1766
94
  refresh_ysql_tablespace_info_task_.CompleteShutdown();
1767
1768
94
  if (background_tasks_) {
1769
83
    background_tasks_->Shutdown();
1770
83
  }
1771
94
  if (background_tasks_thread_pool_) {
1772
93
    background_tasks_thread_pool_->Shutdown();
1773
93
  }
1774
94
  if (leader_initialization_pool_) {
1775
93
    leader_initialization_pool_->Shutdown();
1776
93
  }
1777
94
  if (async_task_pool_) {
1778
93
    async_task_pool_->Shutdown();
1779
93
  }
1780
1781
  // Mark all outstanding table tasks as aborted and wait for them to fail.
1782
  //
1783
  // There may be an outstanding table visitor thread modifying the table map,
1784
  // so we must make a copy of it before we iterate. It's OK if the visitor
1785
  // adds more entries to the map even after we finish; it won't start any new
1786
  // tasks for those entries.
1787
94
  vector<scoped_refptr<TableInfo>> copy;
1788
94
  {
1789
94
    SharedLock lock(mutex_);
1790
94
    AppendValuesFromMap(*table_ids_map_, &copy);
1791
94
  }
1792
94
  AbortAndWaitForAllTasks(copy);
1793
1794
  // Shut down the underlying storage for tables and tablets.
1795
94
  if (sys_catalog_) {
1796
92
    sys_catalog_->CompleteShutdown();
1797
92
  }
1798
1799
  // Reset the jobs/tasks tracker.
1800
94
  tasks_tracker_->Reset();
1801
94
  jobs_tracker_->Reset();
1802
1803
94
  if (initdb_future_ && initdb_future_->wait_for(0s) != std::future_status::ready) {
1804
0
    LOG(WARNING) << "initdb is still running, waiting for it to complete.";
1805
0
    initdb_future_->wait();
1806
0
    LOG(INFO) << "Finished running initdb, proceeding with catalog manager shutdown.";
1807
0
  }
1808
94
}
1809
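
StartShutdown and CompleteShutdown split teardown into two phases: a quick state flip under the spinlock that elects a single winner, followed by the actual resource teardown outside it. A minimal sketch of that two-phase idiom (names are illustrative):

#include <iostream>
#include <mutex>

enum class State { kRunning, kClosing };

std::mutex state_lock;
State state = State::kRunning;

// Phase 1: flip the state exactly once; concurrent callers bail out.
bool StartShutdown() {
  std::lock_guard<std::mutex> l(state_lock);
  if (state == State::kClosing) return false;  // someone else is already shutting down
  state = State::kClosing;
  return true;
}

// Phase 2: the single winner tears down resources outside the spinlock.
void CompleteShutdown() {
  std::cout << "stopping background tasks and thread pools\n";
}

int main() {
  if (StartShutdown()) CompleteShutdown();
  std::cout << "second StartShutdown returns " << StartShutdown() << "\n";  // prints 0
}
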
1810
Status CatalogManager::AbortTableCreation(TableInfo* table,
1811
                                          const TabletInfos& tablets,
1812
                                          const Status& s,
1813
7
                                          CreateTableResponsePB* resp) {
1814
7
  LOG(WARNING) << s;
1815
1816
7
  const TableId table_id = table->id();
1817
7
  const TableName table_name = table->mutable_metadata()->mutable_dirty()->pb.name();
1818
7
  const NamespaceId table_namespace_id =
1819
7
      table->mutable_metadata()->mutable_dirty()->pb.namespace_id();
1820
7
  vector<string> tablet_ids_to_erase;
1821
14
  for (const auto& tablet : tablets) {
1822
14
    tablet_ids_to_erase.push_back(tablet->tablet_id());
1823
14
  }
1824
1825
7
  LOG(INFO) << "Aborting creation of table '" << table_name << "', erasing table and tablets (" <<
1826
7
      JoinStrings(tablet_ids_to_erase, ",") << ") from in-memory state.";
1827
1828
  // Since this is a failed creation attempt, it's safe to just abort
1829
  // all tasks, as (by definition) no tasks may be pending against a
1830
  // table that has failed to successfully create.
1831
7
  table->AbortTasksAndClose();
1832
7
  table->WaitTasksCompletion();
1833
1834
7
  LockGuard lock(mutex_);
1835
1836
  // Call AbortMutation() manually, as otherwise the lock won't be released.
1837
14
  for (const auto& tablet : tablets) {
1838
14
    tablet->mutable_metadata()->AbortMutation();
1839
14
  }
1840
7
  table->mutable_metadata()->AbortMutation();
1841
7
  auto tablet_map_checkout = tablet_map_.CheckOut();
1842
14
  for (const TabletId& tablet_id_to_erase : tablet_ids_to_erase) {
1843
14
    CHECK_EQ(tablet_map_checkout->erase(tablet_id_to_erase), 1)
1844
0
        << "Unable to erase tablet " << tablet_id_to_erase << " from tablet map.";
1845
14
  }
1846
1847
7
  auto table_ids_map_checkout = table_ids_map_.CheckOut();
1848
7
  table_names_map_.erase({table_namespace_id, table_name}); // Not present if PGSQL table.
1849
7
  CHECK_EQ(table_ids_map_checkout->erase(table_id), 1)
1850
0
      << "Unable to erase table with id " << table_id << " from table ids map.";
1851
1852
7
  if (IsYcqlTable(*table)) {
1853
7
    GetYqlPartitionsVtable().RemoveFromCache(table->id());
1854
7
  }
1855
7
  return CheckIfNoLongerLeaderAndSetupError(s, resp);
1856
7
}
1857
1858
Result<ReplicationInfoPB> CatalogManager::GetTableReplicationInfo(
1859
  const ReplicationInfoPB& table_replication_info,
1860
56.9k
  const TablespaceId& tablespace_id) {
1861
1862
56.9k
  if (IsReplicationInfoSet(table_replication_info)) {
1863
    // The table has custom replication info set for it, return it if valid.
1864
5
    RETURN_NOT_OK(ValidateTableReplicationInfo(table_replication_info));
1865
5
    return table_replication_info;
1866
5
  }
1867
  // Table level replication info not set. Check whether the table is
1868
  // associated with a tablespace and if so, return the tablespace
1869
  // replication info.
1870
56.8k
  if (GetAtomicFlag(&FLAGS_enable_ysql_tablespaces_for_placement)) {
1871
56.8k
    boost::optional<ReplicationInfoPB> tablespace_pb =
1872
56.8k
      VERIFY_RESULT(GetTablespaceReplicationInfoWithRetry(tablespace_id));
1873
56.8k
    if (tablespace_pb) {
1874
      // Return the tablespace placement.
1875
728
      return tablespace_pb.value();
1876
728
    }
1877
56.8k
  }
1878
1879
  // Neither table nor tablespace info set. Return cluster level replication info.
1880
56.1k
  auto l = ClusterConfig()->LockForRead();
1881
56.1k
  return l->pb.replication_info();
1882
56.8k
}
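The resolution order just implemented, restated as a pure function over invented types: table-level replication info wins, then the tablespace's, and the cluster-wide default is the fallback.

#include <optional>

struct ReplicationInfo { int num_replicas = 0; };

ReplicationInfo ResolveReplicationInfo(
    const std::optional<ReplicationInfo>& table_level,
    const std::optional<ReplicationInfo>& tablespace_level,
    const ReplicationInfo& cluster_default) {
  if (table_level) return *table_level;            // explicit per-table policy
  if (tablespace_level) return *tablespace_level;  // inherited from tablespace
  return cluster_default;                          // cluster config fallback
}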
1883
1884
1.00M
std::shared_ptr<YsqlTablespaceManager> CatalogManager::GetTablespaceManager() const {
1885
1.00M
  SharedLock lock(tablespace_mutex_);
1886
1.00M
  return tablespace_manager_;
1887
1.00M
}
1888
1889
Result<boost::optional<TablespaceId>> CatalogManager::GetTablespaceForTable(
1890
1
    const scoped_refptr<TableInfo>& table) {
1891
1892
1
  auto tablespace_manager = GetTablespaceManager();
1893
1
  return tablespace_manager->GetTablespaceForTable(table);
1894
1
}
1895
1896
Result<boost::optional<ReplicationInfoPB>> CatalogManager::GetTablespaceReplicationInfoWithRetry(
1897
57.0k
  const TablespaceId& tablespace_id) {
1898
1899
57.0k
  auto tablespace_manager = GetTablespaceManager();
1900
57.0k
  auto replication_info_result = tablespace_manager->GetTablespaceReplicationInfo(tablespace_id);
1901
1902
57.0k
  if (replication_info_result) {
1903
56.9k
    return replication_info_result;
1904
56.9k
  }
1905
1906
  // We failed to find the tablespace placement policy. Refresh the tablespace info and try again.
1907
19
  auto tablespace_map = VERIFY_RESULT(GetYsqlTablespaceInfo());
1908
1909
  // We clone the tablespace_manager and update the clone with the new tablespace_map that we
1910
  // fetched above. We do this instead of updating the tablespace_manager object in-place because
1911
  // other clients may have a shared_ptr to it through 'GetTablespaceManager()'.
1912
0
  tablespace_manager = tablespace_manager->CreateCloneWithTablespaceMap(tablespace_map);
1913
19
  {
1914
19
    LockGuard lock(tablespace_mutex_);
1915
19
    tablespace_manager_ = tablespace_manager;
1916
19
  }
1917
1918
19
  return tablespace_manager->GetTablespaceReplicationInfo(tablespace_id);
1919
19
}
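A sketch of the clone-and-swap refresh used above. Because callers of GetTablespaceManager() hold a shared_ptr snapshot, the refresher never mutates the live object in place; it builds a replacement and swaps the pointer under the mutex. The map type and names below are invented for the example.

#include <map>
#include <memory>
#include <mutex>
#include <string>

using TablespaceMap = std::map<std::string, std::string>;

std::mutex manager_mutex;
std::shared_ptr<const TablespaceMap> manager =
    std::make_shared<const TablespaceMap>();

std::shared_ptr<const TablespaceMap> Get() {
  std::lock_guard<std::mutex> lock(manager_mutex);
  return manager;  // readers keep their snapshot alive via the refcount
}

void Refresh(TablespaceMap fresh) {
  auto replacement = std::make_shared<const TablespaceMap>(std::move(fresh));
  std::lock_guard<std::mutex> lock(manager_mutex);
  manager = std::move(replacement);  // old snapshot stays valid for readers
}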
1920
1921
239k
bool CatalogManager::IsReplicationInfoSet(const ReplicationInfoPB& replication_info) {
1922
239k
  const auto& live_placement_info = replication_info.live_replicas();
1923
239k
  if (!(live_placement_info.placement_blocks().empty() &&
1924
239k
        live_placement_info.num_replicas() <= 0 &&
1925
239k
        live_placement_info.placement_uuid().empty()) ||
1926
239k
      !replication_info.read_replicas().empty() ||
1927
239k
      !replication_info.affinitized_leaders().empty()) {
1928
1929
621
      return true;
1930
621
  }
1931
239k
  return false;
1932
239k
}
1933
1934
428
Status CatalogManager::ValidateTableReplicationInfo(const ReplicationInfoPB& replication_info) {
1935
428
  if (!IsReplicationInfoSet(replication_info)) {
1936
0
    return STATUS(InvalidArgument, "No replication info set.");
1937
0
  }
1938
  // We don't support setting any fields other than live replica placements for now.
1939
428
  if (!replication_info.read_replicas().empty() ||
1940
428
      !replication_info.affinitized_leaders().empty()) {
1941
1942
0
      return STATUS(InvalidArgument, "Only live placement info can be set for table "
1943
0
          "level replication info.");
1944
0
  }
1945
  // Today we support setting table level replication info only in clusters where read replica
1946
  // placements are not set. Return an error if the cluster has read replica placements set.
1947
428
  auto l = ClusterConfig()->LockForRead();
1948
428
  const ReplicationInfoPB& cluster_replication_info = l->pb.replication_info();
1949
  // TODO(bogdan): figure this out when we expand on geopartition support.
1950
  // if (!cluster_replication_info.read_replicas().empty() ||
1951
  //     !cluster_replication_info.affinitized_leaders().empty()) {
1952
1953
  //     return STATUS(InvalidArgument, "Setting table level replication info is not supported "
1954
  //         "for clusters with read replica placements");
1955
  // }
1956
  // If the replication info has placement_uuid set, verify that it matches the cluster
1957
  // placement_uuid.
1958
428
  if (replication_info.live_replicas().placement_uuid().empty()) {
1959
426
    return Status::OK();
1960
426
  }
1961
2
  if (replication_info.live_replicas().placement_uuid() !=
1962
2
      cluster_replication_info.live_replicas().placement_uuid()) {
1963
1964
0
      return STATUS(InvalidArgument, "Placement uuid for table level replication info "
1965
0
          "must match that of the cluster's live placement info.");
1966
0
  }
1967
2
  return Status::OK();
1968
2
}
1969
1970
4.15k
Result<shared_ptr<TablespaceIdToReplicationInfoMap>> CatalogManager::GetYsqlTablespaceInfo() {
1971
4.15k
  auto table_info = GetTableInfo(kPgTablespaceTableId);
1972
4.15k
  if (table_info == nullptr) {
1973
166
    return STATUS(InternalError, "pg_tablespace table info not found");
1974
166
  }
1975
1976
3.98k
  auto tablespace_map = VERIFY_RESULT(sys_catalog_->ReadPgTablespaceInfo());
1977
1978
  // The tablespace options do not usually contain the placement uuid.
1979
  // Populate the current cluster placement uuid into the placement information for
1980
  // each tablespace.
1981
0
  string placement_uuid;
1982
3.98k
  {
1983
3.98k
    auto l = ClusterConfig()->LockForRead();
1984
    // TODO(deepthi.srinivasan): Read-replica placements are not supported as
1985
    // of now.
1986
3.98k
    placement_uuid = l->pb.replication_info().live_replicas().placement_uuid();
1987
3.98k
  }
1988
3.98k
  if (!placement_uuid.empty()) {
1989
4
    for (auto& iter : *tablespace_map) {
1990
4
      if (iter.second) {
1991
0
        iter.second.value().mutable_live_replicas()->set_placement_uuid(placement_uuid);
1992
0
      }
1993
4
    }
1994
2
  }
1995
1996
  // Before updating the tablespace placement map, validate the
1997
  // placement policies.
1998
8.39k
  for (auto& iter : *tablespace_map) {
1999
8.39k
    if (iter.second) {
2000
419
      RETURN_NOT_OK(ValidateTableReplicationInfo(iter.second.value()));
2001
419
    }
2002
8.39k
  }
2003
2004
3.98k
  return tablespace_map;
2005
3.98k
}
2006
2007
boost::optional<TablespaceId> CatalogManager::GetTransactionStatusTableTablespace(
2008
6.90k
    const scoped_refptr<TableInfo>& table) {
2009
6.90k
  auto lock = table->LockForRead();
2010
6.90k
  if (lock->pb.table_type() != TRANSACTION_STATUS_TABLE_TYPE) {
2011
2
    return boost::none;
2012
2
  }
2013
2014
6.90k
  if (!lock->pb.has_transaction_table_tablespace_id()) {
2015
5.63k
    return boost::none;
2016
5.63k
  }
2017
2018
1.26k
  return lock->pb.transaction_table_tablespace_id();
2019
6.90k
}
2020
2021
7
void CatalogManager::ClearTransactionStatusTableTablespace(const scoped_refptr<TableInfo>& table) {
2022
7
  auto lock = table->LockForWrite();
2023
7
  if (lock->pb.table_type() != TRANSACTION_STATUS_TABLE_TYPE) {
2024
0
    return;
2025
0
  }
2026
2027
7
  lock.mutable_data()->pb.clear_transaction_table_tablespace_id();
2028
7
  lock.mutable_data()->pb.set_version(lock.mutable_data()->pb.version() + 1);
2029
7
  lock.Commit();
2030
7
}
2031
2032
bool CatalogManager::CheckTransactionStatusTablesWithMissingTablespaces(
2033
3.96k
    const TablespaceIdToReplicationInfoMap& tablespace_info) {
2034
3.96k
  SharedLock lock(mutex_);
2035
3.96k
  for (const auto& table_id : transaction_table_ids_set_) {
2036
2.02k
    auto table = table_ids_map_->find(table_id);
2037
2.02k
    if (table == table_ids_map_->end()) {
2038
0
      LOG(DFATAL) << "Table uuid " << table_id
2039
0
                  << " in transaction_table_ids_set_ but not in table_ids_map_";
2040
0
      continue;
2041
0
    }
2042
2.02k
    auto tablespace_id = GetTransactionStatusTableTablespace(table->second);
2043
2.02k
    if (tablespace_id) {
2044
271
      if (!tablespace_info.count(*tablespace_id)) {
2045
3
        return true;
2046
3
      }
2047
271
    }
2048
2.02k
  }
2049
3.96k
  return false;
2050
3.96k
}
2051
2052
Status CatalogManager::UpdateTransactionStatusTableTablespaces(
2053
3.96k
    const TablespaceIdToReplicationInfoMap& tablespace_info) {
2054
3.96k
  if (CheckTransactionStatusTablesWithMissingTablespaces(tablespace_info)) {
2055
3
    {
2056
3
      LockGuard lock(mutex_);
2057
21
      for (const auto& table_id : transaction_table_ids_set_) {
2058
21
        auto table = table_ids_map_->find(table_id);
2059
21
        if (table == table_ids_map_->end()) {
2060
0
          LOG(DFATAL) << "Table uuid " << table_id
2061
0
                      << " in transaction_table_ids_set_ but not in table_ids_map_";
2062
0
          continue;
2063
0
        }
2064
21
        auto tablespace_id = GetTransactionStatusTableTablespace(table->second);
2065
21
        if (tablespace_id) {
2066
12
          if (!tablespace_info.count(*tablespace_id)) {
2067
            // TODO: We should also delete the transaction table, see #11123.
2068
7
            LOG(INFO) << "Found transaction status table for tablespace id " << *tablespace_id
2069
7
                      << " which doesn't exist, clearing tablespace id";
2070
7
            ClearTransactionStatusTableTablespace(table->second);
2071
7
          }
2072
12
        }
2073
21
      }
2074
3
    }
2075
2076
    // A tablespace id has been cleared, meaning a transaction table's placement has changed,
2077
    // and thus the transaction tables version needs to be incremented.
2078
3
    RETURN_NOT_OK(IncrementTransactionTablesVersion());
2079
3
  }
2080
2081
3.96k
  return Status::OK();
2082
3.96k
}
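A sketch of the check-then-act shape above: a cheap scan under the shared lock decides whether any repair is needed, and only then is the exclusive lock taken and the scan repeated, because the state may change between the two acquisitions. Invented names, standard library only.

#include <set>
#include <shared_mutex>

std::shared_mutex state_mutex;
std::set<int> ids_needing_repair;

bool AnyNeedRepair() {
  std::shared_lock<std::shared_mutex> lock(state_mutex);  // readers in parallel
  return !ids_needing_repair.empty();
}

void RepairIfNeeded() {
  if (!AnyNeedRepair()) return;  // fast path: exclusive lock never taken
  std::unique_lock<std::shared_mutex> lock(state_mutex);
  // Re-examine everything under the exclusive lock before acting on it.
  ids_needing_repair.clear();
}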
2083
2084
Result<shared_ptr<TableToTablespaceIdMap>> CatalogManager::GetYsqlTableToTablespaceMap(
2085
201
    const TablespaceIdToReplicationInfoMap& tablespace_info) {
2086
201
  auto table_to_tablespace_map = std::make_shared<TableToTablespaceIdMap>();
2087
2088
  // First fetch all namespaces. This is because the table_to_tablespace information is only
2089
  // found in the pg_class catalog table. There exists a separate pg_class table in each
2090
  // namespace. To build in-memory state for all tables, process pg_class table for each
2091
  // namespace.
2092
201
  vector<NamespaceId> namespace_id_vec;
2093
201
  set<NamespaceId> colocated_namespaces;
2094
201
  {
2095
201
    SharedLock lock(mutex_);
2096
1.78k
    for (const auto& ns : namespace_ids_map_) {
2097
1.78k
      if (ns.second->database_type() != YQL_DATABASE_PGSQL) {
2098
603
        continue;
2099
603
      }
2100
2101
1.18k
      if (ns.first == kPgSequencesDataNamespaceId) {
2102
        // Skip the database created for sequences system table.
2103
124
        continue;
2104
124
      }
2105
2106
1.05k
      if (ns.second->colocated()) {
2107
20
        colocated_namespaces.insert(ns.first);
2108
20
      }
2109
2110
      // TODO (Deepthi): Investigate if safe to skip template0 and template1 as well.
2111
1.05k
      namespace_id_vec.emplace_back(ns.first);
2112
1.05k
    }
2113
2114
    // Add local transaction tables corresponding to tablespaces.
2115
487
    for (const auto& table_id : transaction_table_ids_set_) {
2116
487
      auto table = table_ids_map_->find(table_id);
2117
487
      if (table == table_ids_map_->end()) {
2118
0
        LOG(DFATAL) << "Table uuid " << table_id
2119
0
                    << " in transaction_table_ids_set_ but not in table_ids_map_";
2120
0
        continue;
2121
0
      }
2122
487
      auto tablespace_id = GetTransactionStatusTableTablespace(table->second);
2123
487
      if (tablespace_id) {
2124
271
        if (tablespace_info.count(*tablespace_id)) {
2125
271
          (*table_to_tablespace_map)[table_id] = *tablespace_id;
2126
271
        } else {
2127
          // It's possible that a new tablespace had its transaction table created then deleted
2128
          // between when we checked tablespace ids and now; we ignore it here, and it will be
2129
          // caught and cleared in the next tablespace update.
2130
0
          LOG(INFO) << "Found transaction status table for tablespace id " << *tablespace_id
2131
0
                    << " which doesn't exist, ignoring";
2132
0
        }
2133
271
      }
2134
487
    }
2135
201
  }
2136
2137
  // For each namespace, fetch the table->tablespace information by reading pg_class
2138
  // table for each namespace.
2139
1.05k
  for (const NamespaceId& nsid : namespace_id_vec) {
2140
1.05k
    VLOG(1) << "Refreshing placement information for namespace " << nsid;
2141
1.05k
    const uint32_t database_oid = CHECK_RESULT(GetPgsqlDatabaseOid(nsid));
2142
1.05k
    const bool is_colocated_database = colocated_namespaces.count(nsid) > 0;
2143
1.05k
    Status table_tablespace_status = sys_catalog_->ReadPgClassInfo(database_oid,
2144
1.05k
                                                                   is_colocated_database,
2145
1.05k
                                                                   table_to_tablespace_map.get());
2146
1.05k
    if (!table_tablespace_status.ok()) {
2147
7
      LOG(WARNING) << "Refreshing table->tablespace info failed for namespace "
2148
7
                   << nsid << " with error: " << table_tablespace_status.ToString();
2149
7
    }
2150
2151
1.05k
    const bool pg_yb_tablegroup_exists = VERIFY_RESULT(DoesTableExist(FindTableById(
2152
1.05k
      GetPgsqlTableId(database_oid, kPgYbTablegroupTableOid))));
2153
2154
    // no pg_yb_tablegroup means we only need to check pg_class
2155
1.05k
    if (table_tablespace_status.ok() && !pg_yb_tablegroup_exists) {
2156
0
      VLOG(5) << "Successfully refreshed placement information for namespace " << nsid
2157
0
              << " from pg_class";
2158
0
      continue;
2159
0
    }
2160
2161
1.05k
    Status tablegroup_tablespace_status = sys_catalog_->ReadTablespaceInfoFromPgYbTablegroup(
2162
1.05k
      database_oid,
2163
1.05k
      table_to_tablespace_map.get());
2164
1.05k
    if (!tablegroup_tablespace_status.ok()) {
2165
7
      LOG(WARNING) << "Refreshing tablegroup->tablespace info failed for namespace "
2166
7
                  << nsid << " with error: " << tablegroup_tablespace_status.ToString();
2167
7
    }
2168
1.05k
    if (table_tablespace_status.ok() && tablegroup_tablespace_status.ok()) {
2169
1.04k
      VLOG(5) << "Successfully refreshed placement information for namespace " << nsid
2170
0
              << " from pg_class and pg_yb_tablegroup";
2171
1.04k
    }
2172
1.05k
  }
2173
2174
201
  return table_to_tablespace_map;
2175
201
}
2176
2177
Status CatalogManager::CreateTransactionStatusTablesForTablespaces(
2178
    const TablespaceIdToReplicationInfoMap& tablespace_info,
2179
201
    const TableToTablespaceIdMap& table_to_tablespace_map) {
2180
201
  if (!GetAtomicFlag(&FLAGS_enable_ysql_tablespaces_for_placement) ||
2181
201
      !GetAtomicFlag(&FLAGS_auto_create_local_transaction_tables)) {
2182
0
    return Status::OK();
2183
0
  }
2184
2185
201
  std::unordered_set<TablespaceId> valid_tablespaces;
2186
1.72k
  for (const auto& entry : table_to_tablespace_map) {
2187
1.72k
    if (entry.second) {
2188
902
      valid_tablespaces.insert(*entry.second);
2189
902
    }
2190
1.72k
  }
2191
787
  for (const auto& entry : tablespace_info) {
2192
787
    if (!entry.second) {
2193
406
      valid_tablespaces.erase(entry.first);
2194
406
    }
2195
787
  }
2196
2197
272
  for (const auto& tablespace_id : valid_tablespaces) {
2198
272
    RETURN_NOT_OK(CreateLocalTransactionStatusTableIfNeeded(nullptr /* rpc */, tablespace_id));
2199
272
  }
2200
2201
201
  return Status::OK();
2202
201
}
2203
2204
117k
void CatalogManager::StartTablespaceBgTaskIfStopped() {
2205
117k
  if (GetAtomicFlag(&FLAGS_ysql_tablespace_info_refresh_secs) <= 0 ||
2206
117k
      !GetAtomicFlag(&FLAGS_enable_ysql_tablespaces_for_placement)) {
2207
    // The tablespace bg task is disabled. Nothing to do.
2208
127
    return;
2209
127
  }
2210
2211
117k
  const bool is_task_running = tablespace_bg_task_running_.exchange(true);
2212
117k
  if (is_task_running) {
2213
    // Task already running, nothing to do.
2214
116k
    return;
2215
116k
  }
2216
2217
934
  ScheduleRefreshTablespaceInfoTask(true /* schedule_now */);
2218
934
}
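A standalone sketch of the exchange() guard above: whichever caller flips the flag from false to true wins the right to schedule the task; every other caller sees `true` come back and backs off. Names invented, standard library only.

#include <atomic>
#include <iostream>

std::atomic<bool> task_running{false};

void StartIfStopped() {
  if (task_running.exchange(true)) {
    return;  // another caller already owns the running task
  }
  std::cout << "scheduling background task\n";
  // The owner must eventually reset task_running to false when the task
  // stops, exactly as the tablespace task does on its exit paths.
}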
2219
2220
5.06k
void CatalogManager::ScheduleRefreshTablespaceInfoTask(const bool schedule_now) {
2221
5.06k
  int wait_time = 0;
2222
2223
5.06k
  if (!schedule_now) {
2224
4.13k
    wait_time = GetAtomicFlag(&FLAGS_ysql_tablespace_info_refresh_secs);
2225
4.13k
    if (wait_time <= 0) {
2226
      // The tablespace refresh task has been disabled.
2227
0
      tablespace_bg_task_running_ = false;
2228
0
      return;
2229
0
    }
2230
4.13k
  }
2231
2232
5.06k
  refresh_ysql_tablespace_info_task_.Schedule([this](const Status& status) {
2233
4.18k
    Status s = background_tasks_thread_pool_->SubmitFunc(
2234
4.18k
      std::bind(&CatalogManager::RefreshTablespaceInfoPeriodically, this));
2235
4.18k
    if (!s.IsOk()) {
2236
      // Failed to submit task to the thread pool. Mark that the task is now
2237
      // no longer running.
2238
0
      LOG(WARNING) << "Failed to schedule: RefreshTablespaceInfoPeriodically";
2239
0
      tablespace_bg_task_running_ = false;
2240
0
    }
2241
4.18k
  }, wait_time * 1s);
2242
5.06k
}
2243
2244
4.18k
void CatalogManager::RefreshTablespaceInfoPeriodically() {
2245
4.18k
  if (!GetAtomicFlag(&FLAGS_enable_ysql_tablespaces_for_placement)) {
2246
2
    tablespace_bg_task_running_ = false;
2247
2
    return;
2248
2
  }
2249
2250
4.17k
  if (!CheckIsLeaderAndReady().IsOk()) {
2251
44
    LOG(INFO) << "No longer the leader, so cancelling tablespace info task";
2252
44
    tablespace_bg_task_running_ = false;
2253
44
    return;
2254
44
  }
2255
2256
  // Refresh the tablespace info in memory.
2257
4.13k
  Status s = DoRefreshTablespaceInfo();
2258
4.13k
  if (!s.IsOk()) {
2259
166
    LOG(WARNING) << "Tablespace refresh task failed with error " << s.ToString();
2260
166
  }
2261
2262
  // Schedule the next iteration of the task.
2263
4.13k
  ScheduleRefreshTablespaceInfoTask();
2264
4.13k
}
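A sketch of the self-rescheduling loop formed by the two functions above: each iteration does its work and tail-schedules the next one, so stopping is expressed by simply not rescheduling. The toy scheduler and all names are invented; only the standard library is assumed.

#include <atomic>
#include <chrono>
#include <iostream>
#include <thread>

std::atomic<bool> keep_running{true};
std::atomic<int> iterations{0};

void ScheduleAfter(std::chrono::milliseconds delay, void (*fn)()) {
  std::thread([delay, fn] {  // stand-in for the master's timer/thread pool
    std::this_thread::sleep_for(delay);
    fn();
  }).detach();
}

void RunPeriodically() {
  if (!keep_running.load()) return;  // stop: the chain of tasks ends here
  std::cout << "refresh iteration " << ++iterations << "\n";
  if (iterations >= 3) keep_running = false;  // demo stop condition
  ScheduleAfter(std::chrono::milliseconds(50), &RunPeriodically);
}

int main() {
  RunPeriodically();
  std::this_thread::sleep_for(std::chrono::milliseconds(500));
  return 0;
}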
2265
2266
4.13k
Status CatalogManager::DoRefreshTablespaceInfo() {
2267
4.13k
  VLOG(2) << "Running RefreshTablespaceInfoPeriodically task";
2268
2269
  // First refresh the tablespace info in memory.
2270
4.13k
  auto tablespace_info = VERIFY_RESULT(GetYsqlTablespaceInfo());
2271
2272
  // Clear tablespace ids for transaction tables mapped to missing tablespaces.
2273
3.96k
  RETURN_NOT_OK(UpdateTransactionStatusTableTablespaces(*tablespace_info));
2274
2275
3.96k
  shared_ptr<TableToTablespaceIdMap> table_to_tablespace_map = nullptr;
2276
2277
3.96k
  if (tablespace_info->size() > kYsqlNumDefaultTablespaces) {
2278
    // There exist custom tablespaces in the system. Fetch the table->tablespace
2279
    // map from PG catalog tables.
2280
201
    table_to_tablespace_map = VERIFY_RESULT(GetYsqlTableToTablespaceMap(*tablespace_info));
2281
201
  }
2282
2283
  // Update tablespace_manager_.
2284
3.96k
  {
2285
3.96k
    LockGuard lock(tablespace_mutex_);
2286
3.96k
    tablespace_manager_ = std::make_shared<YsqlTablespaceManager>(tablespace_info,
2287
3.96k
                                                                  table_to_tablespace_map);
2288
3.96k
  }
2289
2290
3.96k
  if (table_to_tablespace_map) {
2291
    // Trigger transaction table creates for tablespaces with tables and no transaction tables.
2292
201
    RETURN_NOT_OK(CreateTransactionStatusTablesForTablespaces(
2293
201
        *tablespace_info, *table_to_tablespace_map));
2294
201
  }
2295
2296
3.96k
  VLOG(3) << "Refreshed tablespace information in memory";
2297
3.96k
  return Status::OK();
2298
3.96k
}
2299
2300
Status CatalogManager::AddIndexInfoToTable(const scoped_refptr<TableInfo>& indexed_table,
2301
                                           const IndexInfoPB& index_info,
2302
1.19k
                                           CreateTableResponsePB* resp) {
2303
1.19k
  LOG(INFO) << "AddIndexInfoToTable to " << indexed_table->ToString() << "  IndexInfo "
2304
1.19k
            << yb::ToString(index_info);
2305
1.19k
  TRACE("Locking indexed table");
2306
1.19k
  auto l = DCHECK_NOTNULL(indexed_table)->LockForWrite();
2307
1.19k
  RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp));
2308
2309
  // Make sure that the index appears to not have been added to the table until the tservers apply
2310
  // the alter and respond back.
2311
  // Heed issue #6233.
2312
1.19k
  if (!l->pb.has_fully_applied_schema()) {
2313
1.18k
    MultiStageAlterTable::CopySchemaDetailsToFullyApplied(&l.mutable_data()->pb);
2314
1.18k
  }
2315
2316
  // Add index info to indexed table and increment schema version.
2317
1.19k
  auto& pb = l.mutable_data()->pb;
2318
1.19k
  pb.add_indexes()->CopyFrom(index_info);
2319
1.19k
  pb.set_version(l.mutable_data()->pb.version() + 1);
2320
1.19k
  pb.set_updates_only_index_permissions(false);
2321
1.19k
  l.mutable_data()->set_state(
2322
1.19k
      SysTablesEntryPB::ALTERING,
2323
1.19k
      Format("Add index info version=$0 ts=$1", pb.version(), LocalTimeAsString()));
2324
2325
  // Update sys-catalog with the new indexed table info.
2326
1.19k
  TRACE("Updating indexed table metadata on disk");
2327
1.19k
  RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), indexed_table));
2328
2329
  // Update the in-memory state.
2330
1.19k
  TRACE("Committing in-memory state");
2331
1.19k
  l.Commit();
2332
2333
1.19k
  RETURN_NOT_OK(SendAlterTableRequest(indexed_table));
2334
2335
1.19k
  return Status::OK();
2336
1.19k
}
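A sketch of the write ordering AddIndexInfoToTable() follows: edit a dirty copy, persist it durably (the sys catalog Upsert), and only then commit the in-memory state, so readers never observe a version that failed to persist. Types and names below are invented for illustration.

#include <string>

struct SysCatalog {
  bool Upsert(const std::string&) { return true; }  // stand-in durable write
};

struct TableMeta {
  std::string committed = "schema-v1";

  bool Alter(SysCatalog* catalog, std::string next) {
    // 1) Persist the new version first; bail out on failure.
    if (!catalog->Upsert(next)) return false;
    // 2) Only now publish it to in-memory readers (the l.Commit() step).
    committed = std::move(next);
    return true;
  }
};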
2337
2338
Status CatalogManager::CreateCopartitionedTable(const CreateTableRequestPB& req,
2339
                                                CreateTableResponsePB* resp,
2340
                                                rpc::RpcContext* rpc,
2341
                                                Schema schema,
2342
0
                                                scoped_refptr<NamespaceInfo> ns) {
2343
0
  scoped_refptr<TableInfo> parent_table_info;
2344
0
  Status s;
2345
0
  PartitionSchema partition_schema;
2346
0
  std::vector<Partition> partitions;
2347
2348
0
  const NamespaceId& namespace_id = ns->id();
2349
0
  const NamespaceName& namespace_name = ns->name();
2350
2351
0
  LockGuard lock(mutex_);
2352
0
  TRACE("Acquired catalog manager lock");
2353
0
  parent_table_info = FindPtrOrNull(*table_ids_map_,
2354
0
                                    schema.table_properties().CopartitionTableId());
2355
0
  if (parent_table_info == nullptr) {
2356
0
    s = STATUS(NotFound, "The object does not exist: copartitioned table with id",
2357
0
               schema.table_properties().CopartitionTableId());
2358
0
    return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s);
2359
0
  }
2360
2361
0
  TableInfoPtr this_table_info;
2362
  // Verify that the table does not exist.
2363
0
  this_table_info = FindPtrOrNull(table_names_map_, {namespace_id, req.name()});
2364
2365
0
  if (this_table_info != nullptr) {
2366
0
    s = STATUS_SUBSTITUTE(AlreadyPresent,
2367
0
        "Object '$0.$1' already exists",
2368
0
        GetNamespaceNameUnlocked(this_table_info), this_table_info->name());
2369
0
    LOG(WARNING) << "Found table: " << this_table_info->ToStringWithState()
2370
0
                 << ". Failed creating copartitioned table with error: "
2371
0
                 << s.ToString() << " Request:\n" << req.DebugString();
2372
0
    return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s);
2373
0
  }
2374
  // Don't add copartitioned tables to Namespaces that aren't running.
2375
0
  if (ns->state() != SysNamespaceEntryPB::RUNNING) {
2376
0
    Status s = STATUS_SUBSTITUTE(TryAgain,
2377
0
        "Namespace not running (State=$0).  Cannot create $1.$2",
2378
0
        ns->state(), ns->name(), req.name() );
2379
0
    return SetupError(resp->mutable_error(), NamespaceMasterError(ns->state()), s);
2380
0
  }
2381
2382
  // TODO: pass index_info for copartitioned index.
2383
0
  RETURN_NOT_OK(CreateTableInMemory(
2384
0
      req, schema, partition_schema, namespace_id, namespace_name,
2385
0
      partitions, nullptr, nullptr, resp, &this_table_info));
2386
2387
0
  TRACE("Inserted new table info into CatalogManager maps");
2388
2389
  // NOTE: the table is already locked for write at this point,
2390
  // since the CreateTableInfo function leaves it in that state.
2391
  // It will get committed at the end of this function.
2392
  // Sanity check: the table should be in "preparing" state.
2393
0
  CHECK_EQ(SysTablesEntryPB::PREPARING, this_table_info->metadata().dirty().pb.state());
2394
0
  TabletInfos tablets = parent_table_info->GetTablets();
2395
0
  for (auto tablet : tablets) {
2396
0
    tablet->mutable_metadata()->StartMutation();
2397
0
    tablet->mutable_metadata()->mutable_dirty()->pb.add_table_ids(this_table_info->id());
2398
0
  }
2399
2400
  // Update Tablets about new table id to sys-tablets.
2401
0
  s = sys_catalog_->Upsert(leader_ready_term(), tablets);
2402
0
  if (PREDICT_FALSE(!s.ok())) {
2403
0
    return AbortTableCreation(this_table_info.get(), tablets, s.CloneAndPrepend(
2404
0
        Substitute("An error occurred while inserting to sys-tablets: $0", s.ToString())), resp);
2405
0
  }
2406
0
  TRACE("Wrote tablets to system table");
2407
2408
  // Update the on-disk table state to "running".
2409
0
  this_table_info->AddTablets(tablets);
2410
0
  this_table_info->mutable_metadata()->mutable_dirty()->pb.set_state(SysTablesEntryPB::RUNNING);
2411
0
  s = sys_catalog_->Upsert(leader_ready_term(), this_table_info);
2412
0
  if (PREDICT_FALSE(!s.ok())) {
2413
0
    return AbortTableCreation(this_table_info.get(), tablets, s.CloneAndPrepend(
2414
0
        Substitute("An error occurred while inserting to sys-tablets: $0",
2415
0
                   s.ToString())), resp);
2416
0
  }
2417
0
  TRACE("Wrote table to system table");
2418
2419
  // Commit the in-memory state.
2420
0
  this_table_info->mutable_metadata()->CommitMutation();
2421
2422
0
  for (const auto& tablet : tablets) {
2423
0
    tablet->mutable_metadata()->CommitMutation();
2424
0
  }
2425
2426
0
  for (const auto& tablet : tablets) {
2427
0
    SendCopartitionTabletRequest(tablet, this_table_info);
2428
0
  }
2429
2430
0
  LOG(INFO) << "Successfully created table " << this_table_info->ToString()
2431
0
            << " per request from " << RequestorString(rpc);
2432
0
  return Status::OK();
2433
0
}
2434
2435
2436
template <class Req, class Resp, class Action>
2437
0
Status CatalogManager::PerformOnSysCatalogTablet(const Req& req, Resp* resp, const Action& action) {
2438
0
  auto tablet_peer = sys_catalog_->tablet_peer();
2439
0
  auto shared_tablet = tablet_peer ? tablet_peer->shared_tablet() : nullptr;
2440
0
  if (!shared_tablet) {
2441
0
    return SetupError(
2442
0
        resp->mutable_error(),
2443
0
        MasterErrorPB::TABLET_NOT_RUNNING,
2444
0
        STATUS(NotFound, "The sys catalog tablet was not found."));
2445
0
  }
2446
2447
0
  auto s = action(shared_tablet);
2448
0
  if (!s.ok()) {
2449
0
    return SetupError(resp->mutable_error(), MasterErrorPB::INTERNAL_ERROR, s);
2450
0
  }
2451
2452
0
  return Status::OK();
2453
0
}
Unexecuted instantiation: catalog_manager.cc:yb::Status yb::master::CatalogManager::PerformOnSysCatalogTablet<yb::master::FlushSysCatalogRequestPB const*, yb::master::FlushSysCatalogResponsePB, yb::master::CatalogManager::FlushSysCatalog(yb::master::FlushSysCatalogRequestPB const*, yb::master::FlushSysCatalogResponsePB*, yb::rpc::RpcContext*)::$_4>(yb::master::FlushSysCatalogRequestPB const* const&, yb::master::FlushSysCatalogResponsePB*, yb::master::CatalogManager::FlushSysCatalog(yb::master::FlushSysCatalogRequestPB const*, yb::master::FlushSysCatalogResponsePB*, yb::rpc::RpcContext*)::$_4 const&)
Unexecuted instantiation: catalog_manager.cc:yb::Status yb::master::CatalogManager::PerformOnSysCatalogTablet<yb::master::CompactSysCatalogRequestPB const*, yb::master::CompactSysCatalogResponsePB, yb::master::CatalogManager::CompactSysCatalog(yb::master::CompactSysCatalogRequestPB const*, yb::master::CompactSysCatalogResponsePB*, yb::rpc::RpcContext*)::$_5>(yb::master::CompactSysCatalogRequestPB const* const&, yb::master::CompactSysCatalogResponsePB*, yb::master::CatalogManager::CompactSysCatalog(yb::master::CompactSysCatalogRequestPB const*, yb::master::CompactSysCatalogResponsePB*, yb::rpc::RpcContext*)::$_5 const&)
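A toy version of the shape of PerformOnSysCatalogTablet(): a template that factors the "locate the sys catalog tablet, run an action, map failures into the RPC response" boilerplate out of FlushSysCatalog() and CompactSysCatalog(). All types and names below are invented.

#include <iostream>
#include <string>

struct Tablet { std::string name = "sys.catalog"; };

template <class Action>
int PerformOnTablet(Tablet* tablet, const Action& action) {
  if (tablet == nullptr) return 1;  // the "tablet not running" error path
  return action(*tablet) ? 0 : 2;   // map action failure to an error code
}

int main() {
  Tablet tablet;
  return PerformOnTablet(&tablet, [](Tablet& t) {
    std::cout << "flushing " << t.name << "\n";  // stand-in for Flush(kSync)
    return true;
  });
}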
2454
2455
Status CatalogManager::FlushSysCatalog(
2456
    const FlushSysCatalogRequestPB* req,
2457
    FlushSysCatalogResponsePB* resp,
2458
0
    rpc::RpcContext* context) {
2459
0
  return PerformOnSysCatalogTablet(req, resp, [](auto shared_tablet) {
2460
0
    return shared_tablet->Flush(tablet::FlushMode::kSync);
2461
0
  });
2462
0
}
2463
2464
Status CatalogManager::CompactSysCatalog(
2465
    const CompactSysCatalogRequestPB* req,
2466
    CompactSysCatalogResponsePB* resp,
2467
0
    rpc::RpcContext* context) {
2468
0
  return PerformOnSysCatalogTablet(req, resp, [](auto shared_tablet) {
2469
0
    return shared_tablet->ForceFullRocksDBCompact();
2470
0
  });
2471
0
}
2472
2473
namespace {
2474
2475
Result<std::array<PartitionPB, kNumSplitParts>> CreateNewTabletsPartition(
2476
142
    const TabletInfo& tablet_info, const std::string& split_partition_key) {
2477
142
  const auto& source_partition = tablet_info.LockForRead()->pb.partition();
2478
2479
142
  if (split_partition_key <= source_partition.partition_key_start() ||
2480
142
      (!source_partition.partition_key_end().empty() &&
2481
142
       split_partition_key >= source_partition.partition_key_end())) {
2482
0
    return STATUS_FORMAT(
2483
0
        InvalidArgument,
2484
0
        "Can't split tablet $0 (partition_key_start: $1 partition_key_end: $2) by partition "
2485
0
        "boundary (split_key: $3)",
2486
0
        tablet_info.tablet_id(), source_partition.partition_key_start(),
2487
0
        source_partition.partition_key_end(), split_partition_key);
2488
0
  }
2489
2490
142
  std::array<PartitionPB, kNumSplitParts> new_tablets_partition;
2491
2492
142
  new_tablets_partition.fill(source_partition);
2493
2494
142
  new_tablets_partition[0].set_partition_key_end(split_partition_key);
2495
142
  new_tablets_partition[1].set_partition_key_start(split_partition_key);
2496
142
  static_assert(kNumSplitParts == 2, "We expect tablet to be split into 2 new tablets here");
2497
2498
142
  return new_tablets_partition;
2499
142
}
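The partition arithmetic above in isolation: the parent key range [start, end) is cut at split_key into [start, split_key) and [split_key, end), after validating that split_key lies strictly inside the parent range. A self-contained sketch with plain strings standing in for encoded partition keys:

#include <array>
#include <cassert>
#include <string>

struct KeyRange { std::string start, end; };  // empty end == unbounded

std::array<KeyRange, 2> SplitRange(const KeyRange& parent,
                                   const std::string& split_key) {
  assert(split_key > parent.start);
  assert(parent.end.empty() || split_key < parent.end);
  std::array<KeyRange, 2> children;
  children.fill(parent);          // mirrors new_tablets_partition.fill()
  children[0].end = split_key;    // left child:  [start, split_key)
  children[1].start = split_key;  // right child: [split_key, end)
  return children;
}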
2500
2501
}  // namespace
2502
2503
CHECKED_STATUS CatalogManager::TEST_SplitTablet(
2504
    const TabletId& tablet_id, const std::string& split_encoded_key,
2505
0
    const std::string& split_partition_key) {
2506
0
  auto source_tablet_info = VERIFY_RESULT(GetTabletInfo(tablet_id));
2507
0
  return DoSplitTablet(source_tablet_info, split_encoded_key, split_partition_key,
2508
0
      true /* select_all_tablets_for_split */);
2509
0
}
2510
2511
Status CatalogManager::TEST_SplitTablet(
2512
0
    const scoped_refptr<TabletInfo>& source_tablet_info, docdb::DocKeyHash split_hash_code) {
2513
0
  return DoSplitTablet(source_tablet_info, split_hash_code,
2514
0
      true /* select_all_tablets_for_split */);
2515
0
}
2516
2517
0
Status CatalogManager::TEST_IncrementTablePartitionListVersion(const TableId& table_id) {
2518
0
  auto table_info = GetTableInfo(table_id);
2519
0
  SCHECK(table_info != nullptr, NotFound, Format("Table $0 not found", table_id));
2520
2521
0
  LockGuard lock(mutex_);
2522
0
  auto table_lock = table_info->LockForWrite();
2523
0
  auto& table_pb = table_lock.mutable_data()->pb;
2524
0
  table_pb.set_partition_list_version(table_pb.partition_list_version() + 1);
2525
0
  RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), table_info));
2526
0
  table_lock.Commit();
2527
0
  return Status::OK();
2528
0
}
2529
2530
Result<ReplicationInfoPB> CatalogManager::GetTableReplicationInfo(
2531
640
    const TabletInfo& tablet_info) const {
2532
640
  auto table = tablet_info.table();
2533
640
  {
2534
640
    auto table_lock = table->LockForRead();
2535
640
    if (table_lock->pb.has_replication_info()) {
2536
0
      return table_lock->pb.replication_info();
2537
0
    }
2538
640
  }
2539
2540
640
  auto replication_info_opt = VERIFY_RESULT(
2541
640
      GetTablespaceManager()->GetTableReplicationInfo(table));
2542
640
  if (replication_info_opt) {
2543
0
    return replication_info_opt.value();
2544
0
  }
2545
2546
640
  return ClusterConfig()->LockForRead()->pb.replication_info();
2547
640
}
2548
2549
bool CatalogManager::ShouldSplitValidCandidate(
2550
3.49M
    const TabletInfo& tablet_info, const TabletReplicaDriveInfo& drive_info) const {
2551
3.49M
  if (drive_info.may_have_orphaned_post_split_data) {
2552
232k
    return false;
2553
232k
  }
2554
3.26M
  ssize_t size = drive_info.sst_files_size;
2555
3.26M
  DCHECK(size >= 0) << "Detected overflow in casting sst_files_size to signed int.";
2556
3.26M
  if (size < FLAGS_tablet_split_low_phase_size_threshold_bytes) {
2557
3.26M
    return false;
2558
3.26M
  }
2559
640
  TSDescriptorVector ts_descs = GetAllLiveNotBlacklistedTServers();
2560
2561
640
  size_t num_servers = 0;
2562
640
  auto table_replication_info_or_status = GetTableReplicationInfo(tablet_info);
2563
2564
  // If there is custom placement information present then
2565
  // only count the tservers which the table has access to
2566
  // according to the placement policy
2567
640
  if (table_replication_info_or_status.ok()
2568
640
      && table_replication_info_or_status->has_live_replicas()) {
2569
0
    auto pb = table_replication_info_or_status->live_replicas();
2570
0
    auto valid_tservers_res = FindTServersForPlacementInfo(
2571
0
      table_replication_info_or_status->live_replicas(), ts_descs);
2572
0
    if (!valid_tservers_res.ok()) {
2573
0
      num_servers = ts_descs.size();
2574
0
    } else {
2575
0
      num_servers = valid_tservers_res.get().size();
2576
0
    }
2577
640
  } else {
2578
640
    num_servers = ts_descs.size();
2579
640
  }
2580
2581
640
  int64 num_tablets_per_server = tablet_info.table()->NumPartitions() / num_servers;
2582
2583
640
  if (num_tablets_per_server < FLAGS_tablet_split_low_phase_shard_count_per_node) {
2584
0
    return size > FLAGS_tablet_split_low_phase_size_threshold_bytes;
2585
0
  }
2586
640
  if (num_tablets_per_server < FLAGS_tablet_split_high_phase_shard_count_per_node) {
2587
0
    return size > FLAGS_tablet_split_high_phase_size_threshold_bytes;
2588
0
  }
2589
640
  return size > FLAGS_tablet_force_split_threshold_bytes;
2590
640
}
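The phased thresholds above, restated as a pure function: the more tablets each server already carries, the larger a tablet's SST size must be before it is split again. The byte values below are invented placeholders; the real ones come from the FLAGS_tablet_split_* gflags.

#include <cstdint>

bool ShouldSplit(int64_t sst_bytes, int64_t tablets_per_server) {
  const int64_t kLowPhaseShardsPerNode = 8;       // placeholder
  const int64_t kHighPhaseShardsPerNode = 24;     // placeholder
  const int64_t kLowPhaseBytes = 512LL << 20;     // 512 MiB (placeholder)
  const int64_t kHighPhaseBytes = 10LL << 30;     // 10 GiB (placeholder)
  const int64_t kForceSplitBytes = 100LL << 30;   // 100 GiB (placeholder)

  if (tablets_per_server < kLowPhaseShardsPerNode) {
    return sst_bytes > kLowPhaseBytes;
  }
  if (tablets_per_server < kHighPhaseShardsPerNode) {
    return sst_bytes > kHighPhaseBytes;
  }
  return sst_bytes > kForceSplitBytes;
}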
2591
2592
Status CatalogManager::DoSplitTablet(
2593
    const scoped_refptr<TabletInfo>& source_tablet_info, std::string split_encoded_key,
2594
142
    std::string split_partition_key, bool select_all_tablets_for_split) {
2595
142
  auto source_table_lock = source_tablet_info->table()->LockForWrite();
2596
142
  auto source_tablet_lock = source_tablet_info->LockForWrite();
2597
2598
  // We must re-validate the split candidate here *after* grabbing locks on the table and tablet to
2599
  // ensure a backfill does not happen before we modify catalog metadata to include new subtablets.
2600
  // This process adds new subtablets in the CREATING state, which if encountered by backfill code
2601
  // will block the backfill process.
2602
142
  RETURN_NOT_OK(tablet_split_manager_.ValidateSplitCandidateTable(*source_tablet_info->table()));
2603
142
  RETURN_NOT_OK(tablet_split_manager_.ValidateSplitCandidateTablet(*source_tablet_info));
2604
2605
142
  auto drive_info = VERIFY_RESULT(source_tablet_info->GetLeaderReplicaDriveInfo());
2606
142
  if (!select_all_tablets_for_split &&
2607
142
      !ShouldSplitValidCandidate(*source_tablet_info, drive_info)) {
2608
    // It is possible that we queued up a split candidate in TabletSplitManager which was, at the
2609
    // time, a valid split candidate, but by the time the candidate was actually processed here, the
2610
    // cluster may have changed, putting us in a new split threshold phase, and it may no longer be
2611
    // a valid candidate. This is not an unexpected error, but we should bail out of splitting this
2612
    // tablet regardless.
2613
0
    return STATUS_FORMAT(
2614
0
        InvalidArgument,
2615
0
        "Tablet split candidate $0 is no longer a valid split candidate.",
2616
0
        source_tablet_info->tablet_id());
2617
0
  }
2618
2619
  // Check if at least one child tablet has already been registered.
2620
142
  if (source_tablet_lock->pb.split_tablet_ids().size() > 0) {
2621
98
    const auto child_tablet_id = source_tablet_lock->pb.split_tablet_ids(0);
2622
98
    const auto child_tablet = VERIFY_RESULT(GetTabletInfo(child_tablet_id));
2623
0
    const auto parent_partition = source_tablet_lock->pb.partition();
2624
98
    const auto child_partition = child_tablet->LockForRead()->pb.partition();
2625
2626
98
    if (parent_partition.partition_key_start() == child_partition.partition_key_start()) {
2627
98
      split_partition_key = child_partition.partition_key_end();
2628
98
    } else {
2629
0
      SCHECK_EQ(parent_partition.partition_key_end(), child_partition.partition_key_end(),
2630
0
        IllegalState, "Parent partion key end does not equal child partition key end");
2631
0
      split_partition_key = child_partition.partition_key_start();
2632
0
    }
2633
2634
    // Re-compute the encoded key
2635
    // to ensure we use the same partition boundary for both child tablets
2636
98
    split_encoded_key = PartitionSchema::GetEncodedKeyPrefix(
2637
98
      split_partition_key, source_table_lock->pb.partition_schema());
2638
98
  }
2639
2640
142
  LOG(INFO) << "Starting tablet split: " << source_tablet_info->ToString()
2641
142
            << " by partition key: " << Slice(split_partition_key).ToDebugHexString();
2642
2643
142
  std::array<PartitionPB, kNumSplitParts> new_tablets_partition = VERIFY_RESULT(
2644
142
      CreateNewTabletsPartition(*source_tablet_info, split_partition_key));
2645
2646
0
  std::array<TabletId, kNumSplitParts> new_tablet_ids;
2647
424
  for (int i = 0; i < kNumSplitParts; ++i) {
2648
282
    if (i < source_tablet_lock->pb.split_tablet_ids_size()) {
2649
      // Post-split tablet `i` has been already registered.
2650
194
      new_tablet_ids[i] = source_tablet_lock->pb.split_tablet_ids(i);
2651
194
    } else {
2652
88
      auto new_tablet_info = VERIFY_RESULT(RegisterNewTabletForSplit(
2653
88
          source_tablet_info.get(), new_tablets_partition[i],
2654
88
          &source_table_lock, &source_tablet_lock));
2655
2656
0
      new_tablet_ids[i] = new_tablet_info->id();
2657
88
    }
2658
282
  }
2659
142
  source_tablet_lock.Commit();
2660
142
  source_table_lock.Commit();
2661
2662
  // TODO(tsplit): what if source tablet will be deleted before or during TS leader is processing
2663
  // split? Add unit-test.
2664
142
  RETURN_NOT_OK(SendSplitTabletRequest(
2665
142
      source_tablet_info, new_tablet_ids, split_encoded_key, split_partition_key));
2666
2667
142
  return Status::OK();
2668
142
}
2669
2670
Status CatalogManager::DoSplitTablet(
2671
    const scoped_refptr<TabletInfo>& source_tablet_info, docdb::DocKeyHash split_hash_code,
2672
0
    bool select_all_tablets_for_split) {
2673
0
  docdb::KeyBytes split_encoded_key;
2674
0
  docdb::DocKeyEncoderAfterTableIdStep(&split_encoded_key)
2675
0
      .Hash(split_hash_code, std::vector<docdb::PrimitiveValue>());
2676
2677
0
  const auto split_partition_key = PartitionSchema::EncodeMultiColumnHashValue(split_hash_code);
2678
2679
0
  return DoSplitTablet(source_tablet_info, split_encoded_key.ToStringBuffer(), split_partition_key,
2680
0
      select_all_tablets_for_split);
2681
0
}
2682
2683
247
Result<scoped_refptr<TabletInfo>> CatalogManager::GetTabletInfo(const TabletId& tablet_id) {
2684
247
  LockGuard lock(mutex_);
2685
247
  TRACE("Acquired catalog manager lock");
2686
2687
247
  const auto tablet_info = FindPtrOrNull(*tablet_map_, tablet_id);
2688
247
  SCHECK(tablet_info != nullptr, NotFound, Format("Tablet $0 not found", tablet_id));
2689
2690
247
  return tablet_info;
2691
247
}
2692
2693
void CatalogManager::SplitTabletWithKey(
2694
    const scoped_refptr<TabletInfo>& tablet, const std::string& split_encoded_key,
2695
142
    const std::string& split_partition_key, const bool select_all_tablets_for_split) {
2696
  // Note that DoSplitTablet() will trigger an async SplitTablet task, and will only return not OK()
2697
  // if it failed to submit that task. In other words, any failures here are not retriable, and
2698
  // success indicates that an async and automatically retrying task was submitted.
2699
142
  auto s = DoSplitTablet(
2700
142
      tablet, split_encoded_key, split_partition_key, select_all_tablets_for_split);
2701
142
  WARN_NOT_OK(s, Format("Failed to split tablet with GetSplitKey result for tablet: $0",
2702
142
                        tablet->tablet_id()));
2703
142
}
2704
2705
143
Status CatalogManager::SplitTablet(const TabletId& tablet_id, bool select_all_tablets_for_split) {
2706
143
  LOG(INFO) << "Got tablet to split: " << tablet_id;
2707
2708
143
  const auto tablet = VERIFY_RESULT(GetTabletInfo(tablet_id));
2709
2710
0
  VLOG(2) << "Scheduling GetSplitKey request to leader tserver for source tablet ID: "
2711
0
          << tablet->tablet_id();
2712
143
  auto call = std::make_shared<AsyncGetTabletSplitKey>(
2713
143
      master_, AsyncTaskPool(), tablet,
2714
143
      [this, tablet, select_all_tablets_for_split]
2715
143
          (const Result<AsyncGetTabletSplitKey::Data>& result) {
2716
143
        if (result.ok()) {
2717
142
          SplitTabletWithKey(tablet, result->split_encoded_key, result->split_partition_key,
2718
142
              select_all_tablets_for_split);
2719
142
        } else if (tserver::TabletServerError(result.status()) ==
2720
1
            tserver::TabletServerErrorPB::TABLET_SPLIT_DISABLED_TTL_EXPIRY) {
2721
0
          tablet_split_manager()->MarkTtlTableForSplitIgnore(tablet->table()->id());
2722
0
          LOG(INFO) << "AsyncGetTabletSplitKey task failed for tablet " << tablet->tablet_id()
2723
0
              << ". Tablet split not supported for tablets with TTL file expiration.";
2724
1
        } else {
2725
1
          LOG(WARNING) << "AsyncGetTabletSplitKey task failed with status: " << result.status();
2726
1
        }
2727
143
      });
2728
143
  tablet->table()->AddTask(call);
2729
143
  return ScheduleTask(call);
2730
143
}
2731
2732
Status CatalogManager::SplitTablet(
2733
10
    const SplitTabletRequestPB* req, SplitTabletResponsePB* resp, rpc::RpcContext* rpc) {
2734
10
  const auto source_tablet_id = req->tablet_id();
2735
10
  return SplitTablet(source_tablet_id, true /* select_all_tablets_for_split */);
2736
10
}
2737
2738
Status CatalogManager::DeleteNotServingTablet(
2739
    const DeleteNotServingTabletRequestPB* req, DeleteNotServingTabletResponsePB* resp,
2740
6
    rpc::RpcContext* rpc) {
2741
6
  const auto& tablet_id = req->tablet_id();
2742
6
  const auto tablet_info = VERIFY_RESULT(GetTabletInfo(tablet_id));
2743
2744
6
  if (PREDICT_FALSE(FLAGS_TEST_reject_delete_not_serving_tablet_rpc)) {
2745
0
    TEST_SYNC_POINT("CatalogManager::DeleteNotServingTablet:Reject");
2746
0
    return STATUS(
2747
0
        InvalidArgument, "Rejecting due to FLAGS_TEST_reject_delete_not_serving_tablet_rpc");
2748
0
  }
2749
2750
6
  const auto& table_info = tablet_info->table();
2751
2752
6
  RETURN_NOT_OK(CheckIfForbiddenToDeleteTabletOf(table_info));
2753
2754
6
  RETURN_NOT_OK(CatalogManagerUtil::CheckIfCanDeleteSingleTablet(tablet_info));
2755
2756
6
  auto schedules_to_tables_map = VERIFY_RESULT(
2757
6
      MakeSnapshotSchedulesToObjectIdsMap(SysRowEntryType::TABLE));
2758
0
  RepeatedBytes retained_by_snapshot_schedules;
2759
6
  FillRetainedBySnapshotSchedules(
2760
6
      schedules_to_tables_map, table_info->id(), &retained_by_snapshot_schedules);
2761
2762
6
  return DeleteTabletListAndSendRequests(
2763
6
      { tablet_info }, "Not serving tablet deleted upon request at " + LocalTimeAsString(),
2764
6
      retained_by_snapshot_schedules);
2765
6
}
2766
2767
Status CatalogManager::DdlLog(
2768
1
    const DdlLogRequestPB* req, DdlLogResponsePB* resp, rpc::RpcContext* rpc) {
2769
1
  return sys_catalog_->FetchDdlLog(resp->mutable_entries());
2770
1
}
2771
2772
namespace {
2773
2774
21.8k
CHECKED_STATUS ValidateCreateTableSchema(const Schema& schema, CreateTableResponsePB* resp) {
2775
21.8k
  if (schema.num_key_columns() <= 0) {
2776
0
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA,
2777
0
                      STATUS(InvalidArgument, "Must specify at least one key column"));
2778
0
  }
2779
61.3k
  for (size_t i = 0; i < schema.num_key_columns(); i++) {
2780
39.5k
    if (!IsTypeAllowableInKey(schema.column(i).type_info())) {
2781
0
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA,
2782
0
                        STATUS(InvalidArgument, "Invalid datatype for primary key column"));
2783
0
    }
2784
39.5k
  }
2785
21.8k
  return Status::OK();
2786
21.8k
}
2787
2788
// Extract a colocation ID from request if explicitly passed, or generate a new valid one.
2789
// Will error if requested ID is taken or invalid.
2790
template<typename ContainsColocationIdFn>
2791
Result<ColocationId> ConceiveColocationId(const CreateTableRequestPB& req,
2792
                                          CreateTableResponsePB* resp,
2793
129
                                          ContainsColocationIdFn contains_colocation_id) {
2794
129
  ColocationId colocation_id;
2795
2796
129
  if (req.has_colocation_id()) {
2797
24
    colocation_id = req.colocation_id();
2798
24
    if (colocation_id < kFirstNormalColocationId) {
2799
0
      Status s = STATUS_SUBSTITUTE(InvalidArgument,
2800
0
                                   "Colocation ID cannot be less than $0",
2801
0
                                   kFirstNormalColocationId);
2802
0
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
2803
0
    }
2804
24
    if (contains_colocation_id(colocation_id)) {
2805
2
      Status s =
2806
2
          STATUS_SUBSTITUTE(InvalidArgument,
2807
2
                            "Colocation group already contains a table with colocation ID $0",
2808
2
                            colocation_id);
2809
2
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
2810
2
    }
2811
105
  } else {
2812
    // Generate a random colocation ID unique within colocation group.
2813
105
    colocation_id = 20000; // In agreement with sequential_colocation_ids flag.
2814
125
    do {
2815
125
      if (PREDICT_FALSE(FLAGS_TEST_sequential_colocation_ids)) {
2816
40
        colocation_id++;
2817
85
      } else {
2818
        // See comment on kFirstNormalColocationId.
2819
85
        colocation_id =
2820
85
            RandomUniformInt<ColocationId>(kFirstNormalColocationId,
2821
85
                                           std::numeric_limits<ColocationId>::max());
2822
85
      }
2823
125
    } while (contains_colocation_id(colocation_id));
2824
105
  }
2825
2826
127
  return colocation_id;
2827
129
}
catalog_manager.cc:yb::Result<unsigned int> yb::master::(anonymous namespace)::ConceiveColocationId<yb::master::CatalogManager::CreateTable(yb::master::CreateTableRequestPB const*, yb::master::CreateTableResponsePB*, yb::rpc::RpcContext*)::$_7>(yb::master::CreateTableRequestPB const&, yb::master::CreateTableResponsePB*, yb::master::CatalogManager::CreateTable(yb::master::CreateTableRequestPB const*, yb::master::CreateTableResponsePB*, yb::rpc::RpcContext*)::$_7)
Line
Count
Source
2793
89
                                          ContainsColocationIdFn contains_colocation_id) {
2794
89
  ColocationId colocation_id;
2795
2796
89
  if (req.has_colocation_id()) {
2797
21
    colocation_id = req.colocation_id();
2798
21
    if (colocation_id < kFirstNormalColocationId) {
2799
0
      Status s = STATUS_SUBSTITUTE(InvalidArgument,
2800
0
                                   "Colocation ID cannot be less than $0",
2801
0
                                   kFirstNormalColocationId);
2802
0
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
2803
0
    }
2804
21
    if (contains_colocation_id(colocation_id)) {
2805
2
      Status s =
2806
2
          STATUS_SUBSTITUTE(InvalidArgument,
2807
2
                            "Colocation group already contains a table with colocation ID $0",
2808
2
                            colocation_id);
2809
2
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
2810
2
    }
2811
68
  } else {
2812
    // Generate a random colocation ID unique within colocation group.
2813
68
    colocation_id = 20000; // In agreement with sequential_colocation_ids flag.
2814
86
    do {
2815
86
      if (PREDICT_FALSE(FLAGS_TEST_sequential_colocation_ids)) {
2816
33
        colocation_id++;
2817
53
      } else {
2818
        // See comment on kFirstNormalColocationId.
2819
53
        colocation_id =
2820
53
            RandomUniformInt<ColocationId>(kFirstNormalColocationId,
2821
53
                                           std::numeric_limits<ColocationId>::max());
2822
53
      }
2823
86
    } while (contains_colocation_id(colocation_id));
2824
68
  }
2825
2826
87
  return colocation_id;
2827
89
}
catalog_manager.cc:yb::Result<unsigned int> yb::master::(anonymous namespace)::ConceiveColocationId<yb::master::CatalogManager::CreateTable(yb::master::CreateTableRequestPB const*, yb::master::CreateTableResponsePB*, yb::rpc::RpcContext*)::$_8>(yb::master::CreateTableRequestPB const&, yb::master::CreateTableResponsePB*, yb::master::CatalogManager::CreateTable(yb::master::CreateTableRequestPB const*, yb::master::CreateTableResponsePB*, yb::rpc::RpcContext*)::$_8)
Line
Count
Source
2793
40
                                          ContainsColocationIdFn contains_colocation_id) {
2794
40
  ColocationId colocation_id;
2795
2796
40
  if (req.has_colocation_id()) {
2797
3
    colocation_id = req.colocation_id();
2798
3
    if (colocation_id < kFirstNormalColocationId) {
2799
0
      Status s = STATUS_SUBSTITUTE(InvalidArgument,
2800
0
                                   "Colocation ID cannot be less than $0",
2801
0
                                   kFirstNormalColocationId);
2802
0
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
2803
0
    }
2804
3
    if (contains_colocation_id(colocation_id)) {
2805
0
      Status s =
2806
0
          STATUS_SUBSTITUTE(InvalidArgument,
2807
0
                            "Colocation group already contains a table with colocation ID $0",
2808
0
                            colocation_id);
2809
0
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
2810
0
    }
2811
37
  } else {
2812
    // Generate a random colocation ID unique within colocation group.
2813
37
    colocation_id = 20000; // In agreement with sequential_colocation_ids flag.
2814
39
    do {
2815
39
      if (PREDICT_FALSE(FLAGS_TEST_sequential_colocation_ids)) {
2816
7
        colocation_id++;
2817
32
      } else {
2818
        // See comment on kFirstNormalColocationId.
2819
32
        colocation_id =
2820
32
            RandomUniformInt<ColocationId>(kFirstNormalColocationId,
2821
32
                                           std::numeric_limits<ColocationId>::max());
2822
32
      }
2823
39
    } while (contains_colocation_id(colocation_id));
2824
37
  }
2825
2826
40
  return colocation_id;
2827
40
}
2828
2829
}  // namespace
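A sketch of ConceiveColocationId()'s contract: honor an explicitly requested ID when it is valid and unused, otherwise draw random IDs until one misses the "already taken" predicate. The constant and all names below are invented for the example.

#include <cstdint>
#include <limits>
#include <optional>
#include <random>
#include <stdexcept>
#include <unordered_set>

constexpr uint32_t kFirstNormalId = 20001;  // placeholder lower bound

uint32_t ConceiveId(std::optional<uint32_t> requested,
                    const std::unordered_set<uint32_t>& taken) {
  if (requested) {
    if (*requested < kFirstNormalId) {
      throw std::invalid_argument("id below the first normal id");
    }
    if (taken.count(*requested)) {
      throw std::invalid_argument("id already taken in this group");
    }
    return *requested;
  }
  static std::mt19937 rng{std::random_device{}()};
  std::uniform_int_distribution<uint32_t> dist(
      kFirstNormalId, std::numeric_limits<uint32_t>::max());
  uint32_t id;
  do { id = dist(rng); } while (taken.count(id));  // retry on collision
  return id;
}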
2830
2831
Status CatalogManager::CreateYsqlSysTable(const CreateTableRequestPB* req,
2832
13.3k
                                          CreateTableResponsePB* resp) {
2833
13.3k
  LOG(INFO) << "CreateYsqlSysTable: " << req->name();
2834
  // Lookup the namespace and verify if it exists.
2835
13.3k
  TRACE("Looking up namespace");
2836
13.3k
  auto ns = VERIFY_RESULT(FindNamespace(req->namespace_()));
2837
0
  const NamespaceId& namespace_id = ns->id();
2838
13.3k
  const NamespaceName& namespace_name = ns->name();
2839
2840
13.3k
  Schema schema;
2841
13.3k
  RETURN_NOT_OK(SchemaFromPB(req->schema(), &schema));
2842
  // If the schema contains column ids, we are copying a Postgres table from one namespace to
2843
  // another. In either case, validate the schema.
2844
13.3k
  RETURN_NOT_OK(ValidateCreateTableSchema(schema, resp));
2845
13.3k
  if (!schema.has_column_ids()) {
2846
313
    schema.InitColumnIdsByDefault();
2847
313
  }
2848
13.3k
  schema.mutable_table_properties()->set_is_ysql_catalog_table(true);
2849
2850
  // Verify no hash partition schema is specified.
2851
13.3k
  if (req->partition_schema().has_hash_schema()) {
2852
0
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA,
2853
0
                      STATUS(InvalidArgument,
2854
0
                             "PostgreSQL system catalog tables are non-partitioned"));
2855
0
  }
2856
2857
13.3k
  if (req->table_type() != TableType::PGSQL_TABLE_TYPE) {
2858
0
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA,
2859
0
                      STATUS_FORMAT(
2860
0
                          InvalidArgument,
2861
0
                          "Expected table type to be PGSQL_TABLE_TYPE ($0), got $1 ($2)",
2862
0
                          PGSQL_TABLE_TYPE,
2863
0
                          TableType_Name(req->table_type())));
2864
2865
0
  }
2866
2867
  // Create partition schema and one partition.
2868
13.3k
  PartitionSchema partition_schema;
2869
13.3k
  vector<Partition> partitions;
2870
13.3k
  RETURN_NOT_OK(partition_schema.CreatePartitions(1, &partitions));
2871
2872
  // Create table info in memory.
2873
13.3k
  scoped_refptr<TableInfo> table;
2874
13.3k
  scoped_refptr<TabletInfo> sys_catalog_tablet;
2875
13.3k
  {
2876
13.3k
    LockGuard lock(mutex_);
2877
13.3k
    TRACE("Acquired catalog manager lock");
2878
2879
    // Verify that the table does not exist, or has been deleted.
2880
13.3k
    table = FindPtrOrNull(*table_ids_map_, req->table_id());
2881
13.3k
    if (table != nullptr && !table->is_deleted()) {
2882
0
      Status s = STATUS_SUBSTITUTE(AlreadyPresent,
2883
0
          "YSQL table '$0.$1' (ID: $2) already exists", ns->name(), table->name(), table->id());
2884
0
      LOG(WARNING) << "Found table: " << table->ToStringWithState()
2885
0
                   << ". Failed creating YSQL system table with error: "
2886
0
                   << s.ToString() << " Request:\n" << req->DebugString();
2887
      // Technically, client already knows table ID, but we set it anyway for unified handling of
2888
      // AlreadyPresent errors. See comment in CreateTable()
2889
0
      resp->set_table_id(table->id());
2890
0
      return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s);
2891
0
    }
2892
2893
13.3k
    RETURN_NOT_OK(CreateTableInMemory(
2894
13.3k
        *req, schema, partition_schema, namespace_id, namespace_name,
2895
13.3k
        partitions, nullptr /* index_info */, nullptr /* tablets */, resp, &table));
2896
2897
13.3k
    sys_catalog_tablet = tablet_map_->find(kSysCatalogTabletId)->second;
2898
13.3k
  }
2899
2900
  // Tables with a transaction should be rolled back if the transaction does not get committed.
2901
  // Store this on the table persistent state until the transaction has been a verified success.
2902
0
  TransactionMetadata txn;
2903
13.3k
  if (req->has_transaction() && FLAGS_enable_transactional_ddl_gc) {
2904
55
    table->mutable_metadata()->mutable_dirty()->pb.mutable_transaction()->
2905
55
        CopyFrom(req->transaction());
2906
55
    txn = VERIFY_RESULT(TransactionMetadata::FromPB(req->transaction()));
2907
55
    RSTATUS_DCHECK(!txn.status_tablet.empty(), Corruption, "Given incomplete Transaction");
2908
55
  }
2909
2910
13.3k
  {
2911
13.3k
    auto tablet_lock = sys_catalog_tablet->LockForWrite();
2912
13.3k
    tablet_lock.mutable_data()->pb.add_table_ids(table->id());
2913
2914
13.3k
    Status s = sys_catalog_->Upsert(leader_ready_term(), sys_catalog_tablet);
2915
13.3k
    if (PREDICT_FALSE(!s.ok())) {
2916
1
      return AbortTableCreation(table.get(), {}, s.CloneAndPrepend(
2917
1
        "An error occurred while inserting to sys-tablets: "), resp);
2918
1
    }
2919
13.3k
    table->set_is_system();
2920
13.3k
    table->AddTablet(sys_catalog_tablet.get());
2921
13.3k
    tablet_lock.Commit();
2922
13.3k
  }
2923
13.3k
  TRACE("Inserted new table info into CatalogManager maps");
2924
2925
  // Update the on-disk table state to "running".
2926
13.3k
  table->mutable_metadata()->mutable_dirty()->pb.set_state(SysTablesEntryPB::RUNNING);
2927
13.3k
  Status s = sys_catalog_->Upsert(leader_ready_term(), table);
2928
13.3k
  if (PREDICT_FALSE(!s.ok())) {
2929
0
    return AbortTableCreation(table.get(), {}, s.CloneAndPrepend(
2930
0
      "An error occurred while inserting to sys-tablets: "), resp);
2931
0
  }
2932
13.3k
  TRACE("Wrote table to system table");
2933
2934
  // Commit the in-memory state.
2935
13.3k
  table->mutable_metadata()->CommitMutation();
2936
2937
  // Verify Transaction gets committed, which occurs after table create finishes.
2938
13.3k
  if (req->has_transaction() && PREDICT_TRUE(FLAGS_enable_transactional_ddl_gc)) {
2939
55
    LOG(INFO) << "Enqueuing table for Transaction Verification: " << req->name();
2940
55
    std::function<Status(bool)> when_done =
2941
55
        std::bind(&CatalogManager::VerifyTablePgLayer, this, table, _1);
2942
55
    WARN_NOT_OK(background_tasks_thread_pool_->SubmitFunc(
2943
55
        std::bind(&YsqlTransactionDdl::VerifyTransaction, ysql_transaction_.get(), txn, when_done)),
2944
55
                "Could not submit VerifyTransaction to thread pool");
2945
55
  }
2946
2947
13.3k
  tablet::ChangeMetadataRequestPB change_req;
2948
13.3k
  change_req.set_tablet_id(kSysCatalogTabletId);
2949
13.3k
  auto& add_table = *change_req.mutable_add_table();
2950
2951
13.3k
  add_table.set_table_id(req->table_id());
2952
13.3k
  add_table.set_table_type(TableType::PGSQL_TABLE_TYPE);
2953
13.3k
  add_table.set_table_name(req->name());
2954
13.3k
  SchemaToPB(schema, add_table.mutable_schema());
2955
13.3k
  add_table.set_schema_version(0);
2956
2957
13.3k
  partition_schema.ToPB(add_table.mutable_partition_schema());
2958
2959
13.3k
  RETURN_NOT_OK(tablet::SyncReplicateChangeMetadataOperation(
2960
13.3k
      &change_req, sys_catalog_->tablet_peer().get(), leader_ready_term()));
2961
2962
13.3k
  if (initial_snapshot_writer_) {
2963
1.12k
    initial_snapshot_writer_->AddMetadataChange(change_req);
2964
1.12k
  }
2965
13.3k
  return Status::OK();
2966
13.3k
}
2967
2968
Status CatalogManager::ReservePgsqlOids(const ReservePgsqlOidsRequestPB* req,
2969
                                        ReservePgsqlOidsResponsePB* resp,
2970
805
                                        rpc::RpcContext* rpc) {
2971
805
  VLOG(1) << "ReservePgsqlOids request: " << req->ShortDebugString();
2972
2973
  // Lookup namespace
2974
805
  scoped_refptr<NamespaceInfo> ns;
2975
805
  {
2976
805
    SharedLock lock(mutex_);
2977
805
    ns = FindPtrOrNull(namespace_ids_map_, req->namespace_id());
2978
805
  }
2979
805
  if (!ns) {
2980
0
    return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND,
2981
0
                      STATUS(NotFound, "Namespace not found", req->namespace_id()));
2982
0
  }
2983
2984
  // Reserve oids.
2985
805
  auto l = ns->LockForWrite();
2986
2987
805
  uint32_t begin_oid = l->pb.next_pg_oid();
2988
805
  if (begin_oid < req->next_oid()) {
2989
709
    begin_oid = req->next_oid();
2990
709
  }
2991
805
  if (begin_oid == std::numeric_limits<uint32_t>::max()) {
2992
0
    LOG(WARNING) << Format("No more object identifier is available for Postgres database $0 ($1)",
2993
0
                           l->pb.name(), req->namespace_id());
2994
0
    return SetupError(resp->mutable_error(), MasterErrorPB::UNKNOWN_ERROR,
2995
0
                      STATUS(InvalidArgument, "No more object identifier is available"));
2996
0
  }
2997
2998
805
  uint32_t end_oid = begin_oid + req->count();
2999
805
  if (end_oid < begin_oid) {
3000
0
    end_oid = std::numeric_limits<uint32_t>::max(); // Handle wraparound.
3001
0
  }
3002
3003
805
  resp->set_begin_oid(begin_oid);
3004
805
  resp->set_end_oid(end_oid);
3005
805
  l.mutable_data()->pb.set_next_pg_oid(end_oid);
3006
3007
  // Update the on-disk state.
3008
805
  const Status s = sys_catalog_->Upsert(leader_ready_term(), ns);
3009
805
  if (!s.ok()) {
3010
0
    return SetupError(resp->mutable_error(), MasterErrorPB::UNKNOWN_ERROR, s);
3011
0
  }
3012
3013
  // Commit the in-memory state.
3014
805
  l.Commit();
3015
3016
805
  VLOG(1) << "ReservePgsqlOids response: " << resp->ShortDebugString();
3017
3018
805
  return Status::OK();
3019
805
}
3020
3021
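The OID reservation above has two boundary cases worth spelling out: the persisted counter may already sit at the uint32 maximum (the database has exhausted its OID space and the request must fail), and begin + count may wrap around, in which case the range is clamped. Below is a minimal standalone sketch of the same clamp-and-advance arithmetic; OidRange and ReserveRange are illustrative names, not YugabyteDB APIs, and the persistence step (sys_catalog_->Upsert) is elided.

    #include <algorithm>
    #include <cstdint>
    #include <limits>

    struct OidRange { uint32_t begin; uint32_t end; };  // Half-open: [begin, end).

    // next_oid is the client's minimum acceptable OID; count is the batch size.
    OidRange ReserveRange(uint32_t* next_pg_oid, uint32_t next_oid, uint32_t count) {
      uint32_t begin = std::max(*next_pg_oid, next_oid);
      uint32_t end = begin + count;
      if (end < begin) {
        // Addition wrapped around uint32: clamp, as the handler above does.
        end = std::numeric_limits<uint32_t>::max();
      }
      *next_pg_oid = end;  // The real handler persists this new counter value.
      return {begin, end};
    }

For example, ReserveRange with *next_pg_oid = 16384, next_oid = 20000, count = 256 hands out [20000, 20256) and advances the stored counter to 20256.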
Status CatalogManager::GetYsqlCatalogConfig(const GetYsqlCatalogConfigRequestPB* req,
3022
                                            GetYsqlCatalogConfigResponsePB* resp,
3023
22
                                            rpc::RpcContext* rpc) {
3024
22
  VLOG(1) << "GetYsqlCatalogConfig request: " << req->ShortDebugString();
3025
22
  auto l = CHECK_NOTNULL(ysql_catalog_config_.get())->LockForRead();
3026
22
  resp->set_version(l->pb.ysql_catalog_config().version());
3027
3028
22
  return Status::OK();
3029
22
}
3030
3031
Status CatalogManager::CopyPgsqlSysTables(const NamespaceId& namespace_id,
3032
123
                                          const std::vector<scoped_refptr<TableInfo>>& tables) {
3033
123
  const uint32_t database_oid = CHECK_RESULT(GetPgsqlDatabaseOid(namespace_id));
3034
123
  vector<TableId> source_table_ids;
3035
123
  vector<TableId> target_table_ids;
3036
15.4k
  for (const auto& table : tables) {
3037
15.4k
    CreateTableRequestPB table_req;
3038
15.4k
    CreateTableResponsePB table_resp;
3039
3040
15.4k
    const uint32_t table_oid = VERIFY_RESULT(GetPgsqlTableOid(table->id()));
3041
0
    const TableId table_id = GetPgsqlTableId(database_oid, table_oid);
3042
3043
    // Hold read lock until rows from the table are copied also.
3044
15.4k
    auto l = table->LockForRead();
3045
3046
    // Skip shared table.
3047
15.4k
    if (l->pb.is_pg_shared_table()) {
3048
2.37k
      continue;
3049
2.37k
    }
3050
3051
13.0k
    table_req.set_name(l->pb.name());
3052
13.0k
    table_req.mutable_namespace_()->set_id(namespace_id);
3053
13.0k
    table_req.set_table_type(PGSQL_TABLE_TYPE);
3054
13.0k
    table_req.mutable_schema()->CopyFrom(l->schema());
3055
13.0k
    table_req.set_is_pg_catalog_table(true);
3056
13.0k
    table_req.set_table_id(table_id);
3057
3058
13.0k
    if (IsIndex(l->pb)) {
3059
5.80k
      const uint32_t indexed_table_oid =
3060
5.80k
        VERIFY_RESULT(GetPgsqlTableOid(GetIndexedTableId(l->pb)));
3061
0
      const TableId indexed_table_id = GetPgsqlTableId(database_oid, indexed_table_oid);
3062
3063
      // Set index_info.
3064
      // Previously created INDEX wouldn't have the attribute index_info.
3065
5.80k
      if (l->pb.has_index_info()) {
3066
5.80k
        table_req.mutable_index_info()->CopyFrom(l->pb.index_info());
3067
5.80k
        table_req.mutable_index_info()->set_indexed_table_id(indexed_table_id);
3068
5.80k
      }
3069
3070
      // Set deprecated field for index_info.
3071
5.80k
      table_req.set_indexed_table_id(indexed_table_id);
3072
5.80k
      table_req.set_is_local_index(PROTO_GET_IS_LOCAL(l->pb));
3073
5.80k
      table_req.set_is_unique_index(PROTO_GET_IS_UNIQUE(l->pb));
3074
5.80k
    }
3075
3076
13.0k
    auto s = CreateYsqlSysTable(&table_req, &table_resp);
3077
13.0k
    if (!s.ok()) {
3078
1
      return s.CloneAndPrepend(Substitute(
3079
1
          "Failure when creating PGSQL System Tables: $0", table_resp.error().ShortDebugString()));
3080
1
    }
3081
3082
13.0k
    source_table_ids.push_back(table->id());
3083
13.0k
    target_table_ids.push_back(table_id);
3084
13.0k
  }
3085
122
  RETURN_NOT_OK(
3086
122
      sys_catalog_->CopyPgsqlTables(source_table_ids, target_table_ids, leader_ready_term()));
3087
122
  return Status::OK();
3088
122
}
3089
3090
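CopyPgsqlSysTables re-homes each template catalog table by keeping its table OID and substituting the target database's OID, while cluster-wide shared tables are skipped. The sketch below shows only that id bookkeeping; the "<database_oid>.<table_oid>" string encoding is a stand-in for illustration, not the actual id layout produced by GetPgsqlTableId.

    #include <cstdint>
    #include <string>
    #include <utility>
    #include <vector>

    // Illustrative encoding only: "<database_oid>.<table_oid>".
    std::string MakeId(uint32_t database_oid, uint32_t table_oid) {
      return std::to_string(database_oid) + "." + std::to_string(table_oid);
    }

    uint32_t TableOidOf(const std::string& id) {
      return static_cast<uint32_t>(std::stoul(id.substr(id.find('.') + 1)));
    }

    // Pair each source (template) table id with its id in the target database,
    // as the copy loop's source_table_ids/target_table_ids vectors do.
    std::vector<std::pair<std::string, std::string>> MapCatalogIds(
        const std::vector<std::string>& source_ids, uint32_t target_database_oid) {
      std::vector<std::pair<std::string, std::string>> mapping;
      for (const auto& src : source_ids) {
        mapping.emplace_back(src, MakeId(target_database_oid, TableOidOf(src)));
      }
      return mapping;
    }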
1.35k
size_t CatalogManager::GetNumLiveTServersForPlacement(const PlacementId& placement_id) {
3091
1.35k
  auto blacklist = BlacklistSetFromPB();
3092
1.35k
  TSDescriptorVector ts_descs;
3093
1.35k
  master_->ts_manager()->GetAllLiveDescriptorsInCluster(
3094
1.35k
      &ts_descs, placement_id, (blacklist.ok() ? *blacklist : BlacklistSet()));
3095
1.35k
  return ts_descs.size();
3096
1.35k
}
3097
3098
196k
TSDescriptorVector CatalogManager::GetAllLiveNotBlacklistedTServers() const {
3099
196k
  TSDescriptorVector ts_descs;
3100
196k
  auto blacklist = BlacklistSetFromPB();
3101
196k
  master_->ts_manager()->GetAllLiveDescriptors(
3102
196k
      &ts_descs, blacklist.ok() ? *blacklist : BlacklistSet());
3103
196k
  return ts_descs;
3104
196k
}
3105
3106
namespace {
3107
3108
429k
size_t GetNumReplicasFromPlacementInfo(const PlacementInfoPB& placement_info) {
3109
429k
  return placement_info.num_replicas() > 0 ?
3110
417k
      placement_info.num_replicas() : FLAGS_replication_factor;
3111
429k
}
3112
3113
Status CheckNumReplicas(const PlacementInfoPB& placement_info,
3114
                        const TSDescriptorVector& ts_descs,
3115
                        const vector<Partition>& partitions,
3116
8.44k
                        CreateTableResponsePB* resp) {
3117
8.44k
  auto max_tablets = FLAGS_max_create_tablets_per_ts * ts_descs.size();
3118
8.44k
  auto num_replicas = GetNumReplicasFromPlacementInfo(placement_info);
3119
8.44k
  if (num_replicas > 1 && max_tablets > 0 && partitions.size() > max_tablets) {
3120
0
    std::string msg = Substitute("The requested number of tablets ($0) is over the permitted "
3121
0
                                 "maximum ($1)", partitions.size(), max_tablets);
3122
0
    Status s = STATUS(InvalidArgument, msg);
3123
0
    LOG(WARNING) << msg;
3124
0
    return SetupError(resp->mutable_error(), MasterErrorPB::TOO_MANY_TABLETS, s);
3125
0
  }
3126
3127
8.44k
  return Status::OK();
3128
8.44k
}
3129
3130
} // namespace
3131
3132
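To make the CheckNumReplicas guard concrete: with FLAGS_max_create_tablets_per_ts = 50 and three live tservers, max_tablets is 150, so a request carving out 200 partitions is rejected with TOO_MANY_TABLETS, while replication-factor-1 tables are exempt. A minimal sketch of the same predicate, with hypothetical parameter names:

    #include <cstddef>

    // True when a create request must be rejected for asking for too many tablets.
    bool ExceedsTabletCap(size_t num_partitions, size_t num_replicas,
                          size_t num_live_tservers, size_t max_create_tablets_per_ts) {
      const size_t max_tablets = max_create_tablets_per_ts * num_live_tservers;
      return num_replicas > 1 && max_tablets > 0 && num_partitions > max_tablets;
    }

    // ExceedsTabletCap(200, 3, 3, 50) == true; ExceedsTabletCap(200, 1, 3, 50) == false.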
// Create a new table.
3133
// See README file in this directory for a description of the design.
3134
Status CatalogManager::CreateTable(const CreateTableRequestPB* orig_req,
3135
                                   CreateTableResponsePB* resp,
3136
8.77k
                                   rpc::RpcContext* rpc) {
3137
8.77k
  DVLOG(3) << __PRETTY_FUNCTION__ << " Begin. " << orig_req->DebugString();
3138
3139
8.77k
  const bool is_pg_table = orig_req->table_type() == PGSQL_TABLE_TYPE;
3140
8.77k
  const bool is_pg_catalog_table = is_pg_table && orig_req->is_pg_catalog_table();
3141
8.77k
  if (!is_pg_catalog_table || !FLAGS_hide_pg_catalog_table_creation_logs) {
3142
8.51k
    LOG(INFO) << "CreateTable from " << RequestorString(rpc)
3143
8.51k
                << ":\n" << orig_req->DebugString();
3144
8.51k
  } else {
3145
256
    LOG(INFO) << "CreateTable from " << RequestorString(rpc) << ": " << orig_req->name();
3146
256
  }
3147
3148
8.77k
  const bool is_transactional = orig_req->schema().table_properties().is_transactional();
3149
  // If this is a transactional table, we need to create the transaction status table (if it does
3150
  // not exist already).
3151
8.77k
  if (is_transactional && (!is_pg_catalog_table || !FLAGS_create_initial_sys_catalog_snapshot)) {
3152
5.40k
    Status s = CreateGlobalTransactionStatusTableIfNeeded(rpc);
3153
5.40k
    if (!s.ok()) {
3154
0
      return s.CloneAndPrepend("Error while creating transaction status table");
3155
0
    }
3156
5.40k
  } else {
3157
3.36k
    VLOG(1)
3158
0
        << "Not attempting to create a transaction status table:\n"
3159
0
        << "  " << EXPR_VALUE_FOR_LOG(is_transactional) << "\n "
3160
0
        << "  " << EXPR_VALUE_FOR_LOG(is_pg_catalog_table) << "\n "
3161
0
        << "  " << EXPR_VALUE_FOR_LOG(FLAGS_create_initial_sys_catalog_snapshot);
3162
3.36k
  }
3163
3164
  // If this is a transactional table and there is an associated tablespace, try to create a
3165
  // local transaction status table for the tablespace if there is a placement attached to it
3166
  // (and if it does not exist already).
3167
8.77k
  if (GetAtomicFlag(&FLAGS_auto_create_local_transaction_tables)) {
3168
8.77k
    if (is_transactional && orig_req->has_tablespace_id()) {
3169
169
      const auto& tablespace_id = orig_req->tablespace_id();
3170
169
      auto tablespace_pb = VERIFY_RESULT(GetTablespaceReplicationInfoWithRetry(tablespace_id));
3171
169
      if (tablespace_pb) {
3172
58
        RETURN_NOT_OK(CreateLocalTransactionStatusTableIfNeeded(rpc, tablespace_id));
3173
111
      } else {
3174
111
        VLOG(1)
3175
0
            << "Not attempting to create a local transaction status table: "
3176
0
            << "tablespace " << EXPR_VALUE_FOR_LOG(tablespace_id) << " has no placement\n";
3177
111
      }
3178
8.60k
    } else {
3179
8.60k
        VLOG(1)
3180
0
            << "Not attempting to create a local transaction status table:\n"
3181
0
            << "  " << EXPR_VALUE_FOR_LOG(is_transactional) << "\n "
3182
0
            << "  " << EXPR_VALUE_FOR_LOG(orig_req->has_tablespace_id());
3183
8.60k
    }
3184
8.77k
  }
3185
3186
8.76k
  if (is_pg_catalog_table) {
3187
313
    return CreateYsqlSysTable(orig_req, resp);
3188
313
  }
3189
3190
8.45k
  Status s;
3191
8.45k
  const char* const object_type = PROTO_PTR_IS_TABLE(orig_req) ? "table" : "index";
3192
3193
  // Copy the request, so we can fill in some defaults.
3194
8.45k
  CreateTableRequestPB req = *orig_req;
3195
3196
  // Lookup the namespace and verify if it exists.
3197
8.45k
  TRACE("Looking up namespace");
3198
8.45k
  auto ns = VERIFY_RESULT(FindNamespace(req.namespace_()));
3199
0
  bool colocated;
3200
8.44k
  NamespaceId namespace_id;
3201
8.44k
  NamespaceName namespace_name;
3202
8.44k
  {
3203
8.44k
    auto ns_lock = ns->LockForRead();
3204
8.44k
    if (ns->database_type() != GetDatabaseTypeForTable(req.table_type())) {
3205
0
      Status s = STATUS(NotFound, "Namespace not found");
3206
0
      return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s);
3207
0
    }
3208
8.44k
    namespace_id = ns->id();
3209
8.44k
    namespace_name = ns->name();
3210
8.44k
    colocated = ns->colocated();
3211
8.44k
  }
3212
3213
  // For index table, find the table info
3214
0
  scoped_refptr<TableInfo> indexed_table;
3215
8.44k
  if (IsIndex(req)) {
3216
1.20k
    TRACE("Looking up indexed table");
3217
1.20k
    indexed_table = GetTableInfo(req.indexed_table_id());
3218
1.20k
    if (indexed_table == nullptr) {
3219
0
      return STATUS_SUBSTITUTE(
3220
0
            NotFound, "The indexed table $0 does not exist", req.indexed_table_id());
3221
0
    }
3222
3223
1.20k
    TRACE("Locking indexed table");
3224
1.20k
    RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(indexed_table->LockForRead(), resp));
3225
1.20k
  }
3226
3227
  // Determine if this table should be colocated. If not specified, the table should be colocated if
3228
  // and only if the namespace is colocated.
3229
8.44k
  if (!req.colocated()) {
3230
    // Opt out of colocation if the request says so.
3231
3.97k
    colocated = false;
3232
4.47k
  } else if (indexed_table && !indexed_table->colocated()) {
3233
    // Opt out of colocation if the indexed table opted out of colocation.
3234
1.17k
    colocated = false;
3235
1.17k
  }
3236
3237
  // TODO: If this is a colocated index table in a colocated database, convert any hash partition
3238
  // columns into range partition columns. This is because postgres does not know that this index
3239
  // table is in a colocated database. When we get to the "tablespaces" step where we store this
3240
  // into PG metadata, then PG will know if db/table is colocated and do the work there.
3241
8.44k
  if ((colocated || req.has_tablegroup_id()) && IsIndex(req)) {
3242
65
    for (auto& col_pb : *req.mutable_schema()->mutable_columns()) {
3243
65
      col_pb.set_is_hash_key(false);
3244
65
    }
3245
29
  }
3246
3247
  // Validate schema.
3248
8.44k
  Schema schema;
3249
8.44k
  RETURN_NOT_OK(SchemaFromPB(req.schema(), &schema));
3250
8.44k
  RETURN_NOT_OK(ValidateCreateTableSchema(schema, resp));
3251
3252
  // checking that referenced user-defined types (if any) exist.
3253
8.44k
  {
3254
8.44k
    SharedLock lock(mutex_);
3255
31.0k
    for (size_t i = 0; i < schema.num_columns(); i++) {
3256
22.6k
      for (const auto &udt_id : schema.column(i).type()->GetUserDefinedTypeIds()) {
3257
73
        if (FindPtrOrNull(udtype_ids_map_, udt_id) == nullptr) {
3258
0
          Status s = STATUS(InvalidArgument, "Referenced user-defined type not found");
3259
0
          return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
3260
0
        }
3261
73
      }
3262
22.6k
    }
3263
8.44k
  }
3264
  // TODO (ENG-1860) The referenced namespace and types retrieved/checked above could be deleted
3265
  // some time between this point and table creation below.
3266
3267
  // Usually the column ids are available if it's called on the backup-restoring code path
3268
  // (from CatalogManager::RecreateTable). Else the column ids must be empty in the client schema.
3269
8.44k
  if (!schema.has_column_ids()) {
3270
8.44k
    schema.InitColumnIdsByDefault();
3271
8.44k
  }
3272
3273
8.44k
  if (schema.table_properties().HasCopartitionTableId()) {
3274
0
    return CreateCopartitionedTable(req, resp, rpc, schema, ns);
3275
0
  }
3276
3277
8.44k
  if (colocated || req.has_tablegroup_id()) {
3278
    // If the table is colocated, then there should be no hash partition columns.
3279
    // Do the same for tables that are being placed in tablegroups.
3280
203
    if (schema.num_hash_key_columns() > 0) {
3281
3
      Status s = STATUS(InvalidArgument, "Cannot colocate hash partitioned table");
3282
3
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
3283
3
    }
3284
8.24k
  } else if (
3285
8.24k
      !req.partition_schema().has_hash_schema() && !req.partition_schema().has_range_schema()) {
3286
    // If neither hash nor range schema have been specified by the protobuf request, we assume the
3287
    // table uses a hash schema, and we use the table_type and hash_key to determine the hashing
3288
    // scheme (redis or multi-column) that should be used.
3289
3.49k
    if (req.table_type() == REDIS_TABLE_TYPE) {
3290
418
      req.mutable_partition_schema()->set_hash_schema(PartitionSchemaPB::REDIS_HASH_SCHEMA);
3291
3.07k
    } else if (schema.num_hash_key_columns() > 0) {
3292
3.07k
      req.mutable_partition_schema()->set_hash_schema(PartitionSchemaPB::MULTI_COLUMN_HASH_SCHEMA);
3293
3.07k
    } else {
3294
0
      Status s = STATUS(InvalidArgument, "Unknown table type or partitioning method");
3295
0
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
3296
0
    }
3297
3.49k
  }
3298
3299
  // Verify that custom placement policy has not been specified for colocated table.
3300
8.44k
  const bool is_replication_info_set = IsReplicationInfoSet(req.replication_info());
3301
8.44k
  if (is_replication_info_set && colocated) {
3302
0
    Status s = STATUS(InvalidArgument, "Custom placement policy should not be set for "
3303
0
      "colocated tables");
3304
0
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_TABLE_REPLICATION_INFO, s);
3305
0
  }
3306
3307
8.44k
  if (is_replication_info_set && req.table_type() == PGSQL_TABLE_TYPE) {
3308
0
    const Status s = STATUS(InvalidArgument, "Cannot set placement policy for YSQL tables "
3309
0
        "use Tablespaces instead");
3310
0
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s);
3311
0
  }
3312
3313
  // Get placement info.
3314
8.44k
  const ReplicationInfoPB& replication_info = VERIFY_RESULT(
3315
8.44k
    GetTableReplicationInfo(req.replication_info(), req.tablespace_id()));
3316
0
  const PlacementInfoPB& placement_info = replication_info.live_replicas();
3317
3318
  // Calculate number of tablets to be used. Priorities:
3319
  //   1. Use Internally specified value from 'CreateTableRequestPB::num_tablets'.
3320
  //   2. Use User specified value from
3321
  //      'CreateTableRequestPB::SchemaPB::TablePropertiesPB::num_tablets'.
3322
  //      Note, that the number will be saved in schema stored in the master persistent
3323
  //      SysCatalog irrespective of which way we choose the number of tablets to create.
3324
  //      If nothing is specified in this field, nothing will be stored in the table
3325
  //      TablePropertiesPB for number of tablets
3326
  //   3. Calculate own value.
3327
8.44k
  int num_tablets = 0;
3328
8.44k
  if (req.has_num_tablets()) {
3329
6.69k
    num_tablets = req.num_tablets(); // Internal request.
3330
6.69k
  }
3331
3332
8.44k
  if (num_tablets <= 0 && schema.table_properties().HasNumTablets()) {
3333
1.15k
    num_tablets = schema.table_properties().num_tablets(); // User request.
3334
1.15k
  }
3335
3336
8.44k
  if (num_tablets <= 0) {
3337
    // Use default as client could have gotten the value before any tserver had heartbeated
3338
    // to (a new) master leader.
3339
599
    const auto num_live_tservers =
3340
599
        GetNumLiveTServersForPlacement(placement_info.placement_uuid());
3341
599
    num_tablets = narrow_cast<int>(
3342
599
        num_live_tservers * (is_pg_table ? FLAGS_ysql_num_shards_per_tserver
3343
599
                                         : FLAGS_yb_num_shards_per_tserver));
3344
599
    LOG(INFO) << "Setting default tablets to " << num_tablets << " with "
3345
599
              << num_live_tservers << " primary servers";
3346
599
  }
3347
3348
  // Create partitions.
3349
8.44k
  PartitionSchema partition_schema;
3350
8.44k
  vector<Partition> partitions;
3351
8.44k
  if (colocated || req.has_tablegroup_id()) {
3352
200
    RETURN_NOT_OK(partition_schema.CreatePartitions(1, &partitions));
3353
200
    req.clear_partition_schema();
3354
200
    num_tablets = 1;
3355
8.24k
  } else {
3356
8.24k
    RETURN_NOT_OK(PartitionSchema::FromPB(req.partition_schema(), schema, &partition_schema));
3357
8.24k
    if (req.partitions_size() > 0) {
3358
1
      if (req.partitions_size() != num_tablets) {
3359
0
        Status s = STATUS(InvalidArgument, "Partitions are not defined for all tablets");
3360
0
        return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
3361
0
      }
3362
1
      string last;
3363
2
      for (const auto& p : req.partitions()) {
3364
2
        Partition np;
3365
2
        Partition::FromPB(p, &np);
3366
2
        if (np.partition_key_start() != last) {
3367
0
          Status s = STATUS(InvalidArgument,
3368
0
                            "Partitions does not cover the full partition keyspace");
3369
0
          return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
3370
0
        }
3371
2
        last = np.partition_key_end();
3372
2
        partitions.push_back(std::move(np));
3373
2
      }
3374
8.24k
    } else {
3375
      // Supplied number of partitions is merely a suggestion, actual number of
3376
      // created partitions might differ.
3377
8.24k
      RETURN_NOT_OK(partition_schema.CreatePartitions(num_tablets, &partitions));
3378
8.24k
    }
3379
    // The vector 'partitions' contains real setup partitions, so the variable
3380
    // should be updated.
3381
8.24k
    num_tablets = narrow_cast<int>(partitions.size());
3382
8.24k
  }
3383
3384
8.44k
  TSDescriptorVector all_ts_descs;
3385
8.44k
  master_->ts_manager()->GetAllLiveDescriptors(&all_ts_descs);
3386
8.44k
  RETURN_NOT_OK(CheckNumReplicas(placement_info, all_ts_descs, partitions, resp));
3387
3388
8.44k
  if (!FLAGS_TEST_skip_placement_validation_createtable_api) {
3389
8.43k
    ValidateReplicationInfoRequestPB validate_req;
3390
8.43k
    validate_req.mutable_replication_info()->CopyFrom(replication_info);
3391
8.43k
    ValidateReplicationInfoResponsePB validate_resp;
3392
8.43k
    RETURN_NOT_OK(ValidateReplicationInfo(&validate_req, &validate_resp));
3393
8.43k
  }
3394
3395
8.42k
  LOG(INFO) << "Set number of tablets: " << num_tablets;
3396
8.42k
  req.set_num_tablets(num_tablets);
3397
3398
  // For index table, populate the index info.
3399
8.42k
  IndexInfoPB index_info;
3400
3401
8.42k
  const bool index_backfill_enabled =
3402
8.42k
      IsIndexBackfillEnabled(orig_req->table_type(), is_transactional);
3403
8.42k
  if (req.has_index_info()) {
3404
    // Current message format.
3405
1.18k
    index_info.CopyFrom(req.index_info());
3406
3407
    // Assign column-ids that have just been computed and assigned to "index_info".
3408
1.18k
    if (!is_pg_table) {
3409
444
      DCHECK_EQ(index_info.columns().size(), schema.num_columns())
3410
0
        << "Number of columns are not the same between index_info and index_schema";
3411
2.18k
      for (size_t colidx = 0; colidx < schema.num_columns(); colidx++) {
3412
1.73k
        index_info.mutable_columns(narrow_cast<int>(colidx))->set_column_id(
3413
1.73k
            schema.column_id(colidx));
3414
1.73k
      }
3415
444
    }
3416
7.24k
  } else if (req.has_indexed_table_id()) {
3417
    // Old client message format when rolling upgrade (Not having "index_info").
3418
18
    IndexInfoBuilder index_info_builder(&index_info);
3419
18
    index_info_builder.ApplyProperties(req.indexed_table_id(),
3420
18
        req.is_local_index(), req.is_unique_index());
3421
18
    if (orig_req->table_type() != PGSQL_TABLE_TYPE) {
3422
18
      Schema indexed_schema;
3423
18
      RETURN_NOT_OK(indexed_table->GetSchema(&indexed_schema));
3424
18
      RETURN_NOT_OK(index_info_builder.ApplyColumnMapping(indexed_schema, schema));
3425
18
    }
3426
18
  }
3427
3428
8.42k
  if ((req.has_index_info() || req.has_indexed_table_id()) &&
3429
8.42k
      index_backfill_enabled &&
3430
8.42k
      !req.skip_index_backfill()) {
3431
    // Start off the index table with major compactions disabled. We need this to retain the delete
3432
    // markers until the backfill process is completed.  No need to set index_permissions in the
3433
    // index table.
3434
937
    schema.SetRetainDeleteMarkers(true);
3435
937
  }
3436
3437
8.42k
  LOG(INFO) << "CreateTable with IndexInfo " << AsString(index_info);
3438
3439
8.42k
  scoped_refptr<TableInfo> table;
3440
8.42k
  TabletInfos tablets;
3441
8.42k
  bool tablets_exist;
3442
8.42k
  bool tablegroup_tablets_exist = false;
3443
3444
8.42k
  {
3445
8.42k
    LockGuard lock(mutex_);
3446
8.42k
    auto ns_lock = ns->LockForRead();
3447
8.42k
    TRACE("Acquired catalog manager lock");
3448
3449
8.42k
    tablets_exist =
3450
8.42k
        colocated && colocated_tablet_ids_map_.find(ns->id()) != colocated_tablet_ids_map_.end();
3451
    // Verify that the table does not exist.
3452
8.42k
    table = FindPtrOrNull(table_names_map_, {namespace_id, req.name()});
3453
3454
8.42k
    if (table != nullptr) {
3455
4
      s = STATUS_SUBSTITUTE(AlreadyPresent,
3456
4
              "Object '$0.$1' already exists", ns->name(), table->name());
3457
4
      LOG(WARNING) << "Found table: " << table->ToStringWithState()
3458
4
                   << ". Failed creating table with error: "
3459
4
                   << s.ToString() << " Request:\n" << orig_req->DebugString();
3460
      // If the table already exists, we set the response table_id field to the id of the table that
3461
      // already exists. This is necessary because before we return the error to the client (or
3462
      // success in case of a "CREATE TABLE IF NOT EXISTS" request) we want to wait for the existing
3463
      // table to be available to receive requests. And we need the table id for that.
3464
4
      resp->set_table_id(table->id());
3465
4
      return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s);
3466
4
    }
3467
3468
    // Namespace state validity check:
3469
    // 1. Allow Namespaces that are RUNNING
3470
    // 2. Allow Namespaces that are PREPARING under 2 situations
3471
    //    2a. System Namespaces.
3472
    //    2b. The parent table from a Colocated Namespace.
3473
8.42k
    const auto parent_table_name = ns->id() + kColocatedParentTableNameSuffix;
3474
8.42k
    bool valid_ns_state = (ns->state() == SysNamespaceEntryPB::RUNNING) ||
3475
8.42k
      (ns->state() == SysNamespaceEntryPB::PREPARING &&
3476
18
        (ns->name() == kSystemNamespaceName || req.name() == parent_table_name));
3477
8.42k
    if (!valid_ns_state) {
3478
1
      Status s = STATUS_SUBSTITUTE(TryAgain, "Invalid Namespace State ($0). Cannot create $1.$2",
3479
1
          SysNamespaceEntryPB::State_Name(ns->state()), ns->name(), req.name());
3480
1
      return SetupError(resp->mutable_error(), NamespaceMasterError(ns->state()), s);
3481
1
    }
3482
3483
    // Check whether this CREATE TABLE request which has a tablegroup_id is for a normal user table
3484
    // or the request to create the parent table for the tablegroup. This is done by checking the
3485
    // catalog manager maps.
3486
8.42k
    if (req.has_tablegroup_id() &&
3487
8.42k
        tablegroup_tablet_ids_map_.find(ns->id()) != tablegroup_tablet_ids_map_.end() &&
3488
8.42k
        tablegroup_tablet_ids_map_[ns->id()].find(req.tablegroup_id()) !=
3489
124
        tablegroup_tablet_ids_map_[ns->id()].end()) {
3490
89
      tablegroup_tablets_exist = true;
3491
89
    }
3492
3493
    // Generate colocation ID in advance in order to fail before CreateTableInMemory is called.
3494
8.42k
    ColocationId colocation_id = kColocationIdNotSet;
3495
8.42k
    if (req.has_tablegroup_id() && tablegroup_tablets_exist) {
3496
89
      auto tablegroup = tablegroup_ids_map_[req.tablegroup_id()];
3497
3498
89
      colocation_id = VERIFY_RESULT(
3499
87
          ConceiveColocationId(req, resp, [tablegroup](auto colocation_id) {
3500
87
            return tablegroup->HasChildTable(colocation_id);
3501
87
          }));
3502
8.33k
    } else if (colocated && tablets_exist) {
3503
40
      auto tablet = colocated_tablet_ids_map_[ns->id()];
3504
40
      auto tablet_lock = tablet->LockForWrite();
3505
3506
40
      std::set<ColocationId> colocation_ids;
3507
40
      if (!req.has_colocation_id()) {
3508
148
        for (const TableId& table_id : tablet_lock.data().pb.table_ids()) {
3509
148
          DCHECK(!table_id.empty());
3510
148
          const auto colocated_table_info = GetTableInfoUnlocked(table_id);
3511
148
          if (!colocated_table_info) {
3512
            // Needed because of #11129, should be replaced with DCHECK after the fix.
3513
0
            continue;
3514
0
          }
3515
148
          Schema colocated_table_schema;
3516
148
          RETURN_NOT_OK(colocated_table_info->GetSchema(&colocated_table_schema));
3517
148
          colocation_ids.insert(colocated_table_schema.colocation_id());
3518
148
        }
3519
37
      }
3520
3521
40
      colocation_id = VERIFY_RESULT(
3522
40
          ConceiveColocationId(req, resp, [&colocation_ids](auto colocation_id) {
3523
40
            return ContainsKey(colocation_ids, colocation_id);
3524
40
          }));
3525
40
    }
3526
3527
8.42k
    RETURN_NOT_OK(CreateTableInMemory(
3528
8.42k
        req, schema, partition_schema, namespace_id, namespace_name, partitions, &index_info,
3529
8.42k
        (!tablets_exist && !tablegroup_tablets_exist) ? &tablets : nullptr, resp, &table));
3530
3531
    // Section is executed when a table is either the parent table or a user table in a tablegroup.
3532
    // It additionally sets the table metadata (and tablet metadata if this is the parent table)
3533
    // to have the colocated property so we can take advantage of code reuse.
3534
8.42k
    if (req.has_tablegroup_id()) {
3535
141
      table->mutable_metadata()->mutable_dirty()->pb.set_colocated(true);
3536
141
      if (tablegroup_tablets_exist) {
3537
        // If the table is not a tablegroup parent table, it performs a lookup for the proper tablet
3538
        // to place the table on as a child table.
3539
87
        auto tablet = tablegroup_tablet_ids_map_[ns->id()][req.tablegroup_id()];
3540
87
        RSTATUS_DCHECK(
3541
87
            tablet->colocated(), InternalError,
3542
87
            "The tablet for tablegroup should be colocated.");
3543
87
        tablets.push_back(tablet.get());
3544
87
        auto tablet_lock = tablet->LockForWrite();
3545
87
        tablet_lock.mutable_data()->pb.add_table_ids(table->id());
3546
87
        RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), tablet));
3547
87
        tablet_lock.Commit();
3548
3549
87
        auto tablegroup = tablegroup_ids_map_[req.tablegroup_id()];
3550
3551
87
        CHECK(colocation_id != kColocationIdNotSet);
3552
87
        table->mutable_metadata()->mutable_dirty()->
3553
87
            pb.mutable_schema()->mutable_colocated_table_id()->set_colocation_id(colocation_id);
3554
3555
87
        tablet->mutable_metadata()->StartMutation();
3556
87
        table->AddTablet(tablet);
3557
87
        tablegroup->AddChildTable(table->id(), colocation_id);
3558
3559
87
        table_tablegroup_ids_map_[table->id()] = tablegroup->id();
3560
87
      } else {
3561
        // If the table is a tablegroup parent table, it creates a dummy tablet for the tablegroup
3562
        // along with updating the catalog manager maps.
3563
54
        RSTATUS_DCHECK_EQ(
3564
54
            tablets.size(), 1U, InternalError,
3565
54
            "Only one tablet should be created for each tablegroup");
3566
54
        tablets[0]->mutable_metadata()->mutable_dirty()->pb.set_colocated(true);
3567
        // Update catalog manager maps for tablegroups
3568
54
        tablegroup_tablet_ids_map_[ns->id()][req.tablegroup_id()] =
3569
54
            tablet_map_->find(tablets[0]->id())->second;
3570
54
      }
3571
8.28k
    } else if (colocated) {
3572
57
      table->mutable_metadata()->mutable_dirty()->pb.set_colocated(true);
3573
      // if the tablet already exists, add the tablet to tablets
3574
57
      if (tablets_exist) {
3575
40
        auto tablet = colocated_tablet_ids_map_[ns->id()];
3576
40
        RSTATUS_DCHECK(
3577
40
            tablet->colocated(), InternalError,
3578
40
            "The tablet for colocated database should be colocated.");
3579
40
        tablets.push_back(tablet.get());
3580
3581
40
        auto tablet_lock = tablet->LockForWrite();
3582
3583
40
        tablet_lock.mutable_data()->pb.add_table_ids(table->id());
3584
40
        RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), tablet));
3585
40
        tablet_lock.Commit();
3586
3587
40
        CHECK(colocation_id != kColocationIdNotSet);
3588
40
        table->mutable_metadata()->mutable_dirty()->
3589
40
            pb.mutable_schema()->mutable_colocated_table_id()->set_colocation_id(colocation_id);
3590
3591
40
        tablet->mutable_metadata()->StartMutation();
3592
40
        table->AddTablet(tablet);
3593
40
      } else {  // Record the tablet
3594
17
        RSTATUS_DCHECK_EQ(
3595
17
            tablets.size(), 1U, InternalError,
3596
17
            "Only one tablet should be created for each colocated database");
3597
17
        tablets[0]->mutable_metadata()->mutable_dirty()->pb.set_colocated(true);
3598
17
        colocated_tablet_ids_map_[ns->id()] = tablet_map_->find(tablets[0]->id())->second;
3599
17
      }
3600
57
    }
3601
8.42k
    if (req.has_matview_pg_table_id()) {
3602
24
      matview_pg_table_ids_map_[req.table_id()] = req.matview_pg_table_id();
3603
24
    }
3604
8.42k
  }
3605
3606
  // For create transaction table requests with tablespace id, save the tablespace id.
3607
0
  const auto is_transaction_status_table =
3608
8.42k
      orig_req->table_type() == TableType::TRANSACTION_STATUS_TABLE_TYPE;
3609
8.42k
  if (is_transaction_status_table && req.has_tablespace_id()) {
3610
22
    table->mutable_metadata()->mutable_dirty()->pb.set_transaction_table_tablespace_id(
3611
22
        req.tablespace_id());
3612
22
  }
3613
3614
  // Tables with a transaction should be rolled back if the transaction does not get committed.
3615
  // Store this on the table persistent state until the transaction has been a verified success.
3616
8.42k
  TransactionMetadata txn;
3617
8.42k
  if (req.has_transaction() && FLAGS_enable_transactional_ddl_gc) {
3618
4.72k
    table->mutable_metadata()->mutable_dirty()->pb.mutable_transaction()->
3619
4.72k
        CopyFrom(req.transaction());
3620
4.72k
    txn = VERIFY_RESULT(TransactionMetadata::FromPB(req.transaction()));
3621
4.72k
    RSTATUS_DCHECK(!txn.status_tablet.empty(), Corruption, "Given incomplete Transaction");
3622
4.72k
  }
3623
3624
8.42k
  if (PREDICT_FALSE(FLAGS_TEST_simulate_slow_table_create_secs > 0) &&
3625
8.42k
      req.table_type() != TableType::TRANSACTION_STATUS_TABLE_TYPE) {
3626
12
    LOG(INFO) << "Simulating slow table creation";
3627
12
    SleepFor(MonoDelta::FromSeconds(FLAGS_TEST_simulate_slow_table_create_secs));
3628
12
  }
3629
3630
  // NOTE: the table and tablets are already locked for write at this point,
3631
  // since the CreateTableInfo/CreateTabletInfo functions leave them in that state.
3632
  // They will get committed at the end of this function.
3633
  // Sanity check: the tables and tablets should all be in "preparing" state.
3634
8.42k
  CHECK_EQ(SysTablesEntryPB::PREPARING, table->metadata().dirty().pb.state());
3635
  // Update the on-disk table state to "running".
3636
8.42k
  table->mutable_metadata()->mutable_dirty()->pb.set_state(SysTablesEntryPB::RUNNING);
3637
8.42k
  TRACE("Inserted new table and tablet info into CatalogManager maps");
3638
8.42k
  VLOG_WITH_PREFIX(1) << "Inserted new table and tablet info into CatalogManager maps";
3639
3640
8.42k
  if (!tablets_exist && !tablegroup_tablets_exist) {
3641
    // Write Tablets to sys-tablets (in "preparing" state).
3642
48.4k
    for (const auto& tablet : tablets) {
3643
48.4k
      CHECK_EQ(SysTabletsEntryPB::PREPARING, tablet->metadata().dirty().pb.state());
3644
48.4k
    }
3645
8.29k
  }
3646
3647
8.42k
  s = sys_catalog_->Upsert(leader_ready_term(), table, tablets);
3648
8.42k
  if (PREDICT_FALSE(!s.ok())) {
3649
6
    return AbortTableCreation(
3650
6
        table.get(), tablets, s.CloneAndPrepend("An error occurred while inserting to sys-tablets"),
3651
6
        resp);
3652
6
  }
3653
8.41k
  TRACE("Wrote table and tablets to system table");
3654
3655
  // For index table, insert index info in the indexed table.
3656
8.41k
  if ((req.has_index_info() || req.has_indexed_table_id())) {
3657
1.19k
    if (index_backfill_enabled && !req.skip_index_backfill()) {
3658
931
      if (is_pg_table) {
3659
        // YSQL: start at some permission before backfill.  The real enforcement happens with
3660
        // pg_index system table's indislive and indisready columns.  Choose WRITE_AND_DELETE
3661
        // because it will probably be less confusing.
3662
552
        index_info.set_index_permissions(INDEX_PERM_WRITE_AND_DELETE);
3663
552
      } else {
3664
        // YCQL
3665
379
        index_info.set_index_permissions(INDEX_PERM_DELETE_ONLY);
3666
379
      }
3667
931
    }
3668
1.19k
    s = AddIndexInfoToTable(indexed_table, index_info, resp);
3669
1.19k
    if (PREDICT_FALSE(!s.ok())) {
3670
0
      return AbortTableCreation(
3671
0
          table.get(), tablets, s.CloneAndPrepend("An error occurred while inserting index info"),
3672
0
          resp);
3673
0
    }
3674
1.19k
  }
3675
3676
  // Commit the in-memory state.
3677
8.41k
  table->mutable_metadata()->CommitMutation();
3678
3679
48.5k
  for (const auto& tablet : tablets) {
3680
48.5k
    tablet->mutable_metadata()->CommitMutation();
3681
48.5k
  }
3682
3683
8.41k
  if ((colocated && tablets_exist) || (req.has_tablegroup_id() && tablegroup_tablets_exist)) {
3684
127
    auto call =
3685
127
        std::make_shared<AsyncAddTableToTablet>(master_, AsyncTaskPool(), tablets[0], table);
3686
127
    table->AddTask(call);
3687
127
    WARN_NOT_OK(ScheduleTask(call), "Failed to send AddTableToTablet request");
3688
127
  }
3689
3690
8.41k
  if (req.has_creator_role_name()) {
3691
242
    const NamespaceName& keyspace_name = req.namespace_().name();
3692
242
    const TableName& table_name = req.name();
3693
242
    RETURN_NOT_OK(permissions_manager_->GrantPermissions(
3694
242
        req.creator_role_name(),
3695
242
        get_canonical_table(keyspace_name, table_name),
3696
242
        table_name,
3697
242
        keyspace_name,
3698
242
        all_permissions_for_resource(ResourceType::TABLE),
3699
242
        ResourceType::TABLE,
3700
242
        resp));
3701
242
  }
3702
3703
  // Verify Transaction gets committed, which occurs after table create finishes.
3704
8.41k
  if (req.has_transaction() && PREDICT_TRUE(FLAGS_enable_transactional_ddl_gc)) {
3705
4.71k
    LOG(INFO) << "Enqueuing table for Transaction Verification: " << req.name();
3706
4.71k
    std::function<Status(bool)> when_done =
3707
4.71k
        std::bind(&CatalogManager::VerifyTablePgLayer, this, table, _1);
3708
4.71k
    WARN_NOT_OK(background_tasks_thread_pool_->SubmitFunc(
3709
4.71k
        std::bind(&YsqlTransactionDdl::VerifyTransaction, ysql_transaction_.get(), txn, when_done)),
3710
4.71k
        "Could not submit VerifyTransaction to thread pool");
3711
4.71k
  }
3712
3713
8.41k
  LOG(INFO) << "Successfully created " << object_type << " " << table->ToString() << " in "
3714
8.41k
            << ns->ToString() << " per request from " << RequestorString(rpc);
3715
8.41k
  background_tasks_->Wake();
3716
3717
8.41k
  if (FLAGS_master_enable_metrics_snapshotter &&
3718
8.41k
      !(req.table_type() == TableType::YQL_TABLE_TYPE &&
3719
2
        namespace_id == kSystemNamespaceId &&
3720
2
        req.name() == kMetricsSnapshotsTableName)) {
3721
1
    Status s = CreateMetricsSnapshotsTableIfNeeded(rpc);
3722
1
    if (!s.ok()) {
3723
0
      return s.CloneAndPrepend("Error while creating metrics snapshots table");
3724
0
    }
3725
1
  }
3726
3727
  // Increment transaction status version if needed.
3728
8.41k
  if (is_transaction_status_table) {
3729
1.09k
    RETURN_NOT_OK(IncrementTransactionTablesVersion());
3730
1.09k
  }
3731
3732
8.41k
  DVLOG(3) << __PRETTY_FUNCTION__ << " Done.";
3733
8.41k
  return Status::OK();
3734
8.41k
}
3735
3736
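Both colocation branches in CreateTable obtain a fresh colocation id through ConceiveColocationId, parameterized by a taken(id) predicate: tablegroup->HasChildTable for tablegroups, membership in the set of ids already read off the colocated tablet otherwise. A sketch of that contract follows; the random candidate draw and attempt bound are assumptions for illustration, not the real generation scheme.

    #include <cstdint>
    #include <functional>
    #include <optional>
    #include <random>

    using ColocationId = uint32_t;

    // Draw candidates until the predicate reports one free, else give up.
    std::optional<ColocationId> PickUnusedColocationId(
        const std::function<bool(ColocationId)>& taken, int max_attempts = 100) {
      std::mt19937 rng{std::random_device{}()};
      std::uniform_int_distribution<ColocationId> dist(1, 1u << 30);
      for (int i = 0; i < max_attempts; ++i) {
        const ColocationId candidate = dist(rng);
        if (!taken(candidate)) {
          return candidate;
        }
      }
      return std::nullopt;  // Caller surfaces an error, as CreateTable would.
    }

A caller would pass a closure in the spirit of the lambdas above, e.g. PickUnusedColocationId([&](auto id) { return ContainsKey(colocation_ids, id); }).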
4.76k
Status CatalogManager::VerifyTablePgLayer(scoped_refptr<TableInfo> table, bool rpc_success) {
3737
  // Upon Transaction completion, check pg system table using OID to ensure SUCCESS.
3738
4.76k
  const uint32_t database_oid = VERIFY_RESULT(GetPgsqlDatabaseOidByTableId(table->id()));
3739
0
  const auto pg_table_id = GetPgsqlTableId(database_oid, kPgClassTableOid);
3740
4.76k
  auto table_storage_id = GetPgsqlTableOid(table->id());
3741
4.76k
  {
3742
4.76k
    SharedLock lock(mutex_);
3743
4.76k
    if (matview_pg_table_ids_map_.find(table->id()) != matview_pg_table_ids_map_.end()) {
3744
24
      table_storage_id = GetPgsqlTableOid(matview_pg_table_ids_map_[table->id()]);
3745
24
    }
3746
4.76k
  }
3747
4.76k
  auto entry_exists = VERIFY_RESULT(
3748
4.76k
      ysql_transaction_->PgEntryExists(pg_table_id, table_storage_id));
3749
0
  auto l = table->LockForWrite();
3750
4.76k
  auto& metadata = table->mutable_metadata()->mutable_dirty()->pb;
3751
3752
4.76k
  SCHECK(metadata.state() == SysTablesEntryPB::RUNNING ||
3753
4.76k
         metadata.state() == SysTablesEntryPB::ALTERING, Aborted,
3754
4.76k
         Substitute("Unexpected table state ($0), abandoning transaction GC work for $1",
3755
4.76k
                    SysTablesEntryPB_State_Name(metadata.state()), table->ToString()));
3756
3757
  // #5981: Mark un-retryable rpc failures as pass to avoid infinite retry of GC'd txns.
3758
4.67k
  const bool txn_check_passed = entry_exists || !rpc_success;
3759
3760
4.67k
  if (txn_check_passed) {
3761
    // Remove the transaction from the entry since we're done processing it.
3762
4.61k
    metadata.clear_transaction();
3763
4.61k
    RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), table));
3764
4.61k
    if (entry_exists) {
3765
4.61k
      LOG_WITH_PREFIX(INFO) << "Table transaction succeeded: " << table->ToString();
3766
18.4E
    } else {
3767
18.4E
      LOG_WITH_PREFIX(WARNING)
3768
18.4E
          << "Unknown RPC failure, removing transaction on table: " << table->ToString();
3769
18.4E
    }
3770
    // Commit the in-memory state.
3771
4.61k
    l.Commit();
3772
4.61k
  } else {
3773
64
    LOG(INFO) << "Table transaction failed, deleting: " << table->ToString();
3774
    // Async enqueue delete.
3775
64
    DeleteTableRequestPB del_tbl_req;
3776
64
    del_tbl_req.mutable_table()->set_table_name(table->name());
3777
64
    del_tbl_req.mutable_table()->set_table_id(table->id());
3778
64
    del_tbl_req.set_is_index_table(table->is_index());
3779
3780
64
    RETURN_NOT_OK(background_tasks_thread_pool_->SubmitFunc( [this, del_tbl_req]() {
3781
64
      DeleteTableResponsePB del_tbl_resp;
3782
64
      WARN_NOT_OK(DeleteTable(&del_tbl_req, &del_tbl_resp, nullptr),
3783
64
          "Failed to Delete Table with failed transaction");
3784
64
    }));
3785
64
  }
3786
4.67k
  return Status::OK();
3787
4.67k
}
3788
3789
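VerifyTablePgLayer is the when_done callback bound during table creation: once the DDL transaction resolves, the presence of the table's pg_class row decides between keeping the table (clearing the stored transaction metadata) and enqueuing an async DeleteTable. A condensed sketch of that decision, with hypothetical callback types standing in for the catalog manager plumbing:

    #include <functional>

    struct VerifyActions {
      std::function<void()> clear_txn_metadata;    // Commit path: keep the table.
      std::function<void()> enqueue_delete_table;  // Rollback path: drop it.
    };

    // entry_exists: the pg_class row for the table's OID was found.
    // rpc_success: the existence probe itself completed without error.
    void ResolveDdlTransaction(bool entry_exists, bool rpc_success,
                               const VerifyActions& actions) {
      // Per the #5981 note above, un-retryable RPC failures count as passed so
      // a GC'd transaction is not re-verified forever.
      if (entry_exists || !rpc_success) {
        actions.clear_txn_metadata();
      } else {
        actions.enqueue_delete_table();
      }
    }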
Result<TabletInfos> CatalogManager::CreateTabletsFromTable(const vector<Partition>& partitions,
3790
42.8k
                                                           const TableInfoPtr& table) {
3791
42.8k
  TabletInfos tablets;
3792
  // Create the TabletInfo objects in state PREPARING.
3793
83.0k
  for (const Partition& partition : partitions) {
3794
83.0k
    PartitionPB partition_pb;
3795
83.0k
    partition.ToPB(&partition_pb);
3796
83.0k
    tablets.push_back(CreateTabletInfo(table.get(), partition_pb));
3797
83.0k
  }
3798
3799
  // Add the table/tablets to the in-memory map for the assignment.
3800
42.8k
  table->AddTablets(tablets);
3801
42.8k
  auto tablet_map_checkout = tablet_map_.CheckOut();
3802
83.0k
  for (const TabletInfoPtr& tablet : tablets) {
3803
83.0k
    InsertOrDie(tablet_map_checkout.get_ptr(), tablet->tablet_id(), tablet);
3804
83.0k
  }
3805
3806
42.8k
  return tablets;
3807
42.8k
}
3808
3809
Status CatalogManager::CheckValidPlacementInfo(const PlacementInfoPB& placement_info,
3810
                                               const TSDescriptorVector& ts_descs,
3811
56.8k
                                               ValidateReplicationInfoResponsePB* resp) {
3812
56.8k
  size_t num_live_tservers = ts_descs.size();
3813
56.8k
  size_t num_replicas = GetNumReplicasFromPlacementInfo(placement_info);
3814
56.8k
  Status s;
3815
56.8k
  string msg;
3816
3817
  // Verify that the number of replicas isn't larger than the required number of live tservers.
3818
  // To ensure quorum, we need n/2 + 1 live tservers.
3819
56.8k
  size_t replica_quorum_needed = num_replicas / 2 + 1;
3820
56.8k
  if (FLAGS_catalog_manager_check_ts_count_for_create_table &&
3821
56.8k
      
replica_quorum_needed > num_live_tservers56.5k
) {
3822
2
    msg = Substitute("Not enough live tablet servers to create table with replication factor $0. "
3823
2
                     "Need at least $1 tablet servers whereas $2 are alive.",
3824
2
                     num_replicas, replica_quorum_needed, num_live_tservers);
3825
2
    LOG(WARNING) << msg
3826
2
                 << ". Placement info: " << placement_info.ShortDebugString()
3827
2
                 << ", replication factor flag: " << FLAGS_replication_factor;
3828
2
    s = STATUS(InvalidArgument, msg);
3829
2
    return SetupError(resp->mutable_error(), MasterErrorPB::REPLICATION_FACTOR_TOO_HIGH, s);
3830
2
  }
3831
3832
  // Verify that placement requests are reasonable.
3833
56.8k
  if (!placement_info.placement_blocks().empty()) {
3834
874
    size_t minimum_sum = 0;
3835
1.19k
    for (const auto& pb : placement_info.placement_blocks()) {
3836
1.19k
      minimum_sum += pb.min_num_replicas();
3837
1.19k
      if (!pb.has_cloud_info()) {
3838
1
        msg = Substitute("Got placement info without cloud info set: $0", pb.ShortDebugString());
3839
1
        s = STATUS(InvalidArgument, msg);
3840
1
        LOG(WARNING) << msg;
3841
1
        return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
3842
1
      }
3843
1.19k
    }
3844
    // Total replicas requested should be at least the sum of minimums
3845
    // requested in individual placement blocks.
3846
873
    if (minimum_sum > num_replicas) {
3847
1
      msg = Substitute("Sum of minimum replicas per placement ($0) is greater than num_replicas "
3848
1
                       " ($1)", minimum_sum, num_replicas);
3849
1
      s = STATUS(InvalidArgument, msg);
3850
1
      LOG(WARNING) << msg;
3851
1
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
3852
1
    }
3853
3854
    // Verify that there are enough TServers in the requested placements
3855
    // to match the total required replication factor.
3856
872
    auto allowed_ts = VERIFY_RESULT(FindTServersForPlacementInfo(placement_info, ts_descs));
3857
3858
    // Fail if we don't have enough tablet servers in the areas requested.
3859
    // We need n/2 + 1 for quorum.
3860
872
    if (allowed_ts.size() < replica_quorum_needed) {
3861
29
      msg = Substitute("Not enough tablet servers in the requested placements. "
3862
29
                        "Need at least $0, have $1",
3863
29
                        replica_quorum_needed, allowed_ts.size());
3864
29
      s = STATUS(InvalidArgument, msg);
3865
29
      LOG(WARNING) << msg;
3866
29
      return SetupError(resp->mutable_error(), MasterErrorPB::REPLICATION_FACTOR_TOO_HIGH, s);
3867
29
    }
3868
3869
    // Try allocating tservers for the replicas and see if we can place a quorum
3870
    // number of replicas.
3871
    // Essentially, the logic is:
3872
    // 1. We satisfy whatever we can from the minimums.
3873
    // 2. We then satisfy whatever we can from the slack.
3874
    //    Here it doesn't matter where we put the slack replicas as long as
3875
    //    the tservers are chosen from any of the valid placement blocks.
3876
    // Overall, if in this process we are able to place n/2 + 1 replicas
3877
    // then we succeed otherwise we fail.
3878
843
    size_t total_extra_replicas = num_replicas - minimum_sum;
3879
843
    size_t total_feasible_replicas = 0;
3880
843
    size_t total_extra_servers = 0;
3881
1.14k
    for (const auto& pb : placement_info.placement_blocks()) {
3882
1.14k
      auto allowed_ts = VERIFY_RESULT(FindTServersForPlacementBlock(pb, ts_descs));
3883
0
      size_t allowed_ts_size = allowed_ts.size();
3884
1.14k
      size_t min_num_replicas = pb.min_num_replicas();
3885
      // For every placement block, we can only satisfy up to the number of
3886
      // tservers present in that particular placement block.
3887
1.14k
      total_feasible_replicas += min(allowed_ts_size, min_num_replicas);
3888
      // Extra tablet servers beyond min_num_replicas will be used to place
3889
      // the extra replicas over and above the minimums.
3890
1.14k
      if (allowed_ts_size > min_num_replicas) {
3891
377
        total_extra_servers += allowed_ts_size - min_num_replicas;
3892
377
      }
3893
1.14k
    }
3894
    // The total number of extra replicas that we can put cannot be more than
3895
    // the total tablet servers that are extra.
3896
843
    total_feasible_replicas += min(total_extra_replicas, total_extra_servers);
3897
3898
    // If we place the replicas in accordance with above, we should be able to place
3899
    // at least replica_quorum_needed otherwise we fail.
3900
843
    if (total_feasible_replicas < replica_quorum_needed) {
3901
1
      msg = Substitute("Not enough tablet servers in the requested placements. "
3902
1
                        "Can only find $0 tablet servers for the replicas but need at least "
3903
1
                        "$1.", total_feasible_replicas, replica_quorum_needed);
3904
1
      s = STATUS(InvalidArgument, msg);
3905
1
      LOG(WARNING) << msg;
3906
1
      return SetupError(resp->mutable_error(), MasterErrorPB::REPLICATION_FACTOR_TOO_HIGH, s);
3907
1
    }
3908
843
  }
3909
3910
56.8k
  return Status::OK();
3911
56.8k
}
3912
3913
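A worked example of the feasibility pass in CheckValidPlacementInfo: num_replicas = 5 needs a quorum of 5/2 + 1 = 3. With placement blocks A (min 2) and B (min 2), four live tservers in A and one in B, the minimums contribute min(4,2) + min(1,2) = 3 feasible replicas, A leaves 2 spare tservers for the 5 - 4 = 1 slack replica, and 3 + min(1,2) = 4 >= 3, so the request is accepted. The same arithmetic as a standalone sketch, with a hypothetical Block struct in place of PlacementInfoPB:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    struct Block { size_t live_tservers; size_t min_replicas; };

    bool QuorumPlaceable(size_t num_replicas, const std::vector<Block>& blocks) {
      const size_t quorum = num_replicas / 2 + 1;
      size_t minimum_sum = 0, feasible = 0, extra_servers = 0;
      for (const auto& b : blocks) {
        minimum_sum += b.min_replicas;
        // Each block can satisfy at most as many minimums as it has tservers.
        feasible += std::min(b.live_tservers, b.min_replicas);
        if (b.live_tservers > b.min_replicas) {
          extra_servers += b.live_tservers - b.min_replicas;
        }
      }
      if (minimum_sum > num_replicas) {
        return false;  // Rejected earlier as an over-constrained request.
      }
      // Slack replicas can go on any spare tserver in a valid block.
      feasible += std::min(num_replicas - minimum_sum, extra_servers);
      return feasible >= quorum;
    }

    // QuorumPlaceable(5, {{4, 2}, {1, 2}}) == true: 4 placeable >= quorum of 3.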
Status CatalogManager::CreateTableInMemory(const CreateTableRequestPB& req,
3914
                                           const Schema& schema,
3915
                                           const PartitionSchema& partition_schema,
3916
                                           const NamespaceId& namespace_id,
3917
                                           const NamespaceName& namespace_name,
3918
                                           const std::vector<Partition>& partitions,
3919
                                           IndexInfoPB* index_info,
3920
                                           TabletInfos* tablets,
3921
                                           CreateTableResponsePB* resp,
3922
56.3k
                                           scoped_refptr<TableInfo>* table) {
3923
  // Add the new table in "preparing" state.
3924
56.3k
  *table = CreateTableInfo(req, schema, partition_schema, namespace_id, namespace_name, index_info);
3925
56.3k
  const TableId& table_id = (*table)->id();
3926
3927
56.3k
  VLOG_WITH_PREFIX_AND_FUNC(2)
3928
0
      << "Table: " << (**table).ToString() << ", create_tablets: " << (tablets ? "YES" : "NO");
3929
3930
56.3k
  auto table_ids_map_checkout = table_ids_map_.CheckOut();
3931
56.3k
  (*table_ids_map_checkout)[table_id] = *table;
3932
  // Do not add Postgres tables to the name map as the table name is not unique in a namespace.
3933
56.3k
  if (req.table_type() != PGSQL_TABLE_TYPE) {
3934
38.1k
    table_names_map_[{namespace_id, req.name()}] = *table;
3935
38.1k
  }
3936
3937
56.3k
  if (req.table_type() == TRANSACTION_STATUS_TABLE_TYPE) {
3938
1.09k
    transaction_table_ids_set_.insert(table_id);
3939
1.09k
  }
3940
3941
56.3k
  if (tablets) {
3942
42.8k
    *tablets = VERIFY_RESULT(CreateTabletsFromTable(partitions, *table));
3943
42.8k
  }
3944
3945
56.3k
  if (resp != nullptr) {
3946
21.8k
    resp->set_table_id(table_id);
3947
21.8k
  }
3948
3949
56.3k
  HandleNewTableId(table_id);
3950
3951
56.3k
  return Status::OK();
3952
56.3k
}
3953
3954
Result<bool> CatalogManager::TableExists(
3955
6.64k
    const std::string& namespace_name, const std::string& table_name) const {
3956
6.64k
  TableIdentifierPB table_id_pb;
3957
6.64k
  table_id_pb.set_table_name(table_name);
3958
6.64k
  table_id_pb.mutable_namespace_()->set_name(namespace_name);
3959
6.64k
  return DoesTableExist(FindTable(table_id_pb));
3960
6.64k
}
3961
3962
CHECKED_STATUS CatalogManager::CreateTransactionStatusTable(
3963
    const CreateTransactionStatusTableRequestPB* req, CreateTransactionStatusTableResponsePB* resp,
3964
0
    rpc::RpcContext *rpc) {
3965
0
  const string& table_name = req->table_name();
3966
0
  Status s = CreateTransactionStatusTableInternal(rpc, table_name, nullptr /* tablespace_id */);
3967
0
  if (s.IsAlreadyPresent()) {
3968
0
    return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s);
3969
0
  }
3970
0
  if (!s.ok()) {
3971
0
    return SetupError(resp->mutable_error(), MasterErrorPB::INTERNAL_ERROR, s);
3972
0
  }
3973
0
  return Status::OK();
3974
0
}
3975
3976
CHECKED_STATUS CatalogManager::CreateTransactionStatusTableInternal(
3977
6.33k
    rpc::RpcContext *rpc, const string& table_name, const TablespaceId* tablespace_id) {
3978
6.33k
  if (VERIFY_RESULT(TableExists(kSystemNamespaceName, table_name))) {
3979
5.23k
    return STATUS_SUBSTITUTE(AlreadyPresent, "Table already exists: $0", table_name);
3980
5.23k
  }
3981
3982
1.10k
  LOG(INFO) << "Creating transaction status table " << table_name;
3983
  // Set up a CreateTable request internally.
3984
1.10k
  CreateTableRequestPB req;
3985
1.10k
  CreateTableResponsePB resp;
3986
1.10k
  req.set_name(table_name);
3987
1.10k
  req.mutable_namespace_()->set_name(kSystemNamespaceName);
3988
1.10k
  req.set_table_type(TableType::TRANSACTION_STATUS_TABLE_TYPE);
3989
1.10k
  if (tablespace_id) {
3990
28
    req.set_tablespace_id(*tablespace_id);
3991
28
  }
3992
3993
  // Explicitly set the number tablets if the corresponding flag is set, otherwise CreateTable
3994
  // will use the same defaults as for regular tables.
3995
1.10k
  int num_tablets;
3996
1.10k
  if (FLAGS_transaction_table_num_tablets > 0) {
3997
349
    num_tablets = FLAGS_transaction_table_num_tablets;
3998
756
  } else {
3999
756
    auto placement_uuid =
4000
756
        ClusterConfig()->LockForRead()->pb.replication_info().live_replicas().placement_uuid();
4001
756
    num_tablets = narrow_cast<int>(GetNumLiveTServersForPlacement(placement_uuid) *
4002
756
                                   FLAGS_transaction_table_num_tablets_per_tserver);
4003
756
  }
4004
1.10k
  req.mutable_schema()->mutable_table_properties()->set_num_tablets(num_tablets);
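  // Worked example (numbers are illustrative, not taken from this build): with
  // FLAGS_transaction_table_num_tablets left at 0, 3 live tservers in the placement,
  // and FLAGS_transaction_table_num_tablets_per_tserver = 4, the branch above yields
  // num_tablets = 3 * 4 = 12; setting FLAGS_transaction_table_num_tablets = 8 skips
  // the per-tserver scaling and creates exactly 8 tablets.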
4005
4006
1.10k
  ColumnSchema hash(kRedisKeyColumnName, BINARY, /* is_nullable */ false, /* is_hash_key */ true);
4007
1.10k
  ColumnSchemaToPB(hash, req.mutable_schema()->mutable_columns()->Add());
4008
4009
1.10k
  Status s = CreateTable(&req, &resp, rpc);
4010
  // We do not lock here so it is technically possible that the table was already created.
4011
  // If so, there is nothing to do so we just ignore the "AlreadyPresent" error.
4012
1.10k
  if (!s.ok() && !s.IsAlreadyPresent()) {
4013
8
    return s;
4014
8
  }
4015
4016
1.09k
  return Status::OK();
4017
1.10k
}
4018
4019
330
bool CatalogManager::DoesTransactionTableExistForTablespace(const TablespaceId& tablespace_id) {
4020
330
  SharedLock lock(mutex_);
4021
553
  for (const auto& table_id : transaction_table_ids_set_) {
4022
553
    auto table = table_ids_map_->find(table_id);
4023
553
    if (table == table_ids_map_->end()) {
4024
0
      LOG(DFATAL) << "Table uuid " << table_id
4025
0
                  << " in transaction_table_ids_set_ but not in table_ids_map_";
4026
0
      continue;
4027
0
    }
4028
553
    auto this_tablespace_id = GetTransactionStatusTableTablespace(table->second);
4029
553
    if (this_tablespace_id && *this_tablespace_id == tablespace_id) {
4030
302
      return true;
4031
302
    }
4032
553
  }
4033
28
  return false;
4034
330
}
4035
4036
CHECKED_STATUS CatalogManager::CreateLocalTransactionStatusTableIfNeeded(
4037
330
    rpc::RpcContext *rpc, const TablespaceId& tablespace_id) {
4038
330
  std::lock_guard<std::mutex> lock(tablespace_transaction_table_creation_mutex_);
4039
4040
330
  if (DoesTransactionTableExistForTablespace(tablespace_id)) {
4041
302
    VLOG(1) << "Transaction status table already exists, not creating.";
4042
302
    return Status::OK();
4043
302
  }
4044
4045
28
  std::string table_name;
4046
28
  if (FLAGS_TEST_name_transaction_tables_with_tablespace_id) {
4047
12
    uint32_t tablespace_oid = VERIFY_RESULT(GetPgsqlTablespaceOid(tablespace_id));
4048
0
    table_name = kTransactionTablePrefix + std::to_string(tablespace_oid);
4049
16
  } else {
4050
16
    std::string uuid;
4051
16
    RETURN_NOT_OK(yb::Uuid::Generate().ToString(&uuid));
4052
16
    table_name = kTransactionTablePrefix + uuid;
4053
16
  }
4054
4055
28
  return CreateTransactionStatusTableInternal(rpc, table_name, &tablespace_id);
4056
28
}
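// Naming, illustrated (assuming kTransactionTablePrefix expands to "transactions_"):
// with FLAGS_TEST_name_transaction_tables_with_tablespace_id set, a tablespace whose
// OID is 16385 would get the table name "transactions_16385"; otherwise a random
// UUID suffix is used, e.g. "transactions_57f3e257-...", which is unique with
// overwhelming probability across repeated create attempts.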
4057
4058
6.31k
CHECKED_STATUS CatalogManager::CreateGlobalTransactionStatusTableIfNeeded(rpc::RpcContext *rpc) {
4059
6.31k
  Status s = CreateTransactionStatusTableInternal(
4060
6.31k
      rpc, kGlobalTransactionsTableName, nullptr /* tablespace_id */);
4061
6.31k
  if (s.IsAlreadyPresent()) {
4062
5.23k
    VLOG(1) << "Transaction status table already exists, not creating.";
4063
5.23k
    return Status::OK();
4064
5.23k
  }
4065
1.07k
  return s;
4066
6.31k
}
4067
4068
6.71k
Result<TableInfoPtr> CatalogManager::GetGlobalTransactionStatusTable() {
4069
6.71k
  TableIdentifierPB global_txn_table_identifier;
4070
6.71k
  global_txn_table_identifier.set_table_name(kGlobalTransactionsTableName);
4071
6.71k
  global_txn_table_identifier.mutable_namespace_()->set_name(kSystemNamespaceName);
4072
6.71k
  return FindTable(global_txn_table_identifier);
4073
6.71k
}
4074
4075
CHECKED_STATUS CatalogManager::GetGlobalTransactionStatusTablets(
4076
3.25k
    GetTransactionStatusTabletsResponsePB* resp) {
4077
3.25k
  auto global_txn_table = VERIFY_RESULT(GetGlobalTransactionStatusTable());
4078
4079
0
  auto l = global_txn_table->LockForRead();
4080
3.25k
  RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp));
4081
4082
61.8k
  for (const auto& tablet : global_txn_table->GetTablets()) {
4083
61.8k
    TabletLocationsPB locs_pb;
4084
61.8k
    RETURN_NOT_OK(BuildLocationsForTablet(tablet, &locs_pb));
4085
61.8k
    resp->add_global_tablet_id(tablet->tablet_id());
4086
61.8k
  }
4087
4088
3.25k
  return Status::OK();
4089
3.25k
}
4090
4091
Result<std::vector<TableInfoPtr>> CatalogManager::GetPlacementLocalTransactionStatusTables(
4092
3.38k
    const CloudInfoPB& placement) {
4093
3.38k
  std::vector<TableInfoPtr> same_placement_transaction_tables;
4094
3.38k
  auto tablespace_manager = GetTablespaceManager();
4095
4096
3.38k
  SharedLock lock(mutex_);
4097
3.75k
  for (const auto& table_id : transaction_table_ids_set_) {
4098
3.75k
    auto table = table_ids_map_->find(table_id);
4099
3.75k
    if (table == table_ids_map_->end()) {
4100
0
      LOG(DFATAL) << "Table uuid " << table_id
4101
0
                  << " in transaction_table_ids_set_ but not in table_ids_map_";
4102
0
      continue;
4103
0
    }
4104
    // system.transactions is filtered out because it cannot have a placement set.
4105
3.75k
    auto table_info = table->second;
4106
3.75k
    auto lock = table_info->LockForRead();
4107
3.75k
    auto tablespace_id = GetTransactionStatusTableTablespace(table_info);
4108
3.75k
    auto cloud_info = lock->pb.replication_info();
4109
3.81k
    if (!IsReplicationInfoSet(cloud_info)) {
4110
3.81k
      if (tablespace_id) {
4111
245
        const auto result = tablespace_manager->GetTablespaceReplicationInfo(*tablespace_id);
4112
245
        if (!result.ok() || !*result || !IsReplicationInfoSet(**result)) {
4113
60
          continue;
4114
60
        }
4115
185
        cloud_info = **result;
4116
185
      }
4117
3.81k
    }
4118
3.69k
    const auto& txn_table_replicas = cloud_info.live_replicas();
4119
    // Skip transaction tables spanning multiple regions, since using them will incur global
4120
    // latencies. See #11268.
4121
3.69k
    if (CatalogManagerUtil::DoesPlacementInfoSpanMultipleRegions(txn_table_replicas)) {
4122
9
      continue;
4123
9
    }
4124
3.68k
    if (CatalogManagerUtil::DoesPlacementInfoContainCloudInfo(txn_table_replicas, placement)) {
4125
161
      same_placement_transaction_tables.push_back(table_info);
4126
161
    }
4127
3.68k
  }
4128
4129
3.38k
  return same_placement_transaction_tables;
4130
3.38k
}
4131
4132
CHECKED_STATUS CatalogManager::GetPlacementLocalTransactionStatusTablets(
4133
    const std::vector<TableInfoPtr>& placement_local_tables,
4134
3.31k
    GetTransactionStatusTabletsResponsePB* resp) {
4135
3.31k
  if (placement_local_tables.empty()) {
4136
3.22k
    return Status::OK();
4137
3.22k
  }
4138
4139
86
  SharedLock lock(mutex_);
4140
86
  for (const auto& table_info : placement_local_tables) {
4141
86
    auto lock = table_info->LockForRead();
4142
2.49k
    for (const auto& tablet : table_info->GetTablets()) {
4143
2.49k
      TabletLocationsPB locs_pb;
4144
2.49k
      RETURN_NOT_OK(BuildLocationsForTablet(tablet, &locs_pb));
4145
2.49k
      resp->add_placement_local_tablet_id(tablet->tablet_id());
4146
2.49k
    }
4147
86
  }
4148
4149
86
  return Status::OK();
4150
86
}
4151
4152
CHECKED_STATUS CatalogManager::GetTransactionStatusTablets(
4153
    const GetTransactionStatusTabletsRequestPB* req,
4154
    GetTransactionStatusTabletsResponsePB* resp,
4155
3.36k
    rpc::RpcContext *rpc) {
4156
3.44k
  for (;;) {
4157
3.44k
    SCOPED_LEADER_SHARED_LOCK(lock, this);
4158
3.44k
    auto global_txn_table = VERIFY_RESULT(GetGlobalTransactionStatusTable());
4159
3.44k
    if (!VERIFY_RESULT(IsCreateTableDone(global_txn_table))) {
4160
61
      lock.Unlock();
4161
61
      RETURN_NOT_OK(WaitForCreateTableToFinish(global_txn_table->id(), rpc->GetClientDeadline()));
4162
61
      continue;
4163
61
    }
4164
4165
3.38k
    std::vector<TableInfoPtr> local_tables;
4166
3.38k
    if (req->has_placement()) {
4167
3.37k
      local_tables = VERIFY_RESULT(GetPlacementLocalTransactionStatusTables(req->placement()));
4168
0
      bool need_restart = false;
4169
3.37k
      for (const auto& table : local_tables) {
4170
161
        if (!VERIFY_RESULT(IsCreateTableDone(table))) {
4171
75
          if (!need_restart) {
4172
75
            need_restart = true;
4173
75
            lock.Unlock();
4174
75
          }
4175
75
          RETURN_NOT_OK(WaitForCreateTableToFinish(table->id(), rpc->GetClientDeadline()));
4176
75
        }
4177
161
      }
4178
3.37k
      if (need_restart) {
4179
75
        continue;
4180
75
      }
4181
3.37k
    }
4182
4183
3.31k
    RETURN_NOT_OK(GetGlobalTransactionStatusTablets(resp));
4184
3.31k
    RETURN_NOT_OK(GetPlacementLocalTransactionStatusTablets(local_tables, resp));
4185
4186
3.31k
    return Status::OK();
4187
3.31k
  }
4188
3.36k
}
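// Note the shape of the retry loop above: WaitForCreateTableToFinish() can block
// until the client deadline, so the leader shared lock is released before any wait
// and the loop restarts from scratch, re-validating all state on the next pass
// instead of trusting anything read before the wait.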
4189
4190
1
Status CatalogManager::CreateMetricsSnapshotsTableIfNeeded(rpc::RpcContext *rpc) {
4191
1
  if (VERIFY_RESULT(TableExists(kSystemNamespaceName, kMetricsSnapshotsTableName))) {
4192
0
    return Status::OK();
4193
0
  }
4194
4195
  // Set up a CreateTable request internally.
4196
1
  CreateTableRequestPB req;
4197
1
  CreateTableResponsePB resp;
4198
1
  req.set_name(kMetricsSnapshotsTableName);
4199
1
  req.mutable_namespace_()->set_name(kSystemNamespaceName);
4200
1
  req.set_table_type(TableType::YQL_TABLE_TYPE);
4201
4202
  // Explicitly set the number of tablets if the corresponding flag is set, otherwise CreateTable
4203
  // will use the same defaults as for regular tables.
4204
1
  if (FLAGS_metrics_snapshots_table_num_tablets > 0) {
4205
0
    req.mutable_schema()->mutable_table_properties()->set_num_tablets(
4206
0
        FLAGS_metrics_snapshots_table_num_tablets);
4207
0
  }
4208
4209
  // Schema description: "node" refers to tserver uuid. "entity_type" can be either
4210
  // "tserver" or "table". "entity_id" is uuid of corresponding tserver or table.
4211
  // "metric" is the name of the metric and "value" is its val. "ts" is time at
4212
  // which the snapshot was recorded. "details" is a json column for future extensibility.
4213
4214
1
  YBSchemaBuilder schemaBuilder;
4215
1
  schemaBuilder.AddColumn("node")->Type(STRING)->HashPrimaryKey()->NotNull();
4216
1
  schemaBuilder.AddColumn("entity_type")->Type(STRING)->PrimaryKey()->NotNull();
4217
1
  schemaBuilder.AddColumn("entity_id")->Type(STRING)->PrimaryKey()->NotNull();
4218
1
  schemaBuilder.AddColumn("metric")->Type(STRING)->PrimaryKey()->NotNull();
4219
1
  schemaBuilder.AddColumn("ts")->Type(TIMESTAMP)->PrimaryKey()->NotNull()->
4220
1
    SetSortingType(SortingType::kDescending);
4221
1
  schemaBuilder.AddColumn("value")->Type(INT64);
4222
1
  schemaBuilder.AddColumn("details")->Type(JSONB);
4223
4224
1
  YBSchema ybschema;
4225
1
  CHECK_OK(schemaBuilder.Build(&ybschema));
4226
4227
1
  auto schema = yb::client::internal::GetSchema(ybschema);
4228
1
  SchemaToPB(schema, req.mutable_schema());
4229
4230
1
  Status s = CreateTable(&req, &resp, rpc);
4231
  // We do not lock here so it is technically possible that the table was already created.
4232
  // If so, there is nothing to do so we just ignore the "AlreadyPresent" error.
4233
1
  if (s.IsAlreadyPresent()) {
4234
0
    return Status::OK();
4235
0
  }
4236
1
  return s;
4237
1
}
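// For reference, the schema assembled above corresponds roughly to this YCQL DDL
// (a sketch only; the table is created internally, not via CQL, and the real name
// comes from kMetricsSnapshotsTableName):
//   CREATE TABLE system.metrics (
//     node text, entity_type text, entity_id text, metric text, ts timestamp,
//     value bigint, details jsonb,
//     PRIMARY KEY ((node), entity_type, entity_id, metric, ts)
//   ) WITH CLUSTERING ORDER BY (ts DESC);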
4238
4239
40.3k
Result<bool> CatalogManager::IsCreateTableDone(const TableInfoPtr& table) {
4240
40.3k
  TRACE("Locking table");
4241
40.3k
  auto l = table->LockForRead();
4242
40.3k
  RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l));
4243
40.2k
  const auto& pb = l->pb;
4244
4245
  // 2. Verify if the create is in-progress.
4246
40.2k
  TRACE("Verify if the table creation is in progress for $0", table->ToString());
4247
40.2k
  auto result = !table->IsCreateInProgress();
4248
4249
  // 3. Set any current errors, if we are experiencing issues creating the table. This will be
4250
  // bubbled up to the MasterService layer. If it is an error, it gets wrapped around in
4251
  // MasterErrorPB::UNKNOWN_ERROR.
4252
40.2k
  RETURN_NOT_OK(table->GetCreateTableErrorStatus());
4253
4254
  // 4. If this is an index, we are not done until the index is in the indexed table's schema.  An
4255
  // exception is YSQL system table indexes, which don't get added to their indexed tables' schemas.
4256
40.2k
  if (result && IsIndex(pb)) {
4257
1.43k
    auto& indexed_table_id = GetIndexedTableId(pb);
4258
    // For user indexes (which add index info to indexed table's schema),
4259
    // - if this index is created without backfill,
4260
    //   - waiting for the index to be in the indexed table's schema is sufficient, and, by that
4261
    //     point, things are fully created.
4262
    // - if this index is created with backfill
4263
    //   - and it's YCQL,
4264
    //     - waiting for the index to be in the indexed table's schema means waiting for the
4265
    //       DELETE_ONLY index permission, and it's fine to return to the client before the index
4266
    //       gets the rest of the permissions because the expectation is that backfill will be
4267
    //       completed asynchronously.
4268
    //   - and it's YSQL,
4269
    //     - waiting for the index to be in the indexed table's schema means just that (DocDB index
4270
    //       permissions don't really matter for YSQL besides being used for backfill purposes), and
4271
    //       it's a signal for postgres to continue the index backfill process, activating index
4272
    //       state flags then later triggering backfill and so on.
4273
    // For YSQL system indexes (which don't add index info to indexed table's schema),
4274
    // - there's nothing additional to wait on.
4275
    // Therefore, the only thing needed here is to check whether the index info is in the indexed
4276
    // table's schema for user indexes.
4277
1.43k
    if (pb.table_type() == YQL_TABLE_TYPE ||
4278
1.43k
        (pb.table_type() == PGSQL_TABLE_TYPE && IsUserCreatedTable(*table))) {
4279
1.31k
      GetTableSchemaRequestPB get_schema_req;
4280
1.31k
      GetTableSchemaResponsePB get_schema_resp;
4281
1.31k
      get_schema_req.mutable_table()->set_table_id(indexed_table_id);
4282
1.31k
      const bool get_fully_applied_indexes = true;
4283
1.31k
      RETURN_NOT_OK(GetTableSchemaInternal(
4284
1.31k
          &get_schema_req, &get_schema_resp, get_fully_applied_indexes));
4285
4286
1.31k
      result = false;
4287
2.22k
      for (const auto& index : get_schema_resp.indexes()) {
4288
2.22k
        if (index.has_table_id() && index.table_id() == table->id()) {
4289
1.19k
          result = true;
4290
1.19k
          break;
4291
1.19k
        }
4292
2.22k
      }
4293
1.31k
    }
4294
1.43k
  }
4295
4296
  // Sanity check that this table is present in system.partitions if it is a YCQL table.
4297
  // Only check if we are automatically generating the vtable on changes. If we are creating via
4298
  // the bg task, then there may be a delay.
4299
40.2k
  if (DCHECK_IS_ON() &&
4300
40.2k
      result &&
4301
40.2k
      IsYcqlTable(*table) &&
4302
40.2k
      YQLPartitionsVTable::GeneratePartitionsVTableOnChanges() &&
4303
40.2k
      FLAGS_TEST_catalog_manager_check_yql_partitions_exist_for_is_create_table_done) {
4304
2.01k
    Schema schema;
4305
2.01k
    RETURN_NOT_OK(table->GetSchema(&schema));
4306
    // Copartitioned tables don't actually create tablets currently (unimplemented), so ignore them.
4307
2.01k
    if (!schema.table_properties().HasCopartitionTableId()) {
4308
2.01k
      DCHECK(GetYqlPartitionsVtable().CheckTableIsPresent(table->id(), table->NumPartitions()));
4309
2.01k
    }
4310
2.01k
  }
4311
4312
  // If this is a transactional table we are not done until the transaction status table is created.
4313
  // However, if we are currently initializing the system catalog snapshot, we don't create the
4314
  // transactions table.
4315
40.2k
  if (!FLAGS_create_initial_sys_catalog_snapshot &&
4316
40.2k
      result && pb.schema().table_properties().is_transactional()) {
4317
5.59k
    result = VERIFY_RESULT(IsTransactionStatusTableCreated());
4318
5.59k
  }
4319
4320
  // We are not done until the metrics snapshots table is created.
4321
40.2k
  if (FLAGS_master_enable_metrics_snapshotter && result &&
4322
40.2k
      !(table->GetTableType() == TableType::YQL_TABLE_TYPE &&
4323
0
        table->namespace_id() == kSystemNamespaceId &&
4324
0
        table->name() == kMetricsSnapshotsTableName)) {
4325
0
    result = VERIFY_RESULT(IsMetricsSnapshotsTableCreated());
4326
0
  }
4327
4328
  // If this is a colocated table and there is a pending AddTableToTablet task then we are not done.
4329
40.2k
  if (result && pb.colocated()) {
4330
294
    result = !table->HasTasks(MonitoredTask::Type::ASYNC_ADD_TABLE_TO_TABLET);
4331
294
  }
4332
4333
40.2k
  return result;
4334
40.2k
}
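// In summary, IsCreateTableDone() reports true only once every applicable condition
// holds: no tablet creation in progress, index info visible in the indexed table's
// schema (user indexes), the transaction status table created (transactional
// tables), the metrics snapshots table created (when the snapshotter is enabled),
// and no pending AddTableToTablet task (colocated tables).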
4335
4336
Status CatalogManager::IsCreateTableDone(const IsCreateTableDoneRequestPB* req,
4337
31.1k
                                         IsCreateTableDoneResponsePB* resp) {
4338
31.1k
  TRACE("Looking up table");
4339
  // 1. Lookup the table and verify if it exists.
4340
31.1k
  scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table()));
4341
31.0k
  resp->set_done(VERIFY_RESULT(IsCreateTableDone(table)));
4342
0
  return Status::OK();
4343
31.1k
}
4344
4345
Status CatalogManager::IsCreateTableInProgress(const TableId& table_id,
4346
                                               CoarseTimePoint deadline,
4347
1.33k
                                               bool* create_in_progress) {
4348
1.33k
  DCHECK_ONLY_NOTNULL(create_in_progress);
4349
1.33k
  DCHECK(!table_id.empty());
4350
4351
1.33k
  IsCreateTableDoneRequestPB req;
4352
1.33k
  IsCreateTableDoneResponsePB resp;
4353
1.33k
  req.mutable_table()->set_table_id(table_id);
4354
1.33k
  RETURN_NOT_OK(IsCreateTableDone(&req, &resp));
4355
4356
1.33k
  if (resp.has_error()) {
4357
0
    return StatusFromPB(resp.error().status());
4358
0
  }
4359
4360
1.33k
  *create_in_progress = !resp.done();
4361
1.33k
  return Status::OK();
4362
1.33k
}
4363
4364
Status CatalogManager::WaitForCreateTableToFinish(
4365
136
    const TableId& table_id, CoarseTimePoint deadline) {
4366
136
  return client::RetryFunc(
4367
136
      deadline, "Waiting on Create Table to be completed", "Timed out waiting for Table Creation",
4368
136
      std::bind(&CatalogManager::IsCreateTableInProgress, this, table_id, _1, _2));
4369
136
}
4370
4371
5.59k
Result<bool> CatalogManager::IsTransactionStatusTableCreated() {
4372
5.59k
  TableIdentifierPB table_id;
4373
4374
5.59k
  table_id.set_table_name(kGlobalTransactionsTableName);
4375
5.59k
  table_id.mutable_namespace_()->set_name(kSystemNamespaceName);
4376
4377
5.59k
  return IsCreateTableDone(VERIFY_RESULT(FindTable(table_id)));
4378
5.59k
}
4379
4380
0
Result<bool> CatalogManager::IsMetricsSnapshotsTableCreated() {
4381
0
  TableIdentifierPB table_id;
4382
4383
0
  table_id.set_table_name(kMetricsSnapshotsTableName);
4384
0
  table_id.mutable_namespace_()->set_name(kSystemNamespaceName);
4385
0
  table_id.mutable_namespace_()->set_database_type(YQLDatabase::YQL_DATABASE_CQL);
4386
4387
0
  return IsCreateTableDone(VERIFY_RESULT(FindTable(table_id)));
4388
0
}
4389
4390
10
std::string CatalogManager::GenerateId(boost::optional<const SysRowEntryType> entity_type) {
4391
10
  SharedLock lock(mutex_);
4392
10
  return GenerateIdUnlocked(entity_type);
4393
10
}
4394
4395
std::string CatalogManager::GenerateIdUnlocked(
4396
124k
    boost::optional<const SysRowEntryType> entity_type) {
4397
124k
  while (true) {
4398
    // Generate id and make sure it is unique within its category.
4399
124k
    std::string id = GenerateObjectId();
4400
124k
    if (!entity_type) {
4401
10
      return id;
4402
10
    }
4403
124k
    switch (*entity_type) {
4404
2.41k
      case SysRowEntryType::NAMESPACE:
4405
2.41k
        if (FindPtrOrNull(namespace_ids_map_, id) == nullptr) return id;
4406
0
        break;
4407
38.1k
      case SysRowEntryType::TABLE:
4408
38.1k
        if (FindPtrOrNull(*table_ids_map_, id) == nullptr) return id;
4409
0
        break;
4410
83.1k
      case SysRowEntryType::TABLET:
4411
83.1k
        if (FindPtrOrNull(*tablet_map_, id) == nullptr) return id;
4412
0
        break;
4413
46
      case SysRowEntryType::UDTYPE:
4414
46
        if (FindPtrOrNull(udtype_ids_map_, id) == nullptr) return id;
4415
0
        break;
4416
0
      case SysRowEntryType::SNAPSHOT:
4417
0
        return id;
4418
310
      case SysRowEntryType::CDC_STREAM:
4419
310
        if (!CDCStreamExistsUnlocked(id)) return id;
4420
0
        break;
4421
0
      case SysRowEntryType::CLUSTER_CONFIG: FALLTHROUGH_INTENDED;
4422
0
      case SysRowEntryType::ROLE: FALLTHROUGH_INTENDED;
4423
0
      case SysRowEntryType::REDIS_CONFIG: FALLTHROUGH_INTENDED;
4424
0
      case SysRowEntryType::UNIVERSE_REPLICATION: FALLTHROUGH_INTENDED;
4425
0
      case SysRowEntryType::SYS_CONFIG: FALLTHROUGH_INTENDED;
4426
0
      case SysRowEntryType::SNAPSHOT_SCHEDULE: FALLTHROUGH_INTENDED;
4427
0
      case SysRowEntryType::DDL_LOG_ENTRY: FALLTHROUGH_INTENDED;
4428
0
      case SysRowEntryType::UNKNOWN:
4429
0
        LOG(DFATAL) << "Invalid id type: " << *entity_type;
4430
0
        return id;
4431
124k
    }
4432
124k
  }
4433
124k
}
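// The loop above treats GenerateObjectId() as effectively collision-free; the
// per-category map probe is a cheap safety net rather than the uniqueness
// mechanism, so in practice a single iteration suffices.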
4434
4435
scoped_refptr<TableInfo> CatalogManager::CreateTableInfo(const CreateTableRequestPB& req,
4436
                                                         const Schema& schema,
4437
                                                         const PartitionSchema& partition_schema,
4438
                                                         const NamespaceId& namespace_id,
4439
                                                         const NamespaceName& namespace_name,
4440
56.3k
                                                         IndexInfoPB* index_info) {
4441
56.3k
  DCHECK(schema.has_column_ids());
4442
56.3k
  TableId table_id
4443
56.3k
      = !req.table_id().empty() ? req.table_id() : GenerateIdUnlocked(SysRowEntryType::TABLE);
4444
56.3k
  scoped_refptr<TableInfo> table = NewTableInfo(table_id);
4445
56.3k
  if (req.has_tablespace_id()) {
4446
186
    table->SetTablespaceIdForTableCreation(req.tablespace_id());
4447
186
  }
4448
56.3k
  table->mutable_metadata()->StartMutation();
4449
56.3k
  SysTablesEntryPB *metadata = &table->mutable_metadata()->mutable_dirty()->pb;
4450
56.3k
  metadata->set_state(SysTablesEntryPB::PREPARING);
4451
56.3k
  metadata->set_name(req.name());
4452
56.3k
  metadata->set_table_type(req.table_type());
4453
56.3k
  metadata->set_namespace_id(namespace_id);
4454
56.3k
  metadata->set_namespace_name(namespace_name);
4455
56.3k
  metadata->set_version(0);
4456
56.3k
  metadata->set_next_column_id(ColumnId(schema.max_col_id() + 1));
4457
56.3k
  if (req.has_replication_info()) {
4458
1
    metadata->mutable_replication_info()->CopyFrom(req.replication_info());
4459
1
  }
4460
  // Use the Schema object passed in, since it has the column IDs already assigned,
4461
  // whereas the user request PB does not.
4462
56.3k
  SchemaToPB(schema, metadata->mutable_schema());
4463
56.3k
  partition_schema.ToPB(metadata->mutable_partition_schema());
4464
  // For index table, set index details (indexed table id and whether the index is local).
4465
56.3k
  if (req.has_index_info()) {
4466
7.10k
    metadata->mutable_index_info()->CopyFrom(req.index_info());
4467
4468
    // Set the deprecated fields also for compatibility reasons.
4469
7.10k
    metadata->set_indexed_table_id(req.index_info().indexed_table_id());
4470
7.10k
    metadata->set_is_local_index(req.index_info().is_local());
4471
7.10k
    metadata->set_is_unique_index(req.index_info().is_unique());
4472
4473
    // Setup index info.
4474
7.10k
    if (index_info != nullptr) {
4475
1.18k
      index_info->set_table_id(table->id());
4476
1.18k
      metadata->mutable_index_info()->CopyFrom(*index_info);
4477
1.18k
    }
4478
49.2k
  } else if (req.has_indexed_table_id()) {
4479
    // Read data from the deprecated field and update the new fields.
4480
18
    metadata->mutable_index_info()->set_indexed_table_id(req.indexed_table_id());
4481
18
    metadata->mutable_index_info()->set_is_local(req.is_local_index());
4482
18
    metadata->mutable_index_info()->set_is_unique(req.is_unique_index());
4483
4484
    // Set the deprecated fields also for compatibility reasons.
4485
18
    metadata->set_indexed_table_id(req.indexed_table_id());
4486
18
    metadata->set_is_local_index(req.is_local_index());
4487
18
    metadata->set_is_unique_index(req.is_unique_index());
4488
4489
    // Setup index info.
4490
18
    if (index_info != nullptr) {
4491
18
      index_info->set_table_id(table->id());
4492
18
      metadata->mutable_index_info()->CopyFrom(*index_info);
4493
18
    }
4494
18
  }
4495
4496
56.3k
  if (req.is_pg_shared_table()) {
4497
50
    metadata->set_is_pg_shared_table(true);
4498
50
  }
4499
4500
56.3k
  return table;
4501
56.3k
}
4502
4503
TabletInfoPtr CatalogManager::CreateTabletInfo(TableInfo* table,
4504
83.1k
                                               const PartitionPB& partition) {
4505
83.1k
  auto tablet = make_scoped_refptr<TabletInfo>(table, GenerateIdUnlocked(SysRowEntryType::TABLET));
4506
83.1k
  VLOG_WITH_PREFIX_AND_FUNC(2)
4507
0
      << "Table: " << table->ToString() << ", tablet: " << tablet->ToString();
4508
4509
83.1k
  tablet->mutable_metadata()->StartMutation();
4510
83.1k
  SysTabletsEntryPB *metadata = &tablet->mutable_metadata()->mutable_dirty()->pb;
4511
83.1k
  metadata->set_state(SysTabletsEntryPB::PREPARING);
4512
83.1k
  metadata->mutable_partition()->CopyFrom(partition);
4513
83.1k
  metadata->set_table_id(table->id());
4514
  // This is important: we are setting the first table id in the table_ids list
4515
  // to be the id of the original table that creates the tablet.
4516
83.1k
  metadata->add_table_ids(table->id());
4517
83.1k
  return tablet;
4518
83.1k
}
4519
4520
Status CatalogManager::RemoveTableIdsFromTabletInfo(
4521
    TabletInfoPtr tablet_info,
4522
99
    std::unordered_set<TableId> tables_to_remove) {
4523
99
  auto tablet_lock = tablet_info->LockForWrite();
4524
4525
99
  google::protobuf::RepeatedPtrField<std::string> new_table_ids;
4526
54.0k
  for (const auto& table_id : tablet_lock->pb.table_ids()) {
4527
54.0k
    if (tables_to_remove.find(table_id) == tables_to_remove.end()) {
4528
46.0k
      *new_table_ids.Add() = std::move(table_id);
4529
46.0k
    }
4530
54.0k
  }
4531
99
  tablet_lock.mutable_data()->pb.mutable_table_ids()->Swap(&new_table_ids);
4532
4533
99
  RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), tablet_info));
4534
95
  tablet_lock.Commit();
4535
95
  return Status::OK();
4536
99
}
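// Note the persistence pattern used here and throughout this file: mutate the
// copy-on-write data under the write lock, Upsert() into the sys catalog, and only
// then Commit() the in-memory state, so concurrent readers never observe a change
// that failed to reach disk.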
4537
4538
Result<scoped_refptr<TableInfo>> CatalogManager::FindTable(
4539
640k
    const TableIdentifierPB& table_identifier) const {
4540
640k
  SharedLock lock(mutex_);
4541
640k
  return FindTableUnlocked(table_identifier);
4542
640k
}
4543
4544
Result<scoped_refptr<TableInfo>> CatalogManager::FindTableUnlocked(
4545
640k
    const TableIdentifierPB& table_identifier) const {
4546
640k
  if (table_identifier.has_table_id()) {
4547
508k
    return FindTableByIdUnlocked(table_identifier.table_id());
4548
508k
  }
4549
4550
131k
  if (table_identifier.has_table_name()) {
4551
131k
    auto namespace_info = VERIFY_RESULT(FindNamespaceUnlocked(table_identifier.namespace_()));
4552
4553
    // We can't lookup YSQL table by name because Postgres concept of "schemas"
4554
    // introduces ambiguity.
4555
131k
    if (namespace_info->database_type() == YQL_DATABASE_PGSQL) {
4556
0
      return STATUS(InvalidArgument, "Cannot lookup YSQL table by name");
4557
0
    }
4558
4559
131k
    auto it = table_names_map_.find({namespace_info->id(), table_identifier.table_name()});
4560
131k
    if (it == table_names_map_.end()) {
4561
3.57k
      return STATUS_EC_FORMAT(
4562
3.57k
          NotFound, MasterError(MasterErrorPB::OBJECT_NOT_FOUND),
4563
3.57k
          "Table $0.$1 not found", namespace_info->name(), table_identifier.table_name());
4564
3.57k
    }
4565
128k
    return it->second;
4566
131k
  }
4567
4568
34
  return STATUS(InvalidArgument, "Neither table id or table name are specified",
4569
131k
                table_identifier.ShortDebugString());
4570
131k
}
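// Lookup precedence, illustrated: an identifier carrying table_id resolves by id
// even if a name is also present; a name-only identifier additionally requires a
// resolvable namespace and is rejected outright for YSQL databases, where Postgres
// schemas make a bare table name ambiguous within a namespace.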
4571
4572
Result<scoped_refptr<TableInfo>> CatalogManager::FindTableById(
4573
8.24k
    const TableId& table_id) const {
4574
8.24k
  SharedLock lock(mutex_);
4575
8.24k
  return FindTableByIdUnlocked(table_id);
4576
8.24k
}
4577
4578
Result<scoped_refptr<TableInfo>> CatalogManager::FindTableByIdUnlocked(
4579
516k
    const TableId& table_id) const {
4580
516k
  auto it = table_ids_map_->find(table_id);
4581
516k
  if (it == table_ids_map_->end()) {
4582
429
    return STATUS_EC_FORMAT(
4583
429
        NotFound, MasterError(MasterErrorPB::OBJECT_NOT_FOUND),
4584
429
        "Table with identifier $0 not found", table_id);
4585
429
  }
4586
516k
  return it->second;
4587
516k
}
4588
4589
Result<scoped_refptr<NamespaceInfo>> CatalogManager::FindNamespaceById(
4590
678k
    const NamespaceId& id) const {
4591
678k
  SharedLock lock(mutex_);
4592
678k
  return FindNamespaceByIdUnlocked(id);
4593
678k
}
4594
4595
Result<scoped_refptr<NamespaceInfo>> CatalogManager::FindNamespaceByIdUnlocked(
4596
775k
    const NamespaceId& id) const {
4597
775k
  auto it = namespace_ids_map_.find(id);
4598
775k
  if (it == namespace_ids_map_.end()) {
4599
3
    VLOG_WITH_FUNC(4) << "Not found: " << id << "\n" << GetStackTrace();
4600
3
    return STATUS(NotFound, "Keyspace identifier not found", id,
4601
3
                  MasterError(MasterErrorPB::NAMESPACE_NOT_FOUND));
4602
3
  }
4603
775k
  return it->second;
4604
775k
}
4605
4606
Result<scoped_refptr<NamespaceInfo>> CatalogManager::FindNamespaceUnlocked(
4607
236k
    const NamespaceIdentifierPB& ns_identifier) const {
4608
236k
  if (ns_identifier.has_id()) {
4609
96.7k
    return FindNamespaceByIdUnlocked(ns_identifier.id());
4610
96.7k
  }
4611
4612
139k
  if (ns_identifier.has_name()) {
4613
139k
    auto db = GetDatabaseType(ns_identifier);
4614
139k
    auto it = namespace_names_mapper_[db].find(ns_identifier.name());
4615
139k
    if (it == namespace_names_mapper_[db].end()) {
4616
1.91k
      return STATUS(NotFound, "Keyspace name not found", ns_identifier.name(),
4617
1.91k
                    MasterError(MasterErrorPB::NAMESPACE_NOT_FOUND));
4618
1.91k
    }
4619
137k
    return it->second;
4620
139k
  }
4621
4622
26
  LOG(DFATAL) << __func__ << ": " << ns_identifier.ShortDebugString() << ", \n" << GetStackTrace();
4623
26
  return STATUS(NotFound, "Neither keyspace id nor keyspace name is specified",
4624
139k
                ns_identifier.ShortDebugString(), MasterError(MasterErrorPB::NAMESPACE_NOT_FOUND));
4625
139k
}
4626
4627
Result<scoped_refptr<NamespaceInfo>> CatalogManager::FindNamespace(
4628
36.3k
    const NamespaceIdentifierPB& ns_identifier) const {
4629
36.3k
  SharedLock lock(mutex_);
4630
36.3k
  return FindNamespaceUnlocked(ns_identifier);
4631
36.3k
}
4632
4633
Result<TableDescription> CatalogManager::DescribeTable(
4634
0
    const TableIdentifierPB& table_identifier, bool succeed_if_create_in_progress) {
4635
0
  TRACE("Looking up table");
4636
0
  return DescribeTable(VERIFY_RESULT(FindTable(table_identifier)), succeed_if_create_in_progress);
4637
0
}
4638
4639
Result<TableDescription> CatalogManager::DescribeTable(
4640
45
    const TableInfoPtr& table_info, bool succeed_if_create_in_progress) {
4641
45
  TableDescription result;
4642
45
  result.table_info = table_info;
4643
45
  NamespaceId namespace_id;
4644
45
  {
4645
45
    TRACE("Locking table");
4646
45
    auto l = table_info->LockForRead();
4647
4648
45
    if (!succeed_if_create_in_progress && table_info->IsCreateInProgress()) {
4649
0
      return STATUS(IllegalState, "Table creation is in progress", table_info->ToString(),
4650
0
                    MasterError(MasterErrorPB::TABLE_CREATION_IS_IN_PROGRESS));
4651
0
    }
4652
4653
45
    result.tablet_infos = table_info->GetTablets();
4654
4655
45
    namespace_id = table_info->namespace_id();
4656
45
  }
4657
4658
45
  TRACE("Looking up namespace");
4659
45
  result.namespace_info = VERIFY_RESULT(FindNamespaceById(namespace_id));
4660
4661
0
  return result;
4662
45
}
4663
4664
0
Result<string> CatalogManager::GetPgSchemaName(const TableInfoPtr& table_info) {
4665
0
  RSTATUS_DCHECK_EQ(table_info->GetTableType(), PGSQL_TABLE_TYPE, InternalError,
4666
0
      Format("Expected YSQL table, got: $0", table_info->GetTableType()));
4667
4668
0
  const uint32_t database_oid = VERIFY_RESULT(GetPgsqlDatabaseOid(table_info->namespace_id()));
4669
0
  uint32_t table_oid = VERIFY_RESULT(GetPgsqlTableOid(table_info->id()));
4670
0
  {
4671
0
    if (matview_pg_table_ids_map_.find(table_info->id()) != matview_pg_table_ids_map_.end()) {
4672
0
      table_oid = VERIFY_RESULT(GetPgsqlTableOid(matview_pg_table_ids_map_[table_info->id()]));
4673
0
    }
4674
0
  }
4675
0
  const uint32_t relnamespace_oid = VERIFY_RESULT(
4676
0
      sys_catalog_->ReadPgClassRelnamespace(database_oid, table_oid));
4677
0
  return sys_catalog_->ReadPgNamespaceNspname(database_oid, relnamespace_oid);
4678
0
}
4679
4680
// Truncate a Table.
4681
Status CatalogManager::TruncateTable(const TruncateTableRequestPB* req,
4682
                                     TruncateTableResponsePB* resp,
4683
12.6k
                                     rpc::RpcContext* rpc) {
4684
12.6k
  LOG(INFO) << "Servicing TruncateTable request from " << RequestorString(rpc)
4685
12.6k
            << ": " << req->ShortDebugString();
4686
4687
16.4k
  for (int i = 0; i < req->table_ids_size(); i++) {
4688
3.79k
    RETURN_NOT_OK(TruncateTable(req->table_ids(i), resp, rpc));
4689
3.79k
  }
4690
4691
12.6k
  return Status::OK();
4692
12.6k
}
4693
4694
Status CatalogManager::TruncateTable(const TableId& table_id,
4695
                                     TruncateTableResponsePB* resp,
4696
7.17k
                                     rpc::RpcContext* rpc) {
4697
  // Lookup the table and verify if it exists.
4698
7.17k
  TRACE(Substitute("Looking up object by id $0", table_id));
4699
7.17k
  scoped_refptr<TableInfo> table;
4700
7.17k
  {
4701
7.17k
    SharedLock lock(mutex_);
4702
7.17k
    table = FindPtrOrNull(*table_ids_map_, table_id);
4703
7.17k
    if (table == nullptr) {
4704
0
      Status s = STATUS_SUBSTITUTE(NotFound, "The object with id $0 does not exist", table_id);
4705
0
      return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s);
4706
0
    }
4707
7.17k
  }
4708
4709
7.17k
  TRACE(Substitute("Locking object with id $0", table_id));
4710
7.17k
  auto l = table->LockForRead();
4711
7.17k
  RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp));
4712
4713
  // Truncate on a colocated table should not hit master because it should be handled by a write
4714
  // DML that creates a table-level tombstone.
4715
7.17k
  LOG_IF(WARNING, table->IsColocatedUserTable()) << "cannot truncate a colocated table on master";
4716
4717
7.17k
  if (!FLAGS_enable_delete_truncate_xcluster_replicated_table && IsCdcEnabled(*table)) {
4718
0
    return STATUS(NotSupported,
4719
0
                  "Cannot truncate a table in replication.",
4720
0
                  table_id,
4721
0
                  MasterError(MasterErrorPB::INVALID_REQUEST));
4722
0
  }
4723
4724
  // Send a Truncate() request to each tablet in the table.
4725
7.17k
  SendTruncateTableRequest(table);
4726
4727
7.17k
  LOG(INFO) << "Successfully initiated TRUNCATE for " << table->ToString() << " per request from "
4728
7.17k
            << RequestorString(rpc);
4729
7.17k
  background_tasks_->Wake();
4730
4731
  // Truncate indexes also.
4732
  // Note: PG table does not have references to indexes in the base table, so associated indexes
4733
  //       must be truncated from the PG code separately.
4734
7.17k
  const bool is_index = IsIndex(l->pb);
4735
7.17k
  DCHECK(!is_index || l->pb.indexes().empty()) << "indexes should be empty for index table";
4736
7.17k
  for (const auto& index_info : l->pb.indexes()) {
4737
3.38k
    RETURN_NOT_OK(TruncateTable(index_info.table_id(), resp, rpc));
4738
3.38k
  }
4739
4740
7.17k
  return Status::OK();
4741
7.17k
}
4742
4743
7.17k
void CatalogManager::SendTruncateTableRequest(const scoped_refptr<TableInfo>& table) {
4744
57.2k
  for (const auto& tablet : table->GetTablets()) {
4745
57.2k
    SendTruncateTabletRequest(tablet);
4746
57.2k
  }
4747
7.17k
}
4748
4749
57.2k
void CatalogManager::SendTruncateTabletRequest(const scoped_refptr<TabletInfo>& tablet) {
4750
57.2k
  LOG_WITH_PREFIX(INFO) << "Truncating tablet " << tablet->id();
4751
57.2k
  auto call = std::make_shared<AsyncTruncate>(master_, AsyncTaskPool(), tablet);
4752
57.2k
  tablet->table()->AddTask(call);
4753
57.2k
  WARN_NOT_OK(
4754
57.2k
      ScheduleTask(call),
4755
57.2k
      Substitute("Failed to send truncate request for tablet $0", tablet->id()));
4756
57.2k
}
4757
4758
Status CatalogManager::IsTruncateTableDone(const IsTruncateTableDoneRequestPB* req,
4759
10.5k
                                           IsTruncateTableDoneResponsePB* resp) {
4760
10.5k
  LOG(INFO) << "Servicing IsTruncateTableDone request for table id " << req->table_id();
4761
4762
  // Lookup the truncated table.
4763
10.5k
  TRACE("Looking up table $0", req->table_id());
4764
10.5k
  scoped_refptr<TableInfo> table;
4765
10.5k
  {
4766
10.5k
    SharedLock lock(mutex_);
4767
10.5k
    table = FindPtrOrNull(*table_ids_map_, req->table_id());
4768
10.5k
  }
4769
4770
10.5k
  if (table == nullptr) {
4771
0
    Status s = STATUS(NotFound, "The object does not exist: table with id", req->table_id());
4772
0
    return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s);
4773
0
  }
4774
4775
10.5k
  TRACE("Locking table");
4776
10.5k
  RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(table->LockForRead(), resp));
4777
4778
10.5k
  resp->set_done(!table->HasTasks(MonitoredTask::Type::ASYNC_TRUNCATE_TABLET));
4779
10.5k
  return Status::OK();
4780
10.5k
}
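// "Done" here means no ASYNC_TRUNCATE_TABLET task remains registered on the table;
// it does not by itself guarantee that every replica has already applied the
// truncation. Callers are presumably expected to poll this RPC until done is true.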
4781
4782
// Note: only used by YSQL as of 2020-10-29.
4783
Status CatalogManager::BackfillIndex(
4784
    const BackfillIndexRequestPB* req,
4785
    BackfillIndexResponsePB* resp,
4786
540
    rpc::RpcContext* rpc) {
4787
540
  const TableIdentifierPB& index_table_identifier = req->index_identifier();
4788
4789
540
  scoped_refptr<TableInfo> index_table = VERIFY_RESULT(FindTable(index_table_identifier));
4790
4791
540
  if (index_table->GetTableType() != PGSQL_TABLE_TYPE) {
4792
    // This request is only supported for YSQL for now.  YCQL has its own mechanism.
4793
0
    return STATUS(
4794
0
        InvalidArgument,
4795
0
        "Unexpected non-YSQL table",
4796
0
        index_table_identifier.ShortDebugString());
4797
0
  }
4798
4799
  // Collect indexed_table.
4800
540
  scoped_refptr<TableInfo> indexed_table;
4801
540
  {
4802
540
    auto l = index_table->LockForRead();
4803
540
    TableId indexed_table_id = GetIndexedTableId(l->pb);
4804
540
    resp->mutable_table_identifier()->set_table_id(indexed_table_id);
4805
540
    indexed_table = GetTableInfo(indexed_table_id);
4806
540
  }
4807
4808
540
  if (indexed_table == nullptr) {
4809
0
    return STATUS(InvalidArgument, "Empty indexed table",
4810
0
                  index_table_identifier.ShortDebugString());
4811
0
  }
4812
4813
  // TODO(jason): when ready to use INDEX_PERM_DO_BACKFILL for resuming backfill across master
4814
  // leader changes, replace the following (issue #6218).
4815
4816
  // Collect index_info_pb.
4817
540
  IndexInfoPB index_info_pb;
4818
540
  indexed_table->GetIndexInfo(index_table->id()).ToPB(&index_info_pb);
4819
540
  if (index_info_pb.index_permissions() != INDEX_PERM_WRITE_AND_DELETE) {
4820
0
    return SetupError(
4821
0
        resp->mutable_error(),
4822
0
        MasterErrorPB::INVALID_SCHEMA,
4823
0
        STATUS_FORMAT(
4824
0
            InvalidArgument,
4825
0
            "Expected WRITE_AND_DELETE perm, got $0",
4826
0
            IndexPermissions_Name(index_info_pb.index_permissions())));
4827
0
  }
4828
4829
540
  return MultiStageAlterTable::StartBackfillingData(
4830
540
      this, indexed_table, {index_info_pb}, boost::none);
4831
540
}
4832
4833
Status CatalogManager::GetBackfillJobs(
4834
    const GetBackfillJobsRequestPB* req,
4835
    GetBackfillJobsResponsePB* resp,
4836
697
    rpc::RpcContext* rpc) {
4837
697
  TableIdentifierPB table_id = req->table_identifier();
4838
4839
697
  scoped_refptr<TableInfo> indexed_table = VERIFY_RESULT(FindTable(table_id));
4840
697
  if (indexed_table == nullptr) {
4841
0
    Status s = STATUS(NotFound, "Requested table $0 does not exist", table_id.ShortDebugString());
4842
0
    return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s);
4843
0
  }
4844
4845
697
  {
4846
697
    auto l = indexed_table->LockForRead();
4847
697
    resp->mutable_backfill_jobs()->CopyFrom(l->pb.backfill_jobs());
4848
697
  }
4849
697
  return Status::OK();
4850
697
}
4851
4852
Status CatalogManager::LaunchBackfillIndexForTable(
4853
    const LaunchBackfillIndexForTableRequestPB* req,
4854
    LaunchBackfillIndexForTableResponsePB* resp,
4855
1
    rpc::RpcContext* rpc) {
4856
1
  const TableIdentifierPB& table_id = req->table_identifier();
4857
4858
1
  scoped_refptr<TableInfo> indexed_table = VERIFY_RESULT(FindTable(table_id));
4859
1
  if (indexed_table == nullptr) {
4860
0
    Status s = STATUS(NotFound, "Requested table $0 does not exist", table_id.ShortDebugString());
4861
0
    return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s);
4862
0
  }
4863
1
  if (indexed_table->GetTableType() != YQL_TABLE_TYPE) {
4864
    // This request is only supported for YCQL for now.  YSQL has its own mechanism.
4865
0
    return STATUS(InvalidArgument, "Unexpected non-YCQL table $0", table_id.ShortDebugString());
4866
0
  }
4867
4868
1
  uint32_t current_version;
4869
1
  {
4870
1
    auto l = indexed_table->LockForRead();
4871
1
    if (l->pb.state() != SysTablesEntryPB::RUNNING) {
4872
0
      Status s = STATUS(TryAgain,
4873
0
                        "The table is in state $0. An alter may already be in progress.",
4874
0
                        SysTablesEntryPB_State_Name(l->pb.state()));
4875
0
      VLOG(2) << "Table " << indexed_table->ToString() << " is not running returning " << s;
4876
0
      return SetupError(resp->mutable_error(), MasterErrorPB::INTERNAL_ERROR, s);
4877
0
    }
4878
1
    current_version = l->pb.version();
4879
1
  }
4880
4881
0
  auto s = MultiStageAlterTable::LaunchNextTableInfoVersionIfNecessary(
4882
1
      this, indexed_table, current_version, /* respect deferrals for backfill */ false);
4883
1
  if (!s.ok()) {
4884
0
    VLOG(3) << __func__ << " Done failed " << s;
4885
0
    return SetupError(resp->mutable_error(), MasterErrorPB::UNKNOWN_ERROR, s);
4886
0
  }
4887
1
  return Status::OK();
4888
1
}
4889
4890
Status CatalogManager::MarkIndexInfoFromTableForDeletion(
4891
    const TableId& indexed_table_id, const TableId& index_table_id, bool multi_stage,
4892
916
    DeleteTableResponsePB* resp) {
4893
  // Lookup the indexed table and verify if it exists.
4894
916
  scoped_refptr<TableInfo> indexed_table = GetTableInfo(indexed_table_id);
4895
916
  if (indexed_table == nullptr) {
4896
0
    LOG(WARNING) << "Indexed table " << indexed_table_id << " for index "
4897
0
                 << index_table_id << " not found";
4898
0
    return Status::OK();
4899
0
  }
4900
4901
916
  if (resp) {
4902
916
    auto ns_info = VERIFY_RESULT(master_->catalog_manager()->FindNamespaceById(
4903
916
        indexed_table->namespace_id()));
4904
0
    auto* resp_indexed_table = resp->mutable_indexed_table();
4905
916
    resp_indexed_table->mutable_namespace_()->set_name(ns_info->name());
4906
916
    resp_indexed_table->set_table_name(indexed_table->name());
4907
916
    resp_indexed_table->set_table_id(indexed_table_id);
4908
916
  }
4909
916
  if (multi_stage) {
4910
105
    RETURN_NOT_OK(MultiStageAlterTable::UpdateIndexPermission(
4911
105
        this, indexed_table,
4912
105
        {{index_table_id, IndexPermissions::INDEX_PERM_WRITE_AND_DELETE_WHILE_REMOVING}}));
4913
811
  } else {
4914
811
    RETURN_NOT_OK(DeleteIndexInfoFromTable(indexed_table_id, index_table_id));
4915
811
  }
4916
4917
  // Actual Deletion of the index info will happen asynchronously after all the
4918
  // tablets move to the new IndexPermission of DELETE_ONLY_WHILE_REMOVING.
4919
916
  RETURN_NOT_OK(SendAlterTableRequest(indexed_table));
4920
916
  return Status::OK();
4921
916
}
4922
4923
Status CatalogManager::DeleteIndexInfoFromTable(
4924
811
    const TableId& indexed_table_id, const TableId& index_table_id) {
4925
811
  scoped_refptr<TableInfo> indexed_table = GetTableInfo(indexed_table_id);
4926
811
  if (indexed_table == nullptr) {
4927
0
    LOG(WARNING) << "Indexed table " << indexed_table_id << " for index " << index_table_id
4928
0
                 << " not found";
4929
0
    return Status::OK();
4930
0
  }
4931
811
  TRACE("Locking indexed table");
4932
811
  auto l = indexed_table->LockForWrite();
4933
811
  auto &indexed_table_data = *l.mutable_data();
4934
4935
  // Heed issue #6233.
4936
811
  if (!l->pb.has_fully_applied_schema()) {
4937
701
    MultiStageAlterTable::CopySchemaDetailsToFullyApplied(&indexed_table_data.pb);
4938
701
  }
4939
811
  auto *indexes = indexed_table_data.pb.mutable_indexes();
4940
988
  for (int i = 0; i < indexes->size(); i++) {
4941
988
    if (indexes->Get(i).table_id() == index_table_id) {
4942
4943
811
      indexes->DeleteSubrange(i, 1);
4944
4945
811
      indexed_table_data.pb.set_version(indexed_table_data.pb.version() + 1);
4946
      // TODO(Amit) : Is this compatible with the previous version?
4947
811
      indexed_table_data.pb.set_updates_only_index_permissions(false);
4948
811
      indexed_table_data.set_state(
4949
811
          SysTablesEntryPB::ALTERING,
4950
811
          Format("Delete index info version=$0 ts=$1",
4951
811
                 indexed_table_data.pb.version(), LocalTimeAsString()));
4952
4953
      // Update sys-catalog with the deleted indexed table info.
4954
811
      TRACE("Updating indexed table metadata on disk");
4955
811
      RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), indexed_table));
4956
4957
      // Update the in-memory state.
4958
811
      TRACE("Committing in-memory state");
4959
811
      l.Commit();
4960
811
      return Status::OK();
4961
811
    }
4962
988
  }
4963
4964
0
  LOG(WARNING) << "Index " << index_table_id << " not found in indexed table " << indexed_table_id;
4965
0
  return Status::OK();
4966
811
}
4967
4968
Status CatalogManager::DeleteTable(
4969
5.60k
    const DeleteTableRequestPB* req, DeleteTableResponsePB* resp, rpc::RpcContext* rpc) {
4970
5.60k
  LOG(INFO) << "Servicing DeleteTable request from " << RequestorString(rpc) << ": "
4971
5.60k
            << req->ShortDebugString();
4972
4973
5.60k
  scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table()));
4974
0
  bool result = IsCdcEnabled(*table);
4975
5.58k
  if (!FLAGS_enable_delete_truncate_xcluster_replicated_table && result) {
4976
1
    return STATUS(NotSupported,
4977
1
                  "Cannot delete a table in replication.",
4978
1
                  req->ShortDebugString(),
4979
1
                  MasterError(MasterErrorPB::INVALID_REQUEST));
4980
1
  }
4981
4982
5.58k
  if (req->is_index_table()) {
4983
808
    TRACE("Looking up index");
4984
808
    TableId table_id = table->id();
4985
808
    resp->set_table_id(table_id);
4986
808
    TableId indexed_table_id;
4987
808
    {
4988
808
      auto l = table->LockForRead();
4989
808
      indexed_table_id = GetIndexedTableId(l->pb);
4990
808
    }
4991
808
    scoped_refptr<TableInfo> indexed_table = GetTableInfo(indexed_table_id);
4992
808
    const bool is_pg_table = indexed_table != nullptr &&
4993
808
                             indexed_table->GetTableType() == PGSQL_TABLE_TYPE;
4994
808
    bool is_transactional;
4995
808
    {
4996
808
      Schema index_schema;
4997
808
      RETURN_NOT_OK(table->GetSchema(&index_schema));
4998
808
      is_transactional = index_schema.table_properties().is_transactional();
4999
808
    }
5000
0
    const bool index_backfill_enabled =
5001
808
        IsIndexBackfillEnabled(table->GetTableType(), is_transactional);
5002
808
    if (!is_pg_table && index_backfill_enabled) {
5003
105
      return MarkIndexInfoFromTableForDeletion(
5004
105
          indexed_table_id, table_id, /* multi_stage */ true, resp);
5005
105
    }
5006
808
  }
5007
5008
5.48k
  return DeleteTableInternal(req, resp, rpc);
5009
5.58k
}
5010
5011
// Delete a Table
5012
//  - Update the table state to "DELETING".
5013
//  - Issue DeleteTablet tasks to all said tablets.
5014
//  - Update all the underlying tablet states as "DELETED".
5015
//
5016
// This order of events can help us guarantee that:
5017
//  - If a table is DELETING/DELETED, we do not add further tasks to it.
5018
//  - A DeleteTable is done when a table is either DELETING or DELETED and has no running tasks.
5019
//  - If a table is DELETING and it has no tasks on it, then it is safe to mark DELETED.
5020
//
5021
// We are lazy about deletions.
5022
//
5023
// IMPORTANT: If modifying, consider updating DeleteYsqlDBTables(), the bulk deletion API.
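// Sketched as a timeline: mark DELETING (persisted first) -> fan out DeleteTablet
// tasks -> tablets reach DELETED -> a later pass observes "DELETING with no pending
// tasks" and can safely finalize the table as DELETED.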
5024
Status CatalogManager::DeleteTableInternal(
5025
5.60k
    const DeleteTableRequestPB* req, DeleteTableResponsePB* resp, rpc::RpcContext* rpc) {
5026
5.60k
  auto schedules_to_tables_map = VERIFY_RESULT(
5027
5.60k
      MakeSnapshotSchedulesToObjectIdsMap(SysRowEntryType::TABLE));
5028
5029
0
  vector<DeletingTableData> tables;
5030
5.60k
  RETURN_NOT_OK(DeleteTableInMemory(req->table(), req->is_index_table(),
5031
5.60k
                                    true /* update_indexed_table */, schedules_to_tables_map,
5032
5.60k
                                    &tables, resp, rpc));
5033
5034
  // Update the in-memory state.
5035
5.58k
  TRACE("Committing in-memory state");
5036
5.58k
  std::unordered_set<TableId> sys_table_ids;
5037
5.88k
  for (auto& table : tables) {
5038
5.88k
    if (IsSystemTable(*table.info)) {
5039
1
      sys_table_ids.insert(table.info->id());
5040
1
    }
5041
5.88k
    table.write_lock.Commit();
5042
5.88k
  }
5043
5044
  // Delete any CDC streams that are set up on this table, after releasing the Table lock.
5045
5.58k
  TRACE("Deleting CDC streams on table");
5046
  // table_id for the requested table will be added to the end of the response.
5047
5.58k
  RSTATUS_DCHECK_GE(resp->deleted_table_ids_size(), 1, IllegalState,
5048
5.58k
      "DeleteTableInMemory expected to add the index id to resp");
5049
5.58k
  RETURN_NOT_OK(
5050
5.58k
      DeleteCDCStreamsForTable(resp->deleted_table_ids(resp->deleted_table_ids_size() - 1)));
5051
5052
5.58k
  if (PREDICT_FALSE(FLAGS_catalog_manager_inject_latency_in_delete_table_ms > 0)) {
5053
2
    LOG(INFO) << "Sleeping in CatalogManager::DeleteTable for " <<
5054
2
        FLAGS_catalog_manager_inject_latency_in_delete_table_ms << " ms";
5055
2
    SleepFor(MonoDelta::FromMilliseconds(FLAGS_catalog_manager_inject_latency_in_delete_table_ms));
5056
2
  }
5057
5058
  // Update the internal table maps. Exclude Postgres tables which are not in the name map.
5059
  // Also exclude hidden tables, that were already removed from this map.
5060
5.58k
  if (std::any_of(tables.begin(), tables.end(), [](auto& t) { return t.remove_from_name_map; })) {
5061
1.33k
    TRACE("Removing tables from by-name map");
5062
1.33k
    LockGuard lock(mutex_);
5063
1.62k
    for (const auto& table : tables) {
5064
1.62k
      if (table.remove_from_name_map) {
5065
1.62k
        TableInfoByNameMap::key_type key = {table.info->namespace_id(), table.info->name()};
5066
1.62k
        if (table_names_map_.erase(key) != 1) {
5067
0
          LOG(WARNING) << "Could not remove table from map: " << key.first << "." << key.second;
5068
0
        }
5069
5070
        // Also remove from the system.partitions table.
5071
1.62k
        GetYqlPartitionsVtable().RemoveFromCache(table.info->id());
5072
5073
        // Remove matviews from matview to pg table id map
5074
1.62k
        matview_pg_table_ids_map_.erase(table.info->id());
5075
1.62k
      }
5076
1.62k
    }
5077
    // We commit another map to increment its version and reset cache.
5078
    // Since table_name_map_ does not have version.
5079
1.33k
    table_ids_map_.Commit();
5080
1.33k
  }
5081
5082
5.88k
  for (const auto& table : tables) {
5083
5.88k
    LOG(INFO) << "Deleting table: " << table.info->name() << ", retained by: "
5084
5.88k
              << AsString(table.retained_by_snapshot_schedules, &Uuid::TryFullyDecode);
5085
5086
    // Send a DeleteTablet() request to each tablet replica in the table.
5087
5.88k
    RETURN_NOT_OK(DeleteTabletsAndSendRequests(table.info, table.retained_by_snapshot_schedules));
5088
    // Send a RemoveTableFromTablet() request to each colocated parent tablet replica in the table.
5089
    // TODO(pitr) handle YSQL colocated tables.
5090
5.88k
    if (table.info->IsColocatedUserTable()) {
5091
81
      {
5092
81
        LockGuard lock(mutex_);
5093
81
        const auto it = table_tablegroup_ids_map_.find(table.info->id());
5094
81
        if (it != table_tablegroup_ids_map_.end()) {
5095
68
          const TablegroupId& tablegroup_id = it->second;
5096
68
          const auto& tablegroup = DCHECK_NOTNULL(tablegroup_ids_map_[tablegroup_id]);
5097
68
          tablegroup->DeleteChildTable(table.info->id());
5098
68
          table_tablegroup_ids_map_.erase(table.info->id());
5099
68
        }
5100
81
      }
5101
81
      auto call = std::make_shared<AsyncRemoveTableFromTablet>(
5102
81
          master_, AsyncTaskPool(), table.info->GetColocatedTablet(), table.info);
5103
81
      table.info->AddTask(call);
5104
81
      WARN_NOT_OK(ScheduleTask(call), "Failed to send RemoveTableFromTablet request");
5105
81
    }
5106
5.88k
  }
5107
5108
  // If there are any permissions granted on this table find them and delete them. This is necessary
5109
  // because we keep track of the permissions based on the canonical resource name which is a
5110
  // combination of the keyspace and table names, so if another table with the same name is created
5111
  // (in the same keyspace where the previous one existed), and the permissions were not deleted at
5112
  // the time of the previous table deletion, then the permissions that existed for the previous
5113
  // table will automatically be granted to the new table even though this wasn't the intention.
5114
5.58k
  string canonical_resource = get_canonical_table(req->table().namespace_().name(),
5115
5.58k
                                                  req->table().table_name());
5116
5.58k
  RETURN_NOT_OK(permissions_manager_->RemoveAllPermissionsForResource(canonical_resource, resp));
5117
5118
  // Remove the system tables from system catalog.
5119
5.58k
  if (!sys_table_ids.empty()) {
5120
    // We do not expect system tables deletion during initial snapshot forming.
5121
1
    DCHECK(!initial_snapshot_writer_);
5122
5123
1
    TRACE("Sending system table delete RPCs");
5124
1
    for (auto& table_id : sys_table_ids) {
5125
      // "sys_catalog_->DeleteYsqlSystemTable(table_id)" won't work here
5126
      // as it only acts on the leader.
5127
1
      tablet::ChangeMetadataRequestPB change_req;
5128
1
      change_req.set_tablet_id(kSysCatalogTabletId);
5129
1
      change_req.set_remove_table_id(table_id);
5130
1
      RETURN_NOT_OK(tablet::SyncReplicateChangeMetadataOperation(
5131
1
          &change_req, sys_catalog_->tablet_peer().get(), leader_ready_term()));
5132
1
    }
5133
5.58k
  } else {
5134
5.58k
    TRACE("No system tables to delete");
5135
5.58k
  }
5136
5137
5.58k
  LOG(INFO) << "Successfully initiated deletion of "
5138
5.58k
            << (req->is_index_table() ? "index" : "table") << " with "
5139
5.58k
            << req->table().DebugString() << " per request from " << RequestorString(rpc);
5140
  // Asynchronously cleans up the final memory traces of the deleted database.
5141
5.58k
  background_tasks_->Wake();
5142
5.58k
  return Status::OK();
5143
5.58k
}
5144
5145
Status CatalogManager::DeleteTableInMemory(
5146
    const TableIdentifierPB& table_identifier,
5147
    const bool is_index_table,
5148
    const bool update_indexed_table,
5149
    const SnapshotSchedulesToObjectIdsMap& schedules_to_tables_map,
5150
    vector<DeletingTableData>* tables,
5151
    DeleteTableResponsePB* resp,
5152
5.90k
    rpc::RpcContext* rpc) {
5153
  // TODO(NIC): How to handle a DeleteTable request when the namespace is being deleted?
5154
5.90k
  const char* const object_type = is_index_table ? "index" : "table";
5155
5.90k
  const bool cascade_delete_index = is_index_table && 
!update_indexed_table1.12k
;
5156
5157
5.90k
  
VLOG_WITH_PREFIX_AND_FUNC0
(1) << 0
YB_STRUCT_TO_STRING0
(
5158
0
      table_identifier, is_index_table, update_indexed_table) << "\n" << GetStackTrace();
5159
5160
  // Lookup the table and verify if it exists.
5161
5.90k
  TRACE(Substitute("Looking up $0", object_type));
5162
5.90k
  auto table_result = FindTable(table_identifier);
5163
5.90k
  if (!VERIFY_RESULT(DoesTableExist(table_result))) {
5164
0
    if (cascade_delete_index) {
5165
0
      LOG(WARNING) << "Index " << table_identifier.DebugString() << " not found";
5166
0
      return Status::OK();
5167
0
    } else {
5168
0
      return table_result.status();
5169
0
    }
5170
0
  }
5171
5.90k
  auto table = std::move(*table_result);
5172
5173
5.90k
  TRACE(Substitute("Locking $0", object_type));
5174
5.90k
  auto data = DeletingTableData {
5175
5.90k
    .info = table,
5176
5.90k
    .write_lock = table->LockForWrite(),
5177
5.90k
    .retained_by_snapshot_schedules = RepeatedBytes(),
5178
5.90k
    .remove_from_name_map = false
5179
5.90k
  };
5180
5.90k
  auto& l = data.write_lock;
5181
  // table_id for the requested table will be added to the end of the response.
5182
5.90k
  *resp->add_deleted_table_ids() = table->id();
5183
5184
5.90k
  if (is_index_table == IsTable(l->pb)) {
5185
0
    Status s = STATUS(NotFound, "The object does not exist");
5186
0
    return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s);
5187
0
  }
5188
5189
5.90k
  FillRetainedBySnapshotSchedules(
5190
5.90k
      schedules_to_tables_map, table->id(), &data.retained_by_snapshot_schedules);
5191
5.90k
  bool hide_only = !data.retained_by_snapshot_schedules.empty();
5192
5193
5.90k
  if (l->started_deleting() || 
(5.88k
hide_only5.88k
&&
l->started_hiding()4
)) {
5194
16
    if (cascade_delete_index) {
5195
0
      LOG(WARNING) << "Index " << table_identifier.ShortDebugString() << " was "
5196
0
                   << (l->started_deleting() ? "deleted" : "hidden");
5197
0
      return Status::OK();
5198
16
    } else {
5199
16
      Status s = STATUS(NotFound, "The object was deleted", l->pb.state_msg());
5200
16
      return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s);
5201
16
    }
5202
16
  }
5203
5204
  // Determine if we have to remove from the name map here before we change the table state.
5205
5.89k
  data.remove_from_name_map = l.data().table_type() != PGSQL_TABLE_TYPE && 
!l->started_hiding()1.63k
;
5206
5207
5.89k
  TRACE("Updating metadata on disk");
5208
  // Update the metadata for the on-disk state.
5209
5.89k
  if (hide_only) {
5210
4
    l.mutable_data()->pb.set_hide_state(SysTablesEntryPB::HIDING);
5211
5.88k
  } else {
5212
5.88k
    l.mutable_data()->set_state(SysTablesEntryPB::DELETING,
5213
5.88k
                                 Substitute("Started deleting at $0", LocalTimeAsString()));
5214
5.88k
  }
5215
5216
5.89k
  auto now = master_->clock()->Now();
5217
5.89k
  DdlLogEntry ddl_log_entry(now, table->id(), l->pb, "Drop");
5218
5.89k
  if (is_index_table) {
5219
1.11k
    const auto& indexed_table_id = GetIndexedTableId(l->pb);
5220
1.11k
    auto indexed_table = FindTableById(indexed_table_id);
5221
1.11k
    if (indexed_table.ok()) {
5222
1.11k
      auto lock = (**indexed_table).LockForRead();
5223
1.11k
      ddl_log_entry = DdlLogEntry(
5224
1.11k
          now, indexed_table_id, lock->pb, Format("Drop index $0", l->name()));
5225
1.11k
    }
5226
1.11k
  }
5227
5228
  // Update sys-catalog with the removed table state.
5229
5.89k
  Status s = sys_catalog_->Upsert(leader_ready_term(), &ddl_log_entry, table);
5230
5231
5.89k
  if (PREDICT_FALSE(FLAGS_TEST_simulate_crash_after_table_marked_deleting)) {
5232
1
    return Status::OK();
5233
1
  }
5234
5235
5.89k
  if (!s.ok()) {
5236
    // The mutation will be aborted when 'l' exits the scope on early return.
5237
4
    s = s.CloneAndPrepend("An error occurred while updating sys tables");
5238
4
    LOG(WARNING) << s;
5239
4
    return CheckIfNoLongerLeaderAndSetupError(s, resp);
5240
4
  }
5241
5242
  // For regular (indexed) table, delete all its index tables if any. Else for index table, delete
5243
  // index info from the indexed table.
5244
5.88k
  if (!is_index_table) {
5245
4.77k
    TableIdentifierPB index_identifier;
5246
4.77k
    for (const auto& index : l->pb.indexes()) {
5247
300
      index_identifier.set_table_id(index.table_id());
5248
300
      RETURN_NOT_OK(DeleteTableInMemory(index_identifier, true /* is_index_table */,
5249
300
                                        false /* update_indexed_table */, schedules_to_tables_map,
5250
300
                                        tables, resp, rpc));
5251
300
    }
5252
4.77k
  } else 
if (1.11k
update_indexed_table1.11k
) {
5253
811
    s = MarkIndexInfoFromTableForDeletion(
5254
811
        GetIndexedTableId(l->pb), table->id(), /* multi_stage */ false, resp);
5255
811
    if (!s.ok()) {
5256
0
      s = s.CloneAndPrepend(Substitute("An error occurred while deleting index info: $0",
5257
0
                                       s.ToString()));
5258
0
      LOG(WARNING) << s.ToString();
5259
0
      return CheckIfNoLongerLeaderAndSetupError(s, resp);
5260
0
    }
5261
811
  }
5262
5263
5.88k
  if (!hide_only) {
5264
    // If table is being hidden we should not abort snapshot related tasks.
5265
5.87k
    table->AbortTasks();
5266
5.87k
  }
5267
5268
  // For regular (indexed) table, insert table info and lock in the front of the list. Else for
5269
  // index table, append them to the end. We do so so that we will commit and delete the indexed
5270
  // table first before its indexes.
5271
5.88k
  tables->insert(is_index_table ? 
tables->end()1.11k
:
tables->begin()4.77k
, std::move(data));
5272
5273
5.88k
  return Status::OK();
5274
5.88k
}
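
// A minimal standalone sketch (names hypothetical, not code from this file) of the ordering
// contract established by the tables->insert() call above: indexed tables go to the front and
// index tables are appended to the back, so a front-to-back commit pass always handles the
// indexed table before its indexes:
//
//   std::vector<std::string> order;
//   auto add = [&](const std::string& name, bool is_index) {
//     order.insert(is_index ? order.end() : order.begin(), name);
//   };
//   add("idx_a", true); add("base_table", false); add("idx_b", true);
//   // order == {"base_table", "idx_a", "idx_b"}: committing left to right deletes the
//   // indexed table first, matching the comment above the insert.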

TableInfo::WriteLock CatalogManager::MaybeTransitionTableToDeleted(const TableInfoPtr& table) {
  if (!table) {
    LOG_WITH_PREFIX(INFO) << "Finished deleting an orphaned tablet. "
                          << "Table information is null. Skipping updating its state to DELETED.";
    return TableInfo::WriteLock();
  }
  if (table->HasTasks()) {
    VLOG_WITH_PREFIX_AND_FUNC(2) << table->ToString() << " has tasks";
    return TableInfo::WriteLock();
  }
  bool hide_only;
  {
    auto lock = table->LockForRead();

    // For any table in the DELETING state, we want to mark it as DELETED once all its respective
    // tablets have been successfully removed from tservers.
    // For any table being hidden, we want to mark it as HIDDEN once all its respective
    // tablets have been successfully hidden on tservers.
    if (lock->is_deleted()) {
      // Clear the tablets_ and partitions_ maps if the table has already been DELETED.
      // Usually this has been done already, except for tables that were hidden and are now
      // deleted. This is also a catch-all in case any other path misses clearing the maps.
      table->ClearTabletMaps();
      return TableInfo::WriteLock();
    }
    hide_only = !lock->is_deleting();
    if (hide_only && !lock->is_hiding()) {
      return TableInfo::WriteLock();
    }
  }
  // The current relevant order of operations during a DeleteTable is:
  // 1) Mark the table as DELETING
  // 2) Abort the current table tasks
  // 3) Per tablet, send DeleteTable requests to all TS, then mark that tablet as DELETED
  //
  // This creates a race: after step 2, HasTasks can be false even though we have not yet reached
  // step 3, which would add further tasks for the deletes.
  //
  // However, HasTasks is cheaper than AreAllTabletsDeletedOrHidden...
  auto all_tablets_done = hide_only ? table->AreAllTabletsHidden() : table->AreAllTabletsDeleted();
  VLOG_WITH_PREFIX_AND_FUNC(2)
      << table->ToString() << " hide only: " << hide_only << ", all tablets done: "
      << all_tablets_done;
  if (!all_tablets_done && !IsSystemTable(*table) && !table->IsColocatedUserTable()) {
    return TableInfo::WriteLock();
  }

  auto lock = table->LockForWrite();
  if (lock->is_hiding()) {
    LOG(INFO) << "Marking table as HIDDEN: " << table->ToString();
    lock.mutable_data()->pb.set_hide_state(SysTablesEntryPB::HIDDEN);
    // Erase all the tablets from the partitions_ structure.
    table->ClearTabletMaps(DeactivateOnly::kTrue);
    return lock;
  }
  if (lock->is_deleting()) {
    // Update the metadata for the on-disk state.
    LOG(INFO) << "Marking table as DELETED: " << table->ToString();
    lock.mutable_data()->set_state(SysTablesEntryPB::DELETED,
        Substitute("Deleted with tablets at $0", LocalTimeAsString()));
    // Erase all the tablets from the tablets_ and partitions_ structures.
    table->ClearTabletMaps();
    return lock;
  }
  return TableInfo::WriteLock();
}
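
// A minimal sketch (a generic pattern, not code from this file) of the check structure used
// above to tolerate the race described in the comment: a cheap racy pre-check, an expensive
// check, then re-validation of the state under the write lock before mutating anything:
//
//   if (table->HasTasks()) return TableInfo::WriteLock();      // cheap, may race
//   if (!AllTabletsDone(table)) return TableInfo::WriteLock(); // expensive (hypothetical helper)
//   auto lock = table->LockForWrite();
//   if (!lock->is_deleting() && !lock->is_hiding()) {          // re-check under the lock
//     return TableInfo::WriteLock();                           // state moved on; do nothing
//   }
//   // ... only now is it safe to transition to DELETED/HIDDEN ...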

void CatalogManager::CleanUpDeletedTables() {
  // TODO(bogdan): Cache tables being deleted to make this iterate only over those?
  vector<scoped_refptr<TableInfo>> tables_to_delete;
  // Garbage collecting.
  // Go through all tables under the global lock, copying them so we do not hold the lock too long.
  TableInfoMap copy_of_table_by_id_map;
  {
    LockGuard lock(mutex_);
    copy_of_table_by_id_map = *table_ids_map_;
  }
  // Mark the tables as DELETED and remove them from the in-memory maps.
  vector<TableInfo*> tables_to_update_on_disk;
  vector<TableInfo::WriteLock> table_locks;
  for (const auto& it : copy_of_table_by_id_map) {
    const auto& table = it.second;
    auto lock = MaybeTransitionTableToDeleted(table);
    if (lock.locked()) {
      table_locks.push_back(std::move(lock));
      tables_to_update_on_disk.push_back(table.get());
    }
  }
  if (tables_to_update_on_disk.size() > 0) {
    Status s = sys_catalog_->Upsert(leader_ready_term(), tables_to_update_on_disk);
    if (!s.ok()) {
      LOG(WARNING) << "Error marking tables as DELETED: " << s.ToString();
      return;
    }
    // Update the in-memory table info as DELETED after we've removed them from the maps.
    for (auto& lock : table_locks) {
      lock.Commit();
    }
    // TODO: Check if we want to delete the totally deleted table from the sys_catalog here.
    // TODO: SysCatalog::DeleteItem() if we've DELETED all user tables in a DELETING namespace.
    // TODO: Also properly handle namespace_ids_map_.erase(table->namespace_id())
  }
}
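
// A minimal sketch (a generic pattern, not code from this file) of the snapshot-then-iterate
// idiom used above to keep mutex_ hold times short: copy the shared map while holding the lock,
// then do the expensive per-entry work on the copy without the lock:
//
//   std::map<std::string, int> shared_map;  // hypothetical shared state guarded by a mutex
//   std::mutex shared_mutex;
//
//   std::map<std::string, int> snapshot;
//   {
//     std::lock_guard<std::mutex> guard(shared_mutex);
//     snapshot = shared_map;  // O(n) copy while holding the lock
//   }
//   for (const auto& entry : snapshot) {
//     // slow work here sees a consistent snapshot and never blocks writers
//   }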

Status CatalogManager::IsDeleteTableDone(const IsDeleteTableDoneRequestPB* req,
                                         IsDeleteTableDoneResponsePB* resp) {
  // Lookup the deleted table.
  TRACE("Looking up table $0", req->table_id());
  scoped_refptr<TableInfo> table;
  {
    SharedLock lock(mutex_);
    table = FindPtrOrNull(*table_ids_map_, req->table_id());
  }

  if (table == nullptr) {
    LOG(INFO) << "Servicing IsDeleteTableDone request for table id "
              << req->table_id() << ": deleted (not found)";
    resp->set_done(true);
    return Status::OK();
  }

  TRACE("Locking table");
  auto l = table->LockForRead();

  if (!l->started_deleting() && !l->started_hiding()) {
    LOG(WARNING) << "Servicing IsDeleteTableDone request for table id "
                 << req->table_id() << ": NOT deleted";
    Status s = STATUS(IllegalState, "The object was NOT deleted", l->pb.state_msg());
    return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s);
  }

  // Temporary fix for github issue #5290.
  // TODO: Wait till deletion completes for the tablegroup parent table.
  if (table->IsTablegroupParentTable()) {
    LOG(INFO) << "Servicing IsDeleteTableDone request for tablegroup parent table id "
              << req->table_id() << ": deleting. Skipping wait for DELETED state.";
    resp->set_done(true);
    return Status::OK();
  }

  if (l->is_deleted() || l->is_hidden()) {
    LOG(INFO) << "Servicing IsDeleteTableDone request for table id "
              << req->table_id() << ": totally " << (l->is_hidden() ? "hidden" : "deleted");
    resp->set_done(true);
  } else {
    LOG(INFO) << "Servicing IsDeleteTableDone request for table id " << req->table_id()
              << ((!table->IsColocatedUserTable()) ? ": deleting tablets" : "");

    std::vector<std::shared_ptr<TSDescriptor>> descs;
    master_->ts_manager()->GetAllDescriptors(&descs);
    for (auto& ts_desc : descs) {
      LOG(INFO) << "Deleting on " << ts_desc->permanent_uuid() << ": "
                << ts_desc->PendingTabletDeleteToString();
    }

    resp->set_done(false);
  }

  return Status::OK();
}
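
// An illustrative caller-side sketch (the `catalog_manager` handle and the polling cadence are
// assumptions, not from this file): DeleteTable only *initiates* deletion, so callers poll
// IsDeleteTableDone until resp.done() flips to true:
//
//   IsDeleteTableDoneRequestPB done_req;
//   IsDeleteTableDoneResponsePB done_resp;
//   done_req.set_table_id(table_id);
//   do {
//     done_resp.Clear();
//     RETURN_NOT_OK(catalog_manager->IsDeleteTableDone(&done_req, &done_resp));
//     if (!done_resp.done()) {
//       SleepFor(MonoDelta::FromMilliseconds(100));  // back off between polls
//     }
//   } while (!done_resp.done());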

namespace {

CHECKED_STATUS ApplyAlterSteps(server::Clock* clock,
                               const TableId& table_id,
                               const SysTablesEntryPB& current_pb,
                               const AlterTableRequestPB* req,
                               Schema* new_schema,
                               ColumnId* next_col_id,
                               std::vector<DdlLogEntry>* ddl_log_entries) {
  const SchemaPB& current_schema_pb = current_pb.schema();
  Schema cur_schema;
  RETURN_NOT_OK(SchemaFromPB(current_schema_pb, &cur_schema));

  SchemaBuilder builder(cur_schema);
  if (current_pb.has_next_column_id()) {
    builder.set_next_column_id(ColumnId(current_pb.next_column_id()));
  }
  if (current_pb.has_colocated() && current_pb.colocated()) {
    if (current_schema_pb.table_properties().is_ysql_catalog_table()) {
      Uuid cotable_id;
      RETURN_NOT_OK(cotable_id.FromHexString(req->table().table_id()));
      builder.set_cotable_id(cotable_id);
    }
    // Colocation ID is set in the schema and cannot be altered.
  }

  for (const AlterTableRequestPB::Step& step : req->alter_schema_steps()) {
    auto time = clock->Now();
    switch (step.type()) {
      case AlterTableRequestPB::ADD_COLUMN: {
        if (!step.has_add_column()) {
          return STATUS(InvalidArgument, "ADD_COLUMN missing column info");
        }

        // Verify that the encoding is appropriate for the new column's type.
        ColumnSchemaPB new_col_pb = step.add_column().schema();
        if (new_col_pb.has_id()) {
          return STATUS_SUBSTITUTE(InvalidArgument,
              "column $0: client should not specify column id", new_col_pb.ShortDebugString());
        }
        ColumnSchema new_col = ColumnSchemaFromPB(new_col_pb);

        RETURN_NOT_OK(builder.AddColumn(new_col, false));
        ddl_log_entries->emplace_back(time, table_id, current_pb, Format("Add column $0", new_col));
        break;
      }

      case AlterTableRequestPB::DROP_COLUMN: {
        if (!step.has_drop_column()) {
          return STATUS(InvalidArgument, "DROP_COLUMN missing column info");
        }

        if (cur_schema.is_key_column(step.drop_column().name())) {
          return STATUS(InvalidArgument, "cannot remove a key column");
        }

        RETURN_NOT_OK(builder.RemoveColumn(step.drop_column().name()));
        ddl_log_entries->emplace_back(
            time, table_id, current_pb, Format("Drop column $0", step.drop_column().name()));
        break;
      }

      case AlterTableRequestPB::RENAME_COLUMN: {
        if (!step.has_rename_column()) {
          return STATUS(InvalidArgument, "RENAME_COLUMN missing column info");
        }

        RETURN_NOT_OK(builder.RenameColumn(
            step.rename_column().old_name(),
            step.rename_column().new_name()));
        ddl_log_entries->emplace_back(
            time, table_id, current_pb,
            Format("Rename column $0 => $1", step.rename_column().old_name(),
                   step.rename_column().new_name()));
        break;
      }

        // TODO: EDIT_COLUMN.

      default: {
        return STATUS_SUBSTITUTE(InvalidArgument, "Invalid alter step type: $0", step.type());
      }
    }
  }

  if (req->has_alter_properties()) {
    RETURN_NOT_OK(builder.AlterProperties(req->alter_properties()));
  }

  *new_schema = builder.Build();
  *next_col_id = builder.next_column_id();
  return Status::OK();
}

} // namespace
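
// An illustrative sketch (field names are taken from the protobuf usage visible above; the exact
// .proto definitions are assumed) of building the request ApplyAlterSteps consumes: one
// ADD_COLUMN step followed by one DROP_COLUMN step.
//
//   AlterTableRequestPB alter_req;
//   auto* add_step = alter_req.add_alter_schema_steps();
//   add_step->set_type(AlterTableRequestPB::ADD_COLUMN);
//   add_step->mutable_add_column()->mutable_schema()->set_name("new_col");
//   // Note: no column id is set; the master assigns ids, and a client-specified id is rejected.
//   auto* drop_step = alter_req.add_alter_schema_steps();
//   drop_step->set_type(AlterTableRequestPB::DROP_COLUMN);
//   drop_step->mutable_drop_column()->set_name("old_col");  // key columns are rejected above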

Status CatalogManager::AlterTable(const AlterTableRequestPB* req,
                                  AlterTableResponsePB* resp,
                                  rpc::RpcContext* rpc) {
  LOG_WITH_PREFIX(INFO) << "Servicing " << __func__ << " request from " << RequestorString(rpc)
                        << ": " << req->ShortDebugString();

  std::vector<DdlLogEntry> ddl_log_entries;

  // Lookup the table and verify if it exists.
  TRACE("Looking up table");
  scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table()));

  NamespaceId new_namespace_id;

  if (req->has_new_namespace()) {
    // Lookup the new namespace and verify if it exists.
    TRACE("Looking up new namespace");
    scoped_refptr<NamespaceInfo> ns;
    NamespaceIdentifierPB namespace_identifier = req->new_namespace();
    // Use the original namespace_id as new_namespace_id for YSQL tables.
    if (table->GetTableType() == PGSQL_TABLE_TYPE && !namespace_identifier.has_id()) {
      namespace_identifier.set_id(table->namespace_id());
    }
    ns = VERIFY_NAMESPACE_FOUND(FindNamespace(namespace_identifier), resp);

    auto ns_lock = ns->LockForRead();
    new_namespace_id = ns->id();
    // Don't use namespaces that aren't running.
    if (ns->state() != SysNamespaceEntryPB::RUNNING) {
      Status s = STATUS_SUBSTITUTE(TryAgain,
          "Namespace not running (State=$0). Cannot create $1.$2",
          SysNamespaceEntryPB::State_Name(ns->state()), ns->name(), table->name());
      return SetupError(resp->mutable_error(), NamespaceMasterError(ns->state()), s);
    }
  }
  if (req->has_new_namespace() || req->has_new_table_name()) {
    if (new_namespace_id.empty()) {
      const Status s = STATUS(InvalidArgument, "No namespace used");
      return SetupError(resp->mutable_error(), MasterErrorPB::NO_NAMESPACE_USED, s);
    }
  }

  TRACE("Locking table");
  auto l = table->LockForWrite();
  RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp));

  bool has_changes = false;
  auto& table_pb = l.mutable_data()->pb;
  const TableName table_name = l->name();
  const NamespaceId namespace_id = l->namespace_id();
  const TableName new_table_name = req->has_new_table_name() ? req->new_table_name() : table_name;

  // Calculate the new schema for the on-disk state; not persisted yet.
  Schema new_schema;
  ColumnId next_col_id = ColumnId(l->pb.next_column_id());
  if (req->alter_schema_steps_size() || req->has_alter_properties()) {
    TRACE("Apply alter schema");
    Status s = ApplyAlterSteps(
        master_->clock(), table->id(), l->pb, req, &new_schema, &next_col_id, &ddl_log_entries);
    if (!s.ok()) {
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
    }
    DCHECK_NE(next_col_id, 0);
    DCHECK_EQ(new_schema.find_column_by_id(next_col_id),
              static_cast<int>(Schema::kColumnNotFound));
    has_changes = true;
  }

  // Try to acquire the new table name.
  if (req->has_new_namespace() || req->has_new_table_name()) {

    // Postgres handles name uniqueness constraints in its own layer.
    if (l->table_type() != PGSQL_TABLE_TYPE) {
      LockGuard lock(mutex_);
      VLOG_WITH_FUNC(3) << "Acquired the catalog manager lock";

      TRACE("Acquired catalog manager lock");

      // Verify that the table does not exist.
      scoped_refptr<TableInfo> other_table = FindPtrOrNull(
          table_names_map_, {new_namespace_id, new_table_name});
      if (other_table != nullptr) {
        Status s = STATUS_SUBSTITUTE(AlreadyPresent,
            "Object '$0.$1' already exists",
            GetNamespaceNameUnlocked(new_namespace_id), other_table->name());
        LOG(WARNING) << "Found table: " << other_table->ToStringWithState()
                     << ". Failed altering table with error: "
                     << s.ToString() << " Request:\n" << req->DebugString();
        return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s);
      }

      // Acquire the new table name (now we have two names for the same table).
      table_names_map_[{new_namespace_id, new_table_name}] = table;
    }

    table_pb.set_namespace_id(new_namespace_id);
    table_pb.set_name(new_table_name);

    has_changes = true;
  }

  // Check if there have been any changes to the placement policies for this table.
  if (req->has_replication_info()) {
    // If this is a colocated table, it does not make sense to set a placement
    // policy for this table, as the tablet associated with it is shared by
    // multiple tables.
    if (table->colocated()) {
      const Status s = STATUS(InvalidArgument,
          "Placement policy cannot be altered for a colocated table");
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s);
    }
    if (table->GetTableType() == PGSQL_TABLE_TYPE) {
      const Status s = STATUS(InvalidArgument,
            "Placement policy cannot be altered for YSQL tables, use Tablespaces");
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s);
    }
    // Validate the table replication info.
    RETURN_NOT_OK(ValidateTableReplicationInfo(req->replication_info()));
    table_pb.mutable_replication_info()->CopyFrom(req->replication_info());
    has_changes = true;
  }

  // TODO(hector): Simplify the AlterSchema workflow to avoid doing the same checks on every layer
  // this request goes through: https://github.com/YugaByte/yugabyte-db/issues/1882.
  if (req->has_wal_retention_secs()) {
    if (has_changes) {
      const Status s = STATUS(InvalidArgument,
          "wal_retention_secs cannot be altered concurrently with other properties");
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s);
    }
    // TODO(hector): Handle co-partitioned tables:
    // https://github.com/YugaByte/yugabyte-db/issues/1905.
    table_pb.set_wal_retention_secs(req->wal_retention_secs());
    has_changes = true;
  }

  if (!has_changes) {
    if (req->has_force_send_alter_request() && req->force_send_alter_request()) {
      RETURN_NOT_OK(SendAlterTableRequest(table, req));
    }
    // Skip empty requests...
    return Status::OK();
  }

  // Serialize the schema and increment the version number.
  if (new_schema.initialized()) {
    if (!l->pb.has_fully_applied_schema()) {
      // The idea here is that if we are in the middle of updating the schema
      // from one state to another, then YBClients will be given the older
      // version until the schema is updated on all the tablets.
      // As of Dec 2019, this may lead to some rejected operations/retries during
      // the index backfill. See #3284 for possible optimizations.
      MultiStageAlterTable::CopySchemaDetailsToFullyApplied(&table_pb);
    }
    SchemaToPB(new_schema, table_pb.mutable_schema());
  }

  // Only increment the version number if it is a schema change (an AddTable change goes through a
  // different path and is not processed here).
  if (!req->has_wal_retention_secs()) {
    table_pb.set_version(table_pb.version() + 1);
    table_pb.set_updates_only_index_permissions(false);
  }
  table_pb.set_next_column_id(next_col_id);
  l.mutable_data()->set_state(
      SysTablesEntryPB::ALTERING,
      Substitute("Alter table version=$0 ts=$1", table_pb.version(), LocalTimeAsString()));

  // Update sys-catalog with the new table schema.
  TRACE("Updating metadata on disk");
  std::vector<const DdlLogEntry*> ddl_log_entry_pointers;
  ddl_log_entry_pointers.reserve(ddl_log_entries.size());
  for (const auto& entry : ddl_log_entries) {
    ddl_log_entry_pointers.push_back(&entry);
  }
  Status s = sys_catalog_->Upsert(leader_ready_term(), ddl_log_entry_pointers, table);
  if (!s.ok()) {
    s = s.CloneAndPrepend(
        Substitute("An error occurred while updating sys-catalog tables entry: $0",
                   s.ToString()));
    LOG(WARNING) << s.ToString();
    if (table->GetTableType() != PGSQL_TABLE_TYPE &&
        (req->has_new_namespace() || req->has_new_table_name())) {
      LockGuard lock(mutex_);
      VLOG_WITH_FUNC(3) << "Acquired the catalog manager lock";
      CHECK_EQ(table_names_map_.erase({new_namespace_id, new_table_name}), 1);
    }
    // TableMetadataLock follows the RAII paradigm: when it leaves scope,
    // 'l' will be unlocked, and the mutation will be aborted.
    return CheckIfNoLongerLeaderAndSetupError(s, resp);
  }

  // Remove the old name. Not present if PGSQL.
  if (table->GetTableType() != PGSQL_TABLE_TYPE &&
      (req->has_new_namespace() || req->has_new_table_name())) {
    TRACE("Removing (namespace, table) combination ($0, $1) from by-name map",
          namespace_id, table_name);
    LockGuard lock(mutex_);
    table_names_map_.erase({namespace_id, table_name});
  }

  // Update the in-memory state.
  TRACE("Committing in-memory state");
  l.Commit();

  RETURN_NOT_OK(SendAlterTableRequest(table, req));

  // Increment the transaction status version if needed.
  if (table->GetTableType() == TableType::TRANSACTION_STATUS_TABLE_TYPE) {
    RETURN_NOT_OK(IncrementTransactionTablesVersion());
  }

  LOG(INFO) << "Successfully initiated ALTER TABLE (pending tablet schema updates) for "
            << table->ToString() << " per request from " << RequestorString(rpc);
  return Status::OK();
}
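
// An illustrative usage sketch (caller names such as `catalog_manager` and `some_table_id` are
// assumptions, not from this file): an alter that only changes wal_retention_secs takes the
// "no schema version bump" path above, since it is not a schema change:
//
//   AlterTableRequestPB alter_req;
//   AlterTableResponsePB alter_resp;
//   alter_req.mutable_table()->set_table_id(some_table_id);
//   alter_req.set_wal_retention_secs(86400);  // no alter_schema_steps => version unchanged
//   RETURN_NOT_OK(catalog_manager->AlterTable(&alter_req, &alter_resp, /* rpc */ nullptr));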

Status CatalogManager::IsAlterTableDone(const IsAlterTableDoneRequestPB* req,
                                        IsAlterTableDoneResponsePB* resp) {
  // 1. Lookup the table and verify if it exists.
  TRACE("Looking up table");
  scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table()));

  TRACE("Locking table");
  auto l = table->LockForRead();
  RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp));

  // 2. Verify if the alter is in-progress.
  TRACE("Verify if there is an alter operation in progress for $0", table->ToString());
  resp->set_schema_version(l->pb.version());
  resp->set_done(l->pb.state() != SysTablesEntryPB::ALTERING);

  return Status::OK();
}

Result<TabletInfoPtr> CatalogManager::RegisterNewTabletForSplit(
    TabletInfo* source_tablet_info, const PartitionPB& partition,
    TableInfo::WriteLock* table_write_lock, TabletInfo::WriteLock* tablet_write_lock) {
  const auto tablet_lock = source_tablet_info->LockForRead();

  auto table = source_tablet_info->table();
  TabletInfoPtr new_tablet;
  {
    LockGuard lock(mutex_);
    new_tablet = CreateTabletInfo(table.get(), partition);
  }
  const auto& source_tablet_meta = tablet_lock->pb;

  auto& new_tablet_meta = new_tablet->mutable_metadata()->mutable_dirty()->pb;
  new_tablet_meta.set_state(SysTabletsEntryPB::CREATING);
  new_tablet_meta.mutable_committed_consensus_state()->CopyFrom(
      source_tablet_meta.committed_consensus_state());
  new_tablet_meta.set_split_depth(source_tablet_meta.split_depth() + 1);
  new_tablet_meta.set_split_parent_tablet_id(source_tablet_info->tablet_id());
  // TODO(tsplit): consider and handle failure scenarios, for example:
  // - Crash or leader failover before sending out the split tasks.
  // - A long enough partition while trying to send out the splits so that they time out and
  //   do not get executed.
  int new_partition_list_version;
  {
    LockGuard lock(mutex_);

    auto& table_pb = table_write_lock->mutable_data()->pb;
    new_partition_list_version = table_pb.partition_list_version() + 1;
    table_pb.set_partition_list_version(new_partition_list_version);

    tablet_write_lock->mutable_data()->pb.add_split_tablet_ids(new_tablet->id());
    RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), table, new_tablet, source_tablet_info));

    MAYBE_FAULT(FLAGS_TEST_crash_after_creating_single_split_tablet);

    table->AddTablet(new_tablet);
    // TODO: We use this pattern in other places, but what if a concurrent thread accesses the
    // not-yet-committed TabletInfo from the `table`?
    new_tablet->mutable_metadata()->CommitMutation();

    auto tablet_map_checkout = tablet_map_.CheckOut();
    (*tablet_map_checkout)[new_tablet->id()] = new_tablet;
  }
  LOG(INFO) << "Registered new tablet " << new_tablet->tablet_id()
            << " (" << AsString(partition) << ") to split the tablet "
            << source_tablet_info->tablet_id()
            << " (" << AsString(source_tablet_meta.partition())
            << ") for table " << table->ToString()
            << ", new partition_list_version: " << new_partition_list_version;

  return new_tablet;
}

Status CatalogManager::GetTableSchema(const GetTableSchemaRequestPB* req,
                                      GetTableSchemaResponsePB* resp) {
  VLOG(1) << "Servicing GetTableSchema request for " << req->ShortDebugString();

  // Lookup the table and verify if it exists.
  TRACE("Looking up table");
  scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table()));

  // Due to differences in the way proxies handle version mismatch (pull for YQL vs. push for SQL):
  // For YQL tables, we will return the "set of indexes" being applied instead of the ones
  // that are fully completed.
  // For PGSQL (and other) tables, we want to return the fully applied schema.
  const bool get_fully_applied_indexes = table->GetTableType() != TableType::YQL_TABLE_TYPE;
  return GetTableSchemaInternal(req, resp, get_fully_applied_indexes);
}

Status CatalogManager::GetTableSchemaInternal(const GetTableSchemaRequestPB* req,
                                              GetTableSchemaResponsePB* resp,
                                              bool get_fully_applied_indexes) {
  VLOG(1) << "Servicing GetTableSchema request for " << req->ShortDebugString();

  // Lookup the table and verify if it exists.
  TRACE("Looking up table");
  scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table()));

  TRACE("Locking table");
  auto l = table->LockForRead();
  RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp));

  if (l->pb.has_fully_applied_schema()) {
    // An AlterTable is in progress; fully_applied_schema is the last
    // schema that has reached every TS.
    DCHECK(l->pb.state() == SysTablesEntryPB::ALTERING);
    resp->mutable_schema()->CopyFrom(l->pb.fully_applied_schema());
  } else {
    // There's no AlterTable; the regular schema is "fully applied".
    resp->mutable_schema()->CopyFrom(l->pb.schema());
  }

  if (get_fully_applied_indexes && l->pb.has_fully_applied_schema()) {
    resp->set_version(l->pb.fully_applied_schema_version());
    resp->mutable_indexes()->CopyFrom(l->pb.fully_applied_indexes());
    if (l->pb.has_fully_applied_index_info()) {
      resp->set_obsolete_indexed_table_id(GetIndexedTableId(l->pb));
      *resp->mutable_index_info() = l->pb.fully_applied_index_info();
    }
    VLOG(1) << "Returning"
            << "\nfully_applied_schema with version "
            << l->pb.fully_applied_schema_version()
            << ":\n"
            << yb::ToString(l->pb.fully_applied_indexes())
            << "\ninstead of schema with version "
            << l->pb.version()
            << ":\n"
            << yb::ToString(l->pb.indexes());
  } else {
    resp->set_version(l->pb.version());
    resp->mutable_indexes()->CopyFrom(l->pb.indexes());
    if (l->pb.has_index_info()) {
      resp->set_obsolete_indexed_table_id(GetIndexedTableId(l->pb));
      *resp->mutable_index_info() = l->pb.index_info();
    }
    VLOG(3) << "Returning"
            << "\nschema with version "
            << l->pb.version()
            << ":\n"
            << yb::ToString(l->pb.indexes());
  }
  resp->set_is_compatible_with_previous_version(l->pb.updates_only_index_permissions());
  resp->mutable_partition_schema()->CopyFrom(l->pb.partition_schema());
  if (IsReplicationInfoSet(l->pb.replication_info())) {
    resp->mutable_replication_info()->CopyFrom(l->pb.replication_info());
  }
  resp->set_create_table_done(!table->IsCreateInProgress());
  resp->set_table_type(table->metadata().state().pb.table_type());
  resp->mutable_identifier()->set_table_name(l->pb.name());
  resp->mutable_identifier()->set_table_id(table->id());
  resp->mutable_identifier()->mutable_namespace_()->set_id(table->namespace_id());
  auto nsinfo = FindNamespaceById(table->namespace_id());
  if (nsinfo.ok()) {
    resp->mutable_identifier()->mutable_namespace_()->set_name((**nsinfo).name());
  }

  if (l->pb.has_wal_retention_secs()) {
    resp->set_wal_retention_secs(l->pb.wal_retention_secs());
  }

  // Get the namespace name by id.
  SharedLock lock(mutex_);
  TRACE("Looking up namespace");
  const scoped_refptr<NamespaceInfo> ns = FindPtrOrNull(namespace_ids_map_, table->namespace_id());

  if (ns == nullptr) {
    Status s = STATUS_SUBSTITUTE(
        NotFound, "Could not find namespace by namespace id $0 for request $1.",
        table->namespace_id(), req->DebugString());
    return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s);
  }

  resp->mutable_identifier()->mutable_namespace_()->set_name(ns->name());

  resp->set_colocated(table->colocated());

  VLOG(1) << "Serviced GetTableSchema request for " << req->ShortDebugString() << " with "
          << yb::ToString(*resp);
  return Status::OK();
}
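
// An illustrative caller-side sketch (the `catalog_manager` handle and the caller's cache are
// assumptions, not from this file): during an ALTER, a client reading the schema gets the
// fully-applied version, so it can detect staleness by comparing against resp.version():
//
//   GetTableSchemaRequestPB schema_req;
//   GetTableSchemaResponsePB schema_resp;
//   schema_req.mutable_table()->set_table_id(table_id);
//   RETURN_NOT_OK(catalog_manager->GetTableSchema(&schema_req, &schema_resp));
//   if (schema_resp.version() > cached_schema_version) {
//     // Refresh the cached schema; resp.is_compatible_with_previous_version() indicates
//     // whether the version bump only changed index permissions.
//   }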

Status CatalogManager::GetTablegroupSchema(const GetTablegroupSchemaRequestPB* req,
                                           GetTablegroupSchemaResponsePB* resp) {
  VLOG(1) << "Servicing GetTablegroupSchema request for " << req->ShortDebugString();
  if (!req->parent_tablegroup().has_id()) {
    Status s = STATUS(InvalidArgument, "Invalid get tablegroup request (missing fields)");
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
  }

  const std::string& tablegroupId = req->parent_tablegroup().id();
  if (!IsTablegroupParentTableId(tablegroupId)) {
    Status s = STATUS(InvalidArgument, "Received a non-tablegroup ID");
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
  }

  // Strip the suffix from the tablegroup ID request (since tablegroup_ids_map_
  // only accepts the plain ID).
  DCHECK(boost::algorithm::ends_with(tablegroupId, master::kTablegroupParentTableIdSuffix));
  size_t tgid_len = tablegroupId.size() - strlen(master::kTablegroupParentTableIdSuffix);
  TablegroupId tgid = tablegroupId.substr(0, tgid_len);

  // Lookup the tablegroup.
  std::unordered_set<TableId> tablesInTablegroup;
  {
    SharedLock lock(mutex_);

    if (tablegroup_ids_map_.find(tgid) == tablegroup_ids_map_.end()) {
      return STATUS(NotFound, Substitute("Tablegroup not found for tablegroup id: $0",
                                         req->parent_tablegroup().id()));
    }
    scoped_refptr<TablegroupInfo> tginfo = tablegroup_ids_map_[tgid];
    tablesInTablegroup = tginfo->ChildTables();
  }

  for (const auto& t : tablesInTablegroup) {
    TRACE("Looking up table");
    GetTableSchemaRequestPB schemaReq;
    GetTableSchemaResponsePB schemaResp;
    schemaReq.mutable_table()->set_table_id(t);
    Status s = GetTableSchema(&schemaReq, &schemaResp);
    if (!s.ok() || schemaResp.has_error()) {
      LOG(ERROR) << "Error while getting table schema: " << s;
      return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s);
    }
    resp->add_get_table_schema_response_pbs()->Swap(&schemaResp);
  }

  return Status::OK();
}

Status CatalogManager::GetColocatedTabletSchema(const GetColocatedTabletSchemaRequestPB* req,
                                                GetColocatedTabletSchemaResponsePB* resp) {
  VLOG(1) << "Servicing GetColocatedTabletSchema request for " << req->ShortDebugString();

  // Lookup the given parent colocated table and verify if it exists.
  TRACE("Looking up table");
  auto parent_colocated_table = VERIFY_RESULT(FindTable(req->parent_colocated_table()));
  {
    TRACE("Locking table");
    auto l = parent_colocated_table->LockForRead();
    RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp));
  }

  if (!parent_colocated_table->colocated() || !parent_colocated_table->IsColocatedParentTable()) {
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_TABLE_TYPE,
                      STATUS(InvalidArgument, "Table provided is not a parent colocated table"));
  }

  // Next, get all the user tables that are in the database.
  ListTablesRequestPB listTablesReq;
  ListTablesResponsePB ListTablesResp;

  listTablesReq.mutable_namespace_()->set_id(parent_colocated_table->namespace_id());
  listTablesReq.mutable_namespace_()->set_database_type(YQL_DATABASE_PGSQL);
  listTablesReq.set_exclude_system_tables(true);
  Status status = ListTables(&listTablesReq, &ListTablesResp);
  if (!status.ok() || ListTablesResp.has_error()) {
    LOG(ERROR) << "Error while listing tables: " << status;
    return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, status);
  }

  // Get the table schema for each colocated table.
  for (const auto& t : ListTablesResp.tables()) {
    // Need to check if this table is colocated first.
    TRACE("Looking up table");
    scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTableById(t.id()));

    if (table->colocated()) {
      // Now we can get the schema for this table.
      GetTableSchemaRequestPB schemaReq;
      GetTableSchemaResponsePB schemaResp;
      schemaReq.mutable_table()->set_table_id(t.id());
      status = GetTableSchema(&schemaReq, &schemaResp);
      if (!status.ok() || schemaResp.has_error()) {
        LOG(ERROR) << "Error while getting table schema: " << status;
        return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, status);
      }
      resp->add_get_table_schema_response_pbs()->Swap(&schemaResp);
    }
  }

  return Status::OK();
}

Status CatalogManager::ListTables(const ListTablesRequestPB* req,
                                  ListTablesResponsePB* resp) {
  NamespaceId namespace_id;

  // Validate namespace.
  if (req->has_namespace_()) {
    // Lookup the namespace and verify if it exists.
    auto ns = VERIFY_NAMESPACE_FOUND(FindNamespace(req->namespace_()), resp);

    auto ns_lock = ns->LockForRead();
    namespace_id = ns->id();

    // Don't list tables with a namespace that isn't running.
    if (ns->state() != SysNamespaceEntryPB::RUNNING) {
      LOG(INFO) << "ListTables request for a Namespace not running (State="
                << SysNamespaceEntryPB::State_Name(ns->state()) << ")";
      return Status::OK();
    }
  }

  bool has_rel_filter = req->relation_type_filter_size() > 0;
  bool include_user_table = has_rel_filter ? false : true;
  bool include_user_index = has_rel_filter ? false : true;
  bool include_system_table = req->exclude_system_tables() ? false
      : (has_rel_filter ? false : true);

  for (const auto &relation : req->relation_type_filter()) {
    if (relation == SYSTEM_TABLE_RELATION) {
      include_system_table = true;
    } else if (relation == USER_TABLE_RELATION) {
      include_user_table = true;
    } else if (relation == INDEX_TABLE_RELATION) {
      include_user_index = true;
    }
  }

  SharedLock lock(mutex_);
  RelationType relation_type;

  for (const auto& entry : *table_ids_map_) {
    auto& table_info = *entry.second;
    auto ltm = table_info.LockForRead();

    if (!ltm->visible_to_client() && !req->include_not_running()) {
      continue;
    }

    if (!namespace_id.empty() && namespace_id != table_info.namespace_id()) {
      continue; // Skip tables from other namespaces.
    }

    if (req->has_name_filter()) {
      size_t found = ltm->name().find(req->name_filter());
      if (found == string::npos) {
        continue;
      }
    }

    if (IsUserIndexUnlocked(table_info)) {
      if (!include_user_index) {
        continue;
      }
      relation_type = INDEX_TABLE_RELATION;
    } else if (IsUserTableUnlocked(table_info)) {
      if (!include_user_table) {
        continue;
      }
      relation_type = USER_TABLE_RELATION;
    } else {
      if (!include_system_table) {
        continue;
      }
      relation_type = SYSTEM_TABLE_RELATION;
    }

    NamespaceIdentifierPB ns_identifier;
    ns_identifier.set_id(ltm->namespace_id());
    auto ns = FindNamespaceUnlocked(ns_identifier);
    if (!ns.ok() || (**ns).state() != SysNamespaceEntryPB::RUNNING) {
      if (PREDICT_FALSE(FLAGS_TEST_return_error_if_namespace_not_found)) {
        VERIFY_NAMESPACE_FOUND(std::move(ns), resp);
      }
      LOG(ERROR) << "Unable to find namespace with id " << ltm->namespace_id()
                 << " for table " << ltm->name();
      continue;
    }

    ListTablesResponsePB::TableInfo *table = resp->add_tables();
    {
      auto namespace_lock = (**ns).LockForRead();
      table->mutable_namespace_()->set_id((**ns).id());
      table->mutable_namespace_()->set_name(namespace_lock->name());
      table->mutable_namespace_()->set_database_type(namespace_lock->pb.database_type());
    }
    table->set_id(entry.second->id());
    table->set_name(ltm->name());
    table->set_table_type(ltm->table_type());
    table->set_relation_type(relation_type);
    table->set_state(ltm->pb.state());
    table->set_pgschema_name(ltm->schema().pgschema_name());
  }
  return Status::OK();
}
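
// An illustrative request sketch (field names are taken from the usage above; the namespace name
// is hypothetical): listing only user indexes in one namespace. Because any relation_type_filter
// entry disables the default include flags, user tables and system tables are excluded:
//
//   ListTablesRequestPB list_req;
//   ListTablesResponsePB list_resp;
//   list_req.mutable_namespace_()->set_name("my_keyspace");   // assumed namespace name
//   list_req.add_relation_type_filter(INDEX_TABLE_RELATION);
//   RETURN_NOT_OK(catalog_manager->ListTables(&list_req, &list_resp));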

boost::optional<TablegroupId> CatalogManager::FindTablegroupByTableId(const TableId& table_id) {
  SharedLock lock(mutex_);

  for (const auto& tablegroup : tablegroup_ids_map_) {
    const auto& tgid = tablegroup.first;
    const auto& tginfo = tablegroup.second;
    for (const auto& t : tginfo->ChildTables()) {
      if (table_id == t) {
        return boost::optional<TablegroupId>(tgid + kTablegroupParentTableIdSuffix);
      }
    }
  }

  return boost::none;
}

scoped_refptr<TableInfo> CatalogManager::GetTableInfo(const TableId& table_id) {
  SharedLock lock(mutex_);
  return FindPtrOrNull(*table_ids_map_, table_id);
}

scoped_refptr<TableInfo> CatalogManager::GetTableInfoFromNamespaceNameAndTableName(
    YQLDatabase db_type, const NamespaceName& namespace_name, const TableName& table_name) {
  if (db_type == YQL_DATABASE_PGSQL)
    return nullptr;
  SharedLock lock(mutex_);
  const auto ns = FindPtrOrNull(namespace_names_mapper_[db_type], namespace_name);
  return ns
    ? FindPtrOrNull(table_names_map_, {ns->id(), table_name})
    : nullptr;
}

scoped_refptr<TableInfo> CatalogManager::GetTableInfoUnlocked(const TableId& table_id) {
  return FindPtrOrNull(*table_ids_map_, table_id);
}

std::vector<TableInfoPtr> CatalogManager::GetTables(GetTablesMode mode) {
  std::vector<TableInfoPtr> result;
  {
    SharedLock lock(mutex_);
    result.reserve(table_ids_map_->size());
    for (const auto& e : *table_ids_map_) {
      result.push_back(e.second);
    }
  }
  switch (mode) {
    case GetTablesMode::kAll:
      return result;
    case GetTablesMode::kRunning: {
      auto filter = [](const TableInfoPtr& table_info) { return !table_info->is_running(); };
      EraseIf(filter, &result);
      return result;
    }
    case GetTablesMode::kVisibleToClient: {
      auto filter = [](const TableInfoPtr& table_info) {
        return !table_info->LockForRead()->visible_to_client();
      };
      EraseIf(filter, &result);
      return result;
    }
  }
  FATAL_INVALID_ENUM_VALUE(GetTablesMode, mode);
}

void CatalogManager::GetAllNamespaces(std::vector<scoped_refptr<NamespaceInfo>>* namespaces,
                                      bool includeOnlyRunningNamespaces) {
  namespaces->clear();
  SharedLock lock(mutex_);
  for (const NamespaceInfoMap::value_type& e : namespace_ids_map_) {
    if (includeOnlyRunningNamespaces && e.second->state() != SysNamespaceEntryPB::RUNNING) {
      continue;
    }
    namespaces->push_back(e.second);
  }
}

void CatalogManager::GetAllUDTypes(std::vector<scoped_refptr<UDTypeInfo>>* types) {
  types->clear();
  SharedLock lock(mutex_);
  for (const UDTypeInfoMap::value_type& e : udtype_ids_map_) {
    types->push_back(e.second);
  }
}

std::vector<std::shared_ptr<MonitoredTask>> CatalogManager::GetRecentTasks() {
  return tasks_tracker_->GetTasks();
}

std::vector<std::shared_ptr<MonitoredTask>> CatalogManager::GetRecentJobs() {
  return jobs_tracker_->GetTasks();
}

NamespaceName CatalogManager::GetNamespaceNameUnlocked(const NamespaceId& id) const {
  const scoped_refptr<NamespaceInfo> ns = FindPtrOrNull(namespace_ids_map_, id);
  return ns == nullptr ? NamespaceName() : ns->name();
}

NamespaceName CatalogManager::GetNamespaceName(const NamespaceId& id) const {
  TRACE("Acquired catalog manager lock");
  SharedLock lock(mutex_);
  return GetNamespaceNameUnlocked(id);
}

NamespaceName CatalogManager::GetNamespaceNameUnlocked(
    const scoped_refptr<TableInfo>& table) const {
  return GetNamespaceNameUnlocked(table->namespace_id());
}

NamespaceName CatalogManager::GetNamespaceName(const scoped_refptr<TableInfo>& table) const {
  return GetNamespaceName(table->namespace_id());
}

bool CatalogManager::IsSystemTable(const TableInfo& table) const {
  return table.is_system();
}

// True if the table was created by a user.
// The table can be a regular table or an index in this case.
bool CatalogManager::IsUserCreatedTable(const TableInfo& table) const {
  SharedLock lock(mutex_);
  return IsUserCreatedTableUnlocked(table);
}

bool CatalogManager::IsUserCreatedTableUnlocked(const TableInfo& table) const {
  if (table.GetTableType() == PGSQL_TABLE_TYPE || table.GetTableType() == YQL_TABLE_TYPE) {
    if (!IsSystemTable(table) && !IsSequencesSystemTable(table) &&
        GetNamespaceNameUnlocked(table.namespace_id()) != kSystemNamespaceName &&
        !table.IsColocatedParentTable() &&
        !table.IsTablegroupParentTable()) {
      return true;
    }
  }
  return false;
}

bool CatalogManager::IsUserTable(const TableInfo& table) const {
  SharedLock lock(mutex_);
  return IsUserTableUnlocked(table);
}

bool CatalogManager::IsUserTableUnlocked(const TableInfo& table) const {
  return IsUserCreatedTableUnlocked(table) && table.indexed_table_id().empty();
}

bool CatalogManager::IsUserIndex(const TableInfo& table) const {
  SharedLock lock(mutex_);
  return IsUserIndexUnlocked(table);
}

bool CatalogManager::IsUserIndexUnlocked(const TableInfo& table) const {
  return IsUserCreatedTableUnlocked(table) && !table.indexed_table_id().empty();
}

bool CatalogManager::IsTablegroupParentTableId(const TableId& table_id) const {
  return table_id.find(kTablegroupParentTableIdSuffix) != std::string::npos;
}

bool CatalogManager::IsColocatedParentTableId(const TableId& table_id) const {
  return table_id.find(kColocatedParentTableIdSuffix) != std::string::npos;
}

bool CatalogManager::IsSequencesSystemTable(const TableInfo& table) const {
  if (table.GetTableType() == PGSQL_TABLE_TYPE && !table.IsColocatedParentTable()
                                               && !table.IsTablegroupParentTable()) {
    // This case commonly occurs during unit testing. Avoid an unnecessary assert within Get().
    if (!IsPgsqlId(table.namespace_id()) || !IsPgsqlId(table.id())) {
      LOG(WARNING) << "Not PGSQL IDs " << table.namespace_id() << ", " << table.id();
      return false;
    }
    Result<uint32_t> database_oid = GetPgsqlDatabaseOid(table.namespace_id());
    if (!database_oid.ok()) {
      LOG(WARNING) << "Invalid Namespace ID " << table.namespace_id();
      return false;
    }
    Result<uint32_t> table_oid = GetPgsqlTableOid(table.id());
    if (!table_oid.ok()) {
      LOG(WARNING) << "Invalid Table ID " << table.id();
      return false;
    }
    if (*database_oid == kPgSequencesDataDatabaseOid && *table_oid == kPgSequencesDataTableOid) {
      return true;
    }
  }
  return false;
}
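
// An illustrative sketch (the constants and helpers are the ones used in the function above;
// only the packaging into a standalone predicate is new) of the two-step OID check: a YSQL
// table id encodes the database and table OIDs, and the sequences data table is identified by
// one fixed (database_oid, table_oid) pair:
//
//   auto database_oid = GetPgsqlDatabaseOid(table.namespace_id());  // parses the namespace id
//   auto table_oid = GetPgsqlTableOid(table.id());                  // parses the table id
//   bool is_sequences_table = database_oid.ok() && table_oid.ok() &&
//                             *database_oid == kPgSequencesDataDatabaseOid &&
//                             *table_oid == kPgSequencesDataTableOid;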

void CatalogManager::NotifyTabletDeleteFinished(const TabletServerId& tserver_uuid,
                                                const TabletId& tablet_id,
                                                const TableInfoPtr& table) {
  shared_ptr<TSDescriptor> ts_desc;
  if (!master_->ts_manager()->LookupTSByUUID(tserver_uuid, &ts_desc)) {
    LOG(WARNING) << "Unable to find tablet server " << tserver_uuid;
  } else if (!ts_desc->IsTabletDeletePending(tablet_id)) {
    LOG(WARNING) << "Pending delete for tablet " << tablet_id << " in ts "
                 << tserver_uuid << " doesn't exist";
  } else {
    LOG(INFO) << "Clearing pending delete for tablet " << tablet_id << " in ts " << tserver_uuid;
    ts_desc->ClearPendingTabletDelete(tablet_id);
  }
  CheckTableDeleted(table);
}

bool CatalogManager::ReplicaMapDiffersFromConsensusState(const scoped_refptr<TabletInfo>& tablet,
                                                         const ConsensusStatePB& cstate) {
  auto locs = tablet->GetReplicaLocations();
  if (locs->size() != implicit_cast<size_t>(cstate.config().peers_size())) {
    return true;
  }
  for (auto iter = cstate.config().peers().begin(); iter != cstate.config().peers().end();
       iter++) {
    if (locs->find(iter->permanent_uuid()) == locs->end()) {
      return true;
    }
  }
  return false;
}
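
// ---------------------------------------------------------------------------
// ReplicaMapDiffersFromConsensusState() above exploits the fact that both the
// replica map and the committed config are keyed by peer UUID: if the sizes
// match and every peer appears in the map, the two key sets must be equal, so
// a one-way membership scan suffices. A standalone sketch of that set
// comparison (the payload type is a hypothetical placeholder):
// ---------------------------------------------------------------------------
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

namespace {

struct ReplicaState {};  // placeholder for per-replica bookkeeping

bool DiffersFromPeers(const std::unordered_map<std::string, ReplicaState>& replicas,
                      const std::vector<std::string>& peer_uuids) {
  if (replicas.size() != peer_uuids.size()) {
    return true;
  }
  for (const auto& uuid : peer_uuids) {
    if (replicas.find(uuid) == replicas.end()) {
      return true;
    }
  }
  return false;  // equal sizes plus one-way containment => identical key sets
}

}  // namespace

int main() {
  const std::unordered_map<std::string, ReplicaState> replicas = {{"ts-1", {}}, {"ts-2", {}}};
  std::cout << std::boolalpha
            << DiffersFromPeers(replicas, {"ts-1", "ts-2"}) << "\n"   // false
            << DiffersFromPeers(replicas, {"ts-1", "ts-3"}) << "\n";  // true
  return 0;
}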

namespace {

int64_t GetCommittedConsensusStateOpIdIndex(const ReportedTabletPB& report) {
  if (!report.has_committed_consensus_state() ||
      !report.committed_consensus_state().config().has_opid_index()) {
    return consensus::kInvalidOpIdIndex;
  }

  return report.committed_consensus_state().config().opid_index();
}

} // namespace

bool CatalogManager::ProcessCommittedConsensusState(
    TSDescriptor* ts_desc,
    bool is_incremental,
    const ReportedTabletPB& report,
    const TableInfo::WriteLock& table_lock,
    const TabletInfoPtr& tablet,
    const TabletInfo::WriteLock& tablet_lock,
    std::vector<RetryingTSRpcTaskPtr>* rpcs) {
  const ConsensusStatePB& prev_cstate = tablet_lock->pb.committed_consensus_state();
  ConsensusStatePB cstate = report.committed_consensus_state();
  bool tablet_was_mutated = false;

  // 6a. The master only processes reports for replicas with committed
  // consensus configurations since it needs the committed index to only
  // cache the most up-to-date config. Since it's possible for TOMBSTONED
  // replicas with no ConsensusMetadata on disk to be reported as having no
  // committed config opid_index, we skip over those replicas.
  if (!cstate.config().has_opid_index()) {
    LOG(WARNING) << "Missing opid_index in reported config: " << report.ShortDebugString();
    return false;
  }
  if (PREDICT_TRUE(FLAGS_master_ignore_stale_cstate) &&
        (cstate.current_term() < prev_cstate.current_term() ||
         GetCommittedConsensusStateOpIdIndex(report) < prev_cstate.config().opid_index())) {
    LOG(WARNING) << "Stale heartbeat for Tablet " << tablet->ToString()
                 << " on TS " << ts_desc->permanent_uuid()
                 << ", cstate=" << cstate.ShortDebugString()
                 << ", prev_cstate=" << prev_cstate.ShortDebugString();
    return false;
  }

  // 6b. Disregard the leader state if the reported leader is not a member
  // of the committed config.
  if (cstate.leader_uuid().empty() ||
      !IsRaftConfigMember(cstate.leader_uuid(), cstate.config())) {
    cstate.clear_leader_uuid();
    tablet_was_mutated = true;
  }

  // 6c. Mark the tablet as RUNNING if it makes sense to do so.
  //
  // We need to wait for a leader before marking a tablet as RUNNING, or
  // else we could incorrectly consider a tablet created when only a
  // minority of its replicas were successful. In that case, the tablet
  // would be stuck in this bad state forever.
  // - FLAG added to avoid waiting during mock tests.
  if (!tablet_lock->is_running() &&
      report.state() == tablet::RUNNING &&
        (cstate.has_leader_uuid() ||
        !FLAGS_catalog_manager_wait_for_new_tablets_to_elect_leader)) {
    DCHECK_EQ(SysTabletsEntryPB::CREATING, tablet_lock->pb.state())
        << "Tablet in unexpected state: " << tablet->ToString()
        << ": " << tablet_lock->pb.ShortDebugString();
    VLOG(1) << "Tablet " << tablet->ToString() << " is now online";
    tablet_lock.mutable_data()->set_state(SysTabletsEntryPB::RUNNING,
        "Tablet reported with an active leader");
    tablet_was_mutated = true;
  }

  // 6d. Update the consensus state if:
  // - A config change operation was committed (reflected by a change to
  //   the committed config's opid_index).
  // - The new cstate has a leader, and either the old cstate didn't, or
  //   there was a term change.
  if (cstate.config().opid_index() > prev_cstate.config().opid_index() ||
      (cstate.has_leader_uuid() &&
          (!prev_cstate.has_leader_uuid() ||
              cstate.current_term() > prev_cstate.current_term()))) {

    // 6d(i). Retain knowledge of the leader even if it wasn't reported in
    // the latest config.
    //
    // When a config change is reported to the master, it may not include the
    // leader because the follower doing the reporting may not know who the
    // leader is yet (it may have just started up). It is safe to reuse
    // the previous leader if the reported cstate has the same term as the
    // previous cstate, and the leader was known for that term.
    if (cstate.current_term() == prev_cstate.current_term()) {
      if (!cstate.has_leader_uuid() && prev_cstate.has_leader_uuid()) {
        cstate.set_leader_uuid(prev_cstate.leader_uuid());
        // Sanity check to detect consensus divergence bugs.
      } else if (cstate.has_leader_uuid() && prev_cstate.has_leader_uuid() &&
          cstate.leader_uuid() != prev_cstate.leader_uuid()) {
        string msg = Substitute("Previously reported cstate for tablet $0 gave "
                                "a different leader for term $1 than the current cstate. "
                                "Previous cstate: $2. Current cstate: $3.",
            tablet->ToString(), cstate.current_term(),
            prev_cstate.ShortDebugString(), cstate.ShortDebugString());
        LOG(DFATAL) << msg;
        return false;
      }
    }

    // 6d(ii). Delete any replicas from the previous config that are not in the new one.
    if (FLAGS_master_tombstone_evicted_tablet_replicas) {
      std::unordered_set<string> current_member_uuids;
      for (const consensus::RaftPeerPB &peer : cstate.config().peers()) {
        InsertOrDie(&current_member_uuids, peer.permanent_uuid());
      }
      for (const consensus::RaftPeerPB &prev_peer : prev_cstate.config().peers()) {
        const string& peer_uuid = prev_peer.permanent_uuid();
        if (!ContainsKey(current_member_uuids, peer_uuid)) {
          // Don't delete a tablet server that hasn't reported in yet (Bootstrapping).
          shared_ptr<TSDescriptor> dummy_ts_desc;
          if (!master_->ts_manager()->LookupTSByUUID(peer_uuid, &dummy_ts_desc)) {
            continue;
          }
          // Otherwise, the TabletServer needs to remove this peer.
          rpcs->push_back(std::make_shared<AsyncDeleteReplica>(
              master_, AsyncTaskPool(), peer_uuid, tablet->table(), tablet->tablet_id(),
              TABLET_DATA_TOMBSTONED, prev_cstate.config().opid_index(),
              Substitute("TS $0 not found in new config with opid_index $1",
                  peer_uuid, cstate.config().opid_index())));
        }
      }
    }
    // 6d(iii). Update the in-memory ReplicaLocations for this tablet using the new config.
    VLOG(2) << "Updating replicas for tablet " << tablet->tablet_id()
            << " using config reported by " << ts_desc->permanent_uuid()
            << " to that committed in log index " << cstate.config().opid_index()
            << " with leader state from term " << cstate.current_term();
    ReconcileTabletReplicasInLocalMemoryWithReport(
        tablet, ts_desc->permanent_uuid(), cstate, report);

    // 6d(iv). Update the consensus state. Don't use 'prev_cstate' after this.
    LOG(INFO) << "Tablet: " << tablet->tablet_id() << " reported consensus state change."
              << " New consensus state: " << cstate.ShortDebugString()
              << " from " << ts_desc->permanent_uuid();
    *tablet_lock.mutable_data()->pb.mutable_committed_consensus_state() = cstate;
    tablet_was_mutated = true;
  } else {
    // Report opid_index is equal to the previous opid_index. If some
    // replica is reporting the same consensus configuration we already know about, but we
    // haven't yet heard from all the tservers in the config, update the in-memory
    // ReplicaLocations.
    LOG(INFO) << "Peer " << ts_desc->permanent_uuid() << " sent "
              << (is_incremental ? "incremental" : "full tablet")
              << " report for " << tablet->tablet_id()
              << ", prev state op id: " << prev_cstate.config().opid_index()
              << ", prev state term: " << prev_cstate.current_term()
              << ", prev state has_leader_uuid: " << prev_cstate.has_leader_uuid()
              << ". Consensus state: " << cstate.ShortDebugString();
    if (GetAtomicFlag(&FLAGS_enable_register_ts_from_raft) &&
        ReplicaMapDiffersFromConsensusState(tablet, cstate)) {
      ReconcileTabletReplicasInLocalMemoryWithReport(
          tablet, ts_desc->permanent_uuid(), cstate, report);
    } else {
      UpdateTabletReplicaInLocalMemory(ts_desc, &cstate, report, tablet);
    }
  }

  if (FLAGS_use_create_table_leader_hint &&
      !cstate.has_leader_uuid() && cstate.current_term() == 0) {
    StartElectionIfReady(cstate, tablet.get());
  }

  // 7. Send an AlterSchema RPC if the tablet has an old schema version.
  if (report.has_schema_version() &&
      report.schema_version() != table_lock->pb.version()) {
    if (report.schema_version() > table_lock->pb.version()) {
      LOG(ERROR) << "TS " << ts_desc->permanent_uuid()
                 << " has reported a schema version greater than the current one "
                 << " for tablet " << tablet->ToString()
                 << ". Expected version " << table_lock->pb.version()
                 << " got " << report.schema_version()
                 << " (corruption)";
    } else {
      // TODO: For Alter (rolling apply to tablets), this is an expected transitory state.
      LOG(INFO) << "TS " << ts_desc->permanent_uuid()
                << " does not have the latest schema for tablet " << tablet->ToString()
                << ". Expected version " << table_lock->pb.version()
                << " got " << report.schema_version();
    }
    // It's possible that the tablet being reported is a laggy replica, and in fact
    // the leader has already received an AlterTable RPC. That's OK, though --
    // it'll safely ignore it if we send another.
    TransactionId txn_id = TransactionId::Nil();
    if (table_lock->pb.has_transaction() &&
        table_lock->pb.transaction().has_transaction_id()) {
      LOG(INFO) << "Parsing transaction ID for tablet ID " << tablet->tablet_id();
      auto txn_id_res = FullyDecodeTransactionId(table_lock->pb.transaction().transaction_id());
      if (!txn_id_res.ok()) {
        LOG(WARNING) << "Parsing transaction ID failed for tablet ID " << tablet->tablet_id();
        return false;
      }
      txn_id = txn_id_res.get();
    }
    LOG(INFO) << "Triggering AlterTable with transaction ID " << txn_id
              << " due to heartbeat delay for tablet ID " << tablet->tablet_id();
    rpcs->push_back(std::make_shared<AsyncAlterTable>(
        master_, AsyncTaskPool(), tablet, tablet->table(), txn_id));
  }

  return tablet_was_mutated;
}
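
// ---------------------------------------------------------------------------
// The stale-heartbeat guard near the top of ProcessCommittedConsensusState()
// rejects a report when either the Raft term or the committed config's
// opid_index went backwards relative to what the master has persisted. A
// standalone sketch of that two-coordinate comparison (the struct is a
// hypothetical simplification of ConsensusStatePB):
// ---------------------------------------------------------------------------
#include <cstdint>
#include <iostream>

namespace {

struct ConsensusSnapshot {
  int64_t current_term;
  int64_t config_opid_index;
};

// Mirrors the FLAGS_master_ignore_stale_cstate check: a report is stale if it
// regresses on either coordinate of (term, committed config index).
bool IsStaleReport(const ConsensusSnapshot& reported, const ConsensusSnapshot& persisted) {
  return reported.current_term < persisted.current_term ||
         reported.config_opid_index < persisted.config_opid_index;
}

}  // namespace

int main() {
  const ConsensusSnapshot persisted{5, 17};
  std::cout << std::boolalpha
            << IsStaleReport({4, 17}, persisted) << "\n"   // true: term went backwards
            << IsStaleReport({5, 18}, persisted) << "\n";  // false: strictly newer config
  return 0;
}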

Status CatalogManager::ProcessTabletReportBatch(
    TSDescriptor* ts_desc,
    bool is_incremental,
    ReportedTablets::const_iterator begin,
    ReportedTablets::const_iterator end,
    TabletReportUpdatesPB* full_report_update,
    std::vector<RetryingTSRpcTaskPtr>* rpcs) {
  // 1. First Pass. Iterate in TabletId Order to discover all Table locks we'll need. Even though
  //    read locks are sufficient here, take write locks since we'll be writing to the tablet while
  //    holding this.
  //    Need to acquire both types of locks in Id order to prevent deadlock.
  std::map<TableId, TableInfo::WriteLock> table_write_locks;
  for (auto it = begin; it != end; ++it) {
    auto& lock = table_write_locks[it->info->table()->id()];
    if (!lock.locked()) {
      lock = it->info->table()->LockForWrite();
    }
  }

  map<TabletId, TabletInfo::WriteLock> tablet_write_locks; // used for unlock.
  // 2. Second Pass.  Process each tablet. This may not be in the order that the tablets
  // appear in 'full_report', but that has no bearing on correctness.
  vector<TabletInfo*> mutated_tablets; // refcount protected by 'tablet_infos'
  for (auto it = begin; it != end; ++it) {
    const auto& tablet_id = it->tablet_id;
    const TabletInfoPtr& tablet = it->info;
    const ReportedTabletPB& report = *it->report;
    const TableInfoPtr& table = tablet->table();

    // Prepare a heartbeat response entry for this tablet, now that we're going to process it.
    // Every tablet in the report that is processed gets one, even if there are no changes to it.
    ReportedTabletUpdatesPB* update = full_report_update->add_tablets();
    update->set_tablet_id(tablet_id);

    // Get tablet lock on demand.  This works in the batch case because the loop is ordered.
    tablet_write_locks[tablet_id] = tablet->LockForWrite();
    auto& table_lock = table_write_locks[table->id()];
    auto& tablet_lock = tablet_write_locks[tablet_id];

    TRACE_EVENT1("master", "HandleReportedTablet", "tablet_id", report.tablet_id());
    RETURN_NOT_OK_PREPEND(CheckIsLeaderAndReady(),
        Substitute("This master is no longer the leader, unable to handle report for tablet $0",
            tablet_id));

    VLOG(3) << "tablet report: " << report.ShortDebugString();

    // 3. Delete the tablet if it (or its table) has been deleted.
    if (tablet_lock->is_deleted() ||
        table_lock->started_deleting()) {
      const string msg = tablet_lock->pb.state_msg();
      update->set_state_msg(msg);
      LOG(INFO) << "Got report from deleted tablet " << tablet->ToString()
                << " (" << msg << "): Sending delete request for this tablet";
      // TODO(unknown): Cancel tablet creation, instead of deleting, in cases
      // where that might be possible (tablet creation timeout & replacement).
      rpcs->push_back(std::make_shared<AsyncDeleteReplica>(
          master_, AsyncTaskPool(), ts_desc->permanent_uuid(), table, tablet_id,
          TABLET_DATA_DELETED, boost::none, msg));
      ts_desc->AddPendingTabletDelete(tablet_id);
      continue;
    }

    if (!table_lock->is_running()) {
      const string msg = tablet_lock->pb.state_msg();
      LOG(INFO) << "Got report from tablet " << tablet->tablet_id()
                << " for non-running table " << table->ToString() << ": " << msg;
      update->set_state_msg(msg);
      continue;
    }

    // 4. Tombstone a replica that is no longer part of the Raft config (and
    // not already tombstoned or deleted outright).
    //
    // If the report includes a committed raft config, we only tombstone if
    // the opid_index is strictly less than the latest reported committed
    // config. This prevents us from spuriously deleting replicas that have
    // just been added to the committed config and are in the process of copying.
    const ConsensusStatePB& prev_cstate = tablet_lock->pb.committed_consensus_state();
    const int64_t prev_opid_index = prev_cstate.config().opid_index();
    const int64_t report_opid_index = GetCommittedConsensusStateOpIdIndex(report);
    if (FLAGS_master_tombstone_evicted_tablet_replicas &&
        report.tablet_data_state() != TABLET_DATA_TOMBSTONED &&
        report.tablet_data_state() != TABLET_DATA_DELETED &&
        report_opid_index < prev_opid_index &&
        !IsRaftConfigMember(ts_desc->permanent_uuid(), prev_cstate.config())) {
      const string delete_msg = (report_opid_index == consensus::kInvalidOpIdIndex) ?
          "Replica has no consensus available" :
          Substitute("Replica with old config index $0", report_opid_index);
      rpcs->push_back(std::make_shared<AsyncDeleteReplica>(
          master_, AsyncTaskPool(), ts_desc->permanent_uuid(), table, tablet_id,
          TABLET_DATA_TOMBSTONED, prev_opid_index,
          Substitute("$0 (current committed config index is $1)",
              delete_msg, prev_opid_index)));
      ts_desc->AddPendingTabletDelete(tablet_id);
      continue;
    }

    // 5. Skip a non-deleted tablet which reports an error.
    if (report.has_error()) {
      Status s = StatusFromPB(report.error());
      DCHECK(!s.ok());
      DCHECK_EQ(report.state(), tablet::FAILED);
      LOG(WARNING) << "Tablet " << tablet->ToString() << " has failed on TS "
                   << ts_desc->permanent_uuid() << ": " << s.ToString();
      continue;
    }

    // Hide the tablet if it (or its table) has been hidden and the tablet hasn't.
    if ((tablet_lock->is_hidden() ||
        table_lock->started_hiding()) &&
        report.has_is_hidden() &&
        !report.is_hidden()) {
      const string msg = tablet_lock->pb.state_msg();
      LOG(INFO) << "Got report from hidden tablet " << tablet->ToString()
                << " (" << msg << "): Sending hide request for this tablet";
      auto task = std::make_shared<AsyncDeleteReplica>(
          master_, AsyncTaskPool(), ts_desc->permanent_uuid(), table, tablet_id,
          TABLET_DATA_DELETED, boost::none, msg);
      task->set_hide_only(true);
      ts_desc->AddPendingTabletDelete(tablet_id);
      rpcs->push_back(task);
    }

    // 6. Process the report's consensus state.
    // The report will not have a committed_consensus_state if it is in the
    // middle of starting up, such as during tablet bootstrap.
    // If we received an incremental report, and the tablet is starting up, we will update the
    // replica so that the balancer knows how many tablets are in the middle of remote bootstrap.
    if (report.has_committed_consensus_state()) {
      if (ProcessCommittedConsensusState(
              ts_desc, is_incremental, report, table_lock, tablet, tablet_lock, rpcs)) {
        // 7. If the tablet was mutated, add it to the tablets to be re-persisted.
        //
        // Done here and not on a per-mutation basis to avoid duplicate entries.
        mutated_tablets.push_back(tablet.get());
      }
    } else if (is_incremental &&
        (report.state() == tablet::NOT_STARTED || report.state() == tablet::BOOTSTRAPPING)) {
      // When a tablet server is restarted, it sends a full tablet report with all of its tablets
      // in the NOT_STARTED state, so this would make the load balancer think that all the
      // tablets are being remote bootstrapped at once, so only process incremental reports here.
      UpdateTabletReplicaInLocalMemory(ts_desc, nullptr /* consensus */, report, tablet);
    }
  } // Finished one round of batch processing.

  // 8. Unlock the tables; we no longer need to access their state.
  for (auto& l : table_write_locks) {
    l.second.Unlock();
  }
  table_write_locks.clear();

  // 9. Write all tablet mutations to the catalog table.
  //
  // SysCatalogTable::Write will short-circuit the case where the data has not
  // in fact changed since the previous version and avoid any unnecessary mutations.
  if (!mutated_tablets.empty()) {
    Status s = sys_catalog_->Upsert(leader_ready_term(), mutated_tablets);
    if (!s.ok()) {
      LOG(WARNING) << "Error updating tablets: " << s;
      return s;
    }
  }
  // Filter the mutated tablets to find which tablets were modified. Need to actually commit the
  // state of the tablets before updating the system.partitions table, so get this first.
  vector<TabletInfoPtr> yql_partitions_mutated_tablets =
      VERIFY_RESULT(GetYqlPartitionsVtable().FilterRelevantTablets(mutated_tablets));

  // 10. Publish the in-memory tablet mutations and release the locks.
  for (auto& l : tablet_write_locks) {
    l.second.Commit();
  }
  tablet_write_locks.clear();

  // Update the relevant tablet entries in system.partitions.
  if (!yql_partitions_mutated_tablets.empty()) {
    Status s = GetYqlPartitionsVtable()
        .ProcessMutatedTablets(yql_partitions_mutated_tablets, tablet_write_locks);
  }

  // 11. Third Pass. Process all tablet schema version changes.
  // (This is separate from tablet state mutations because only table on-disk state is changed.)
  for (auto it = begin; it != end; ++it) {
    const ReportedTabletPB& report = *it->report;
    if (!report.has_schema_version()) {
      continue;
    }
    const TabletInfoPtr& tablet = it->info;
    auto leader = tablet->GetLeader();
    if (leader.ok() && leader.get()->permanent_uuid() == ts_desc->permanent_uuid()) {
      RETURN_NOT_OK(HandleTabletSchemaVersionReport(tablet.get(), report.schema_version()));
    }
  }

  return Status::OK();
}
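
// ---------------------------------------------------------------------------
// The first pass above takes every table lock in TableId order before any
// tablet is touched, so two concurrent batches that need overlapping table
// sets always acquire locks in the same global order and cannot deadlock. A
// standalone sketch of ordered acquisition, relying on std::map's sorted key
// iteration (all names hypothetical):
// ---------------------------------------------------------------------------
#include <iostream>
#include <map>
#include <mutex>
#include <string>
#include <vector>

namespace {

// Hypothetical registry of per-table mutexes, keyed by table id. (A real
// registry would itself need synchronization; omitted for brevity.)
std::map<std::string, std::mutex> g_table_mutexes;

// Acquires all requested locks in ascending id order; the std::map both
// deduplicates repeated ids and yields them sorted.
std::vector<std::unique_lock<std::mutex>> LockTablesInOrder(
    const std::vector<std::string>& table_ids) {
  std::map<std::string, std::mutex*> ordered;
  for (const auto& id : table_ids) {
    ordered[id] = &g_table_mutexes[id];
  }
  std::vector<std::unique_lock<std::mutex>> locks;
  for (auto& entry : ordered) {
    locks.emplace_back(*entry.second);
  }
  return locks;
}

}  // namespace

int main() {
  auto locks = LockTablesInOrder({"table-b", "table-a", "table-b"});
  std::cout << "acquired " << locks.size() << " locks in sorted order\n";  // 2
  return 0;
}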

Status CatalogManager::ProcessTabletReport(TSDescriptor* ts_desc,
                                           const TabletReportPB& full_report,
                                           TabletReportUpdatesPB* full_report_update,
                                           RpcContext* rpc) {
  int num_tablets = full_report.updated_tablets_size();
  TRACE_EVENT2("master", "ProcessTabletReport",
               "requestor", rpc->requestor_string(),
               "num_tablets", num_tablets);

  VLOG_WITH_PREFIX(2) << "Received tablet report from " << RequestorString(rpc) << " ("
                      << ts_desc->permanent_uuid() << "): " << full_report.DebugString();

  if (!ts_desc->has_tablet_report() && full_report.is_incremental()) {
    LOG_WITH_PREFIX(WARNING)
        << "Invalid tablet report from " << ts_desc->permanent_uuid()
        << ": Received an incremental tablet report when a full one was needed";
    // We should respond with success in order to send reply that we need full report.
    return Status::OK();
  }

  // TODO: on a full tablet report, we may want to iterate over the tablets we think
  // the server should have, compare vs the ones being reported, and somehow mark
  // any that have been "lost" (eg somehow the tablet metadata got corrupted or something).

  ReportedTablets reported_tablets;

  // Tablet Deletes to process after the catalog lock below.
  set<TabletId> tablets_to_delete;

  {
    // Lock the catalog to iterate over tablet_ids_map_ & table_ids_map_.
    SharedLock lock(mutex_);

    // Fill the above variables before processing
    full_report_update->mutable_tablets()->Reserve(num_tablets);
    for (const ReportedTabletPB& report : full_report.updated_tablets()) {
      const string& tablet_id = report.tablet_id();

      // 1a. Find the tablet, deleting/skipping it if it can't be found.
      scoped_refptr<TabletInfo> tablet = FindPtrOrNull(*tablet_map_, tablet_id);
      if (!tablet) {
        // If a TS reported an unknown tablet, send a delete tablet rpc to the TS.
        LOG(INFO) << "Null tablet reported, possibly the TS was not around when the"
                      " table was being deleted. Sending Delete tablet RPC to this TS.";
        tablets_to_delete.insert(tablet_id);
        // Every tablet in the report that is processed gets a heartbeat response entry.
        ReportedTabletUpdatesPB* update = full_report_update->add_tablets();
        update->set_tablet_id(tablet_id);
        continue;
      }
      if (!tablet->table() || FindOrNull(*table_ids_map_, tablet->table()->id()) == nullptr) {
        auto table_id = tablet->table() == nullptr ? "(null)" : tablet->table()->id();
        LOG(INFO) << "Got report from an orphaned tablet " << tablet_id << " on table " << table_id;
        tablets_to_delete.insert(tablet_id);
        // Every tablet in the report that is processed gets a heartbeat response entry.
        ReportedTabletUpdatesPB* update = full_report_update->add_tablets();
        update->set_tablet_id(tablet_id);
        continue;
      }

      // 1b. Found the tablet, update local state.
      reported_tablets.push_back(ReportedTablet {
        .tablet_id = tablet_id,
        .info = tablet,
        .report = &report,
      });
    }
  }

  std::sort(reported_tablets.begin(), reported_tablets.end(), [](const auto& lhs, const auto& rhs) {
    return lhs.tablet_id < rhs.tablet_id;
  });

  // Process any delete requests from orphaned tablets, identified above.
  for (auto tablet_id : tablets_to_delete) {
    SendDeleteTabletRequest(tablet_id, TABLET_DATA_DELETED, boost::none, nullptr, ts_desc,
        "Report from an orphaned tablet");
  }

  // Calculate the deadline for this expensive loop coming up.
  const auto safe_deadline = rpc->GetClientDeadline() -
    (FLAGS_heartbeat_rpc_timeout_ms * 1ms * FLAGS_heartbeat_safe_deadline_ratio);

  // Process tablets by batches.
  for (auto tablet_iter = reported_tablets.begin(); tablet_iter != reported_tablets.end();) {
    auto batch_begin = tablet_iter;
    tablet_iter += std::min<size_t>(
        reported_tablets.end() - tablet_iter, FLAGS_catalog_manager_report_batch_size);

    // Keeps track of all RPCs that should be sent when we're done with a single batch.
    std::vector<RetryingTSRpcTaskPtr> rpcs;
    auto status = ProcessTabletReportBatch(
        ts_desc, full_report.is_incremental(), batch_begin, tablet_iter, full_report_update, &rpcs);
    if (!status.ok()) {
      for (auto& rpc : rpcs) {
        rpc->AbortAndReturnPrevState(status);
      }
      return status;
    }

    // 13. Send all queued RPCs.
    for (auto& rpc : rpcs) {
      DCHECK(rpc->table());
      rpc->table()->AddTask(rpc);
      WARN_NOT_OK(ScheduleTask(rpc), Substitute("Failed to send $0", rpc->description()));
    }
    rpcs.clear();

    // 14. Check deadline. Need to exit before processing all batches if we're close to timing out.
    if (ts_desc->HasCapability(CAPABILITY_TabletReportLimit) &&
        tablet_iter != reported_tablets.end()) {
      // [TESTING] Inject latency before processing a batch to test deadline.
      if (PREDICT_FALSE(FLAGS_TEST_inject_latency_during_tablet_report_ms > 0)) {
        LOG(INFO) << "Sleeping in CatalogManager::ProcessTabletReport for "
                  << FLAGS_TEST_inject_latency_during_tablet_report_ms << " ms";
        SleepFor(MonoDelta::FromMilliseconds(FLAGS_TEST_inject_latency_during_tablet_report_ms));
      }

      // Return from here at configured safe heartbeat deadline to give the response packet time.
      if (safe_deadline < CoarseMonoClock::Now()) {
        LOG(INFO) << "Reached Heartbeat deadline. Returning early after processing "
                  << full_report_update->tablets_size() << " tablets";
        full_report_update->set_processing_truncated(true);
        return Status::OK();
      }
    }
  } // Loop to process the next batch until fully iterated.

  if (!full_report.is_incremental()) {
    // A full report may take multiple heartbeats.
    // The TS communicates how much is left to process for the full report beyond this specific HB.
    bool completed_full_report = !full_report.has_remaining_tablet_count()
                               || full_report.remaining_tablet_count() == 0;
    if (full_report.updated_tablets_size() == 0) {
      LOG(INFO) << ts_desc->permanent_uuid() << " sent full tablet report with 0 tablets.";
    } else if (!ts_desc->has_tablet_report()) {
      LOG(INFO) << ts_desc->permanent_uuid()
                << (completed_full_report ? " finished" : " receiving") << " first full report: "
                << full_report.updated_tablets_size() << " tablets.";
    }
    // We have a tablet report only once we're done processing all the chunks of the initial report.
    ts_desc->set_has_tablet_report(completed_full_report);
  }

  // 15. Queue background processing if we had updates.
  if (full_report.updated_tablets_size() > 0) {
    background_tasks_->WakeIfHasPendingUpdates();
  }

  return Status::OK();
}
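
// ---------------------------------------------------------------------------
// The batching loop above advances its iterator by at most
// FLAGS_catalog_manager_report_batch_size per round and returns early, with
// processing_truncated set, once the safe deadline passes, so the heartbeat
// response still ships before the RPC times out. A standalone sketch of that
// loop shape (batch size and deadline values are made up):
// ---------------------------------------------------------------------------
#include <algorithm>
#include <chrono>
#include <iostream>
#include <vector>

int main() {
  using Clock = std::chrono::steady_clock;
  const std::vector<int> items(10, 0);
  const size_t kBatchSize = 3;  // hypothetical stand-in for the flag
  const auto safe_deadline = Clock::now() + std::chrono::milliseconds(50);

  size_t processed = 0;
  for (auto it = items.begin(); it != items.end();) {
    auto batch_begin = it;
    it += std::min<size_t>(items.end() - it, kBatchSize);
    processed += it - batch_begin;  // a real batch would be processed here

    // Bail out early, reporting truncation, if the deadline passed with work left.
    if (it != items.end() && Clock::now() > safe_deadline) {
      std::cout << "truncated after " << processed << " items\n";
      return 0;
    }
  }
  std::cout << "processed all " << processed << " items\n";
  return 0;
}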

Status CatalogManager::CreateTablegroup(const CreateTablegroupRequestPB* req,
                                        CreateTablegroupResponsePB* resp,
                                        rpc::RpcContext* rpc) {

  CreateTableRequestPB ctreq;
  CreateTableResponsePB ctresp;

  // Sanity check for PB fields.
  if (!req->has_id() || !req->has_namespace_id() || !req->has_namespace_name()) {
    Status s = STATUS(InvalidArgument, "Improper CREATE TABLEGROUP request (missing fields).");
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
  }

  // Use the tablegroup id as the prefix for the parent table id.
  const auto parent_table_id = req->id() + kTablegroupParentTableIdSuffix;
  const auto parent_table_name = req->id() + kTablegroupParentTableNameSuffix;
  ctreq.set_name(parent_table_name);
  ctreq.set_table_id(parent_table_id);
  ctreq.mutable_namespace_()->set_name(req->namespace_name());
  ctreq.mutable_namespace_()->set_id(req->namespace_id());
  ctreq.set_table_type(PGSQL_TABLE_TYPE);
  ctreq.set_tablegroup_id(req->id());
  ctreq.set_tablespace_id(req->tablespace_id());

  YBSchemaBuilder schemaBuilder;
  schemaBuilder.AddColumn("parent_column")->Type(BINARY)->PrimaryKey()->NotNull();
  YBSchema ybschema;
  CHECK_OK(schemaBuilder.Build(&ybschema));
  auto schema = yb::client::internal::GetSchema(ybschema);
  SchemaToPB(schema, ctreq.mutable_schema());
  if (!FLAGS_TEST_tablegroup_master_only) {
    ctreq.mutable_schema()->mutable_table_properties()->set_is_transactional(true);
  }

  // Create a parent table, which will create the tablet.
  Status s = CreateTable(&ctreq, &ctresp, rpc);
  resp->set_parent_table_id(ctresp.table_id());
  resp->set_parent_table_name(parent_table_name);

  // Carry over error.
  if (ctresp.has_error()) {
    resp->mutable_error()->Swap(ctresp.mutable_error());
  }

  // We do not lock here so it is technically possible that the table was already created.
  // If so, there is nothing to do so we just ignore the "AlreadyPresent" error.
  if (!s.ok() && !s.IsAlreadyPresent()) {
    LOG(WARNING) << "Tablegroup creation failed: " << s.ToString();
    return s;
  }

  // Update catalog manager maps
  LockGuard lock(mutex_);
  TRACE("Acquired catalog manager lock");
  TablegroupInfo *tg = new TablegroupInfo(req->id(), req->namespace_id());
  tablegroup_ids_map_[req->id()] = tg;
  table_tablegroup_ids_map_[parent_table_id] = tg->id();

  return s;
}
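
// ---------------------------------------------------------------------------
// A tablegroup's dummy parent table is addressed purely by string
// composition: tablegroup id plus a fixed suffix yields both the parent table
// id and its name. That is what lets DeleteTablegroup (below) re-derive both
// without any lookup. A standalone sketch with hypothetical suffix values:
// ---------------------------------------------------------------------------
#include <iostream>
#include <string>

int main() {
  // Hypothetical stand-ins for kTablegroupParentTableIdSuffix and
  // kTablegroupParentTableNameSuffix.
  const std::string kIdSuffix = ".tablegroup.parent.uuid";
  const std::string kNameSuffix = ".tablegroup.parent.tablename";

  const std::string tablegroup_id = "00004000000030008000000000004001";
  const std::string parent_table_id = tablegroup_id + kIdSuffix;
  const std::string parent_table_name = tablegroup_id + kNameSuffix;

  // Any later operation can reconstruct the same pair from the id alone.
  std::cout << parent_table_id << "\n" << parent_table_name << "\n";
  return 0;
}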

Status CatalogManager::DeleteTablegroup(const DeleteTablegroupRequestPB* req,
                                        DeleteTablegroupResponsePB* resp,
                                        rpc::RpcContext* rpc) {
  LOG(INFO) << "Servicing DeleteTablegroup request from " << RequestorString(rpc) << ": "
            << req->ShortDebugString();
  DeleteTableRequestPB dtreq;
  DeleteTableResponsePB dtresp;

  // Sanity check for PB fields
  if (!req->has_id() || !req->has_namespace_id()) {
    Status s = STATUS(InvalidArgument, "Improper DELETE TABLEGROUP request (missing fields).");
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
  }

  // Use the tablegroup id as the prefix for the parent table id.
  const auto parent_table_id = req->id() + kTablegroupParentTableIdSuffix;
  const auto parent_table_name = req->id() + kTablegroupParentTableNameSuffix;

  dtreq.mutable_table()->set_table_name(parent_table_name);
  dtreq.mutable_table()->set_table_id(parent_table_id);
  dtreq.set_is_index_table(false);

  {
    SharedLock lock(mutex_);
    const auto& tablegroup = tablegroup_ids_map_[req->id()];
    // Tablegroup should be empty. In practice that means it would contain only the
    // dummy parent table.
    // TODO(alex): Rework tablegroup internals to track real tables.
    if (tablegroup->NumChildTables() > 1) {
      return SetupError(
          resp->mutable_error(),
          MasterErrorPB::INVALID_REQUEST,
          STATUS_FORMAT(InvalidArgument,
                        "Cannot delete tablegroup, it still has $0 tables in it",
                        tablegroup->NumChildTables() - 1));
    }
  }

  Status s = DeleteTable(&dtreq, &dtresp, rpc);
  resp->set_parent_table_id(dtresp.table_id());

  // Carry over error.
  if (dtresp.has_error()) {
    resp->mutable_error()->Swap(dtresp.mutable_error());
    return s;
  }

  // Perform map updates.
  LockGuard lock(mutex_);
  TRACE("Acquired catalog manager lock");
  tablegroup_ids_map_.erase(req->id());
  tablegroup_tablet_ids_map_[req->namespace_id()].erase(req->id());
  table_tablegroup_ids_map_.erase(parent_table_id);

  LOG(INFO) << "Deleted tablegroup " << req->id();
  return s;
}

Status CatalogManager::ListTablegroups(const ListTablegroupsRequestPB* req,
                                       ListTablegroupsResponsePB* resp,
                                       rpc::RpcContext* rpc) {
  SharedLock lock(mutex_);

  if (!req->has_namespace_id()) {
    Status s = STATUS(InvalidArgument, "Improper ListTablegroups request (missing fields).");
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
  }

  if (tablegroup_tablet_ids_map_.find(req->namespace_id()) == tablegroup_tablet_ids_map_.end()) {
    return STATUS(NotFound, "Tablegroups not found for namespace id: ", req->namespace_id());
  }

  for (const auto& entry : tablegroup_tablet_ids_map_[req->namespace_id()]) {
    const TablegroupId tgid = entry.first;
    if (tablegroup_ids_map_.find(tgid) == tablegroup_ids_map_.end()) {
      LOG(WARNING) << "Tablegroup info in " << req->namespace_id()
                   << " not found for tablegroup id: " << tgid;
      continue;
    }
    scoped_refptr<TablegroupInfo> tginfo = tablegroup_ids_map_[tgid];

    TablegroupIdentifierPB *tg = resp->add_tablegroups();
    tg->set_id(tginfo->id());
    tg->set_namespace_id(tginfo->namespace_id());
  }
  return Status::OK();
}

bool CatalogManager::HasTablegroups() {
  SharedLock lock(mutex_);
  return !tablegroup_ids_map_.empty();
}
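
// ---------------------------------------------------------------------------
// Read-only accessors such as HasTablegroups() take SharedLock on mutex_,
// while mutating paths (CreateTablegroup, DeleteTablegroup) take LockGuard:
// many readers may overlap, writers are exclusive. A standalone sketch of the
// same reader/writer discipline built on std::shared_mutex (the class and its
// members are hypothetical):
// ---------------------------------------------------------------------------
#include <iostream>
#include <map>
#include <mutex>
#include <shared_mutex>
#include <string>

namespace {

class TablegroupRegistry {
 public:
  bool HasTablegroups() const {
    std::shared_lock<std::shared_mutex> lock(mutex_);  // shared: readers overlap
    return !tablegroups_.empty();
  }

  void Add(const std::string& id) {
    std::lock_guard<std::shared_mutex> lock(mutex_);   // exclusive: writers serialize
    tablegroups_[id] = true;
  }

 private:
  mutable std::shared_mutex mutex_;
  std::map<std::string, bool> tablegroups_;
};

}  // namespace

int main() {
  TablegroupRegistry registry;
  std::cout << std::boolalpha << registry.HasTablegroups() << "\n";  // false
  registry.Add("tg-1");
  std::cout << registry.HasTablegroups() << "\n";                    // true
  return 0;
}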
7063
7064
Status CatalogManager::CreateNamespace(const CreateNamespaceRequestPB* req,
7065
                                       CreateNamespaceResponsePB* resp,
7066
2.65k
                                       rpc::RpcContext* rpc) {
7067
2.65k
  Status return_status;
7068
7069
  // Copy the request, so we can fill in some defaults.
7070
2.65k
  LOG(INFO) << "CreateNamespace from " << RequestorString(rpc)
7071
2.65k
            << ": " << req->DebugString();
7072
7073
2.65k
  scoped_refptr<NamespaceInfo> ns;
7074
2.65k
  std::vector<scoped_refptr<TableInfo>> pgsql_tables;
7075
2.65k
  TransactionMetadata txn;
7076
2.65k
  const auto db_type = GetDatabaseType(*req);
7077
2.65k
  {
7078
2.65k
    LockGuard lock(mutex_);
7079
2.65k
    TRACE("Acquired catalog manager lock");
7080
7081
    // Validate the user request.
7082
7083
    // Verify that the namespace does not already exist.
7084
2.65k
    ns = FindPtrOrNull(namespace_ids_map_, req->namespace_id()); // Same ID.
7085
2.65k
    if (ns == nullptr && db_type != YQL_DATABASE_PGSQL) {
7086
      // PGSQL databases have name uniqueness handled at a different layer, so ignore overlaps.
7087
2.39k
      ns = FindPtrOrNull(namespace_names_mapper_[db_type], req->name());
7088
2.39k
    }
7089
2.65k
    if (ns != nullptr) {
7090
6
      resp->set_id(ns->id());
7091
6
      return_status = STATUS_SUBSTITUTE(AlreadyPresent, "Keyspace '$0' already exists",
7092
6
                                        req->name());
7093
6
      LOG(WARNING) << "Found keyspace: " << ns->id() << ". Failed creating keyspace with error: "
7094
6
                   << return_status.ToString() << " Request:\n" << req->DebugString();
7095
6
      return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_ALREADY_PRESENT,
7096
6
                        return_status);
7097
6
    }
7098
7099
    // Add the new namespace.
7100
7101
    // Create unique id for this new namespace.
7102
2.64k
    NamespaceId new_id = !req->namespace_id().empty()
7103
2.64k
        ? 
req->namespace_id()229
:
GenerateIdUnlocked(SysRowEntryType::NAMESPACE)2.41k
;
7104
2.64k
    ns = new NamespaceInfo(new_id);
7105
2.64k
    ns->mutable_metadata()->StartMutation();
7106
2.64k
    SysNamespaceEntryPB *metadata = &ns->mutable_metadata()->mutable_dirty()->pb;
7107
2.64k
    metadata->set_name(req->name());
7108
2.64k
    metadata->set_database_type(db_type);
7109
2.64k
    metadata->set_colocated(req->colocated());
7110
2.64k
    metadata->set_state(SysNamespaceEntryPB::PREPARING);
7111
7112
    // For namespace created for a Postgres database, save the list of tables and indexes for
7113
    // for the database that need to be copied.
7114
2.64k
    if (db_type == YQL_DATABASE_PGSQL) {
7115
260
      if (req->source_namespace_id().empty()) {
7116
137
        metadata->set_next_pg_oid(req->next_pg_oid());
7117
137
      } else {
7118
123
        const auto source_oid = GetPgsqlDatabaseOid(req->source_namespace_id());
7119
123
        if (!source_oid.ok()) {
7120
0
          return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND,
7121
0
                            source_oid.status());
7122
0
        }
7123
77.6k
        
for (const auto& iter : *table_ids_map_)123
{
7124
77.6k
          const auto& table_id = iter.first;
7125
77.6k
          const auto& table = iter.second;
7126
77.6k
          if (IsPgsqlId(table_id) && 
CHECK_RESULT75.2k
(GetPgsqlDatabaseOid(table_id)) == *source_oid75.2k
) {
7127
            // Since indexes have dependencies on the base tables, put the tables in the front.
7128
15.5k
            const bool is_table = table->indexed_table_id().empty();
7129
15.5k
            pgsql_tables.insert(is_table ? 
pgsql_tables.begin()8.75k
:
pgsql_tables.end()6.81k
, table);
7130
15.5k
          }
7131
77.6k
        }
7132
7133
123
        scoped_refptr<NamespaceInfo> source_ns = FindPtrOrNull(namespace_ids_map_,
7134
123
                                                               req->source_namespace_id());
7135
123
        if (!source_ns) {
7136
0
          return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND,
7137
0
                            STATUS(NotFound, "Source keyspace not found",
7138
0
                                   req->source_namespace_id()));
7139
0
        }
7140
123
        auto source_ns_lock = source_ns->LockForRead();
7141
123
        metadata->set_next_pg_oid(source_ns_lock->pb.next_pg_oid());
7142
123
      }
7143
260
    }
7144
7145
    // NS with a Transaction should be rolled back if the transaction does not get Committed.
7146
    // Store this on the NS for now and use it later.
7147
2.64k
    if (req->has_transaction() && 
PREDICT_TRUE114
(FLAGS_enable_transactional_ddl_gc)) {
7148
92
      metadata->mutable_transaction()->CopyFrom(req->transaction());
7149
92
      txn = VERIFY_RESULT(TransactionMetadata::FromPB(req->transaction()));
7150
92
      RSTATUS_DCHECK(!txn.status_tablet.empty(), Corruption, "Given incomplete Transaction");
7151
92
    }
7152
7153
    // Add the namespace to the in-memory map for the assignment.
7154
2.64k
    namespace_ids_map_[ns->id()] = ns;
7155
2.64k
    namespace_names_mapper_[db_type][req->name()] = ns;
7156
7157
2.64k
    resp->set_id(ns->id());
7158
2.64k
  }
7159
2.64k
  TRACE("Inserted new keyspace info into CatalogManager maps");
7160
7161
  // Update the on-disk system catalog.
7162
2.64k
  return_status = sys_catalog_->Upsert(leader_ready_term(), ns);
7163
2.64k
  if (!return_status.ok()) {
7164
6
    LOG(WARNING) << "Keyspace creation failed:" << return_status.ToString();
7165
6
    {
7166
6
      LockGuard lock(mutex_);
7167
6
      namespace_ids_map_.erase(ns->id());
7168
6
      namespace_names_mapper_[db_type].erase(req->name());
7169
6
    }
7170
6
    ns->mutable_metadata()->AbortMutation();
7171
6
    return CheckIfNoLongerLeaderAndSetupError(return_status, resp);
7172
6
  }
7173
2.63k
  TRACE("Wrote keyspace to sys-catalog");
7174
  // Commit the namespace in-memory state.
7175
2.63k
  ns->mutable_metadata()->CommitMutation();
7176
7177
2.63k
  LOG(INFO) << "Created keyspace " << ns->ToString();
7178
7179
2.63k
  if (req->has_creator_role_name()) {
7180
904
    RETURN_NOT_OK(permissions_manager_->GrantPermissions(
7181
904
        req->creator_role_name(),
7182
904
        get_canonical_keyspace(req->name()),
7183
904
        req->name() /* resource name */,
7184
904
        req->name() /* keyspace name */,
7185
904
        all_permissions_for_resource(ResourceType::KEYSPACE),
7186
904
        ResourceType::KEYSPACE,
7187
904
        resp));
7188
904
  }
7189
7190
  // Colocated databases need to create a parent tablet to serve as the base storage location.
7191
2.63k
  if (req->colocated()) {
7192
17
    CreateTableRequestPB req;
7193
17
    CreateTableResponsePB resp;
7194
17
    const auto parent_table_id = ns->id() + kColocatedParentTableIdSuffix;
7195
17
    const auto parent_table_name = ns->id() + kColocatedParentTableNameSuffix;
7196
17
    req.set_name(parent_table_name);
7197
17
    req.set_table_id(parent_table_id);
7198
17
    req.mutable_namespace_()->set_name(ns->name());
7199
17
    req.mutable_namespace_()->set_id(ns->id());
7200
17
    req.set_table_type(GetTableTypeForDatabase(ns->database_type()));
7201
17
    req.set_colocated(true);
7202
7203
17
    YBSchemaBuilder schemaBuilder;
7204
17
    schemaBuilder.AddColumn("parent_column")->Type(BINARY)->PrimaryKey()->NotNull();
7205
17
    YBSchema ybschema;
7206
17
    CHECK_OK(schemaBuilder.Build(&ybschema));
7207
17
    auto schema = yb::client::internal::GetSchema(ybschema);
7208
17
    SchemaToPB(schema, req.mutable_schema());
7209
17
    req.mutable_schema()->mutable_table_properties()->set_is_transactional(true);
7210
7211
    // create a parent table, which will create the tablet.
7212
17
    Status s = CreateTable(&req, &resp, rpc);
7213
    // We do not lock here so it is technically possible that the table was already created.
7214
    // If so, there is nothing to do so we just ignore the "AlreadyPresent" error.
7215
17
    if (!s.ok() && 
!s.IsAlreadyPresent()0
) {
7216
0
      LOG(WARNING) << "Keyspace creation failed:" << s.ToString();
7217
      // TODO: We should verify this behavior works end-to-end.
7218
      // Diverging in-memory state from disk so the user can issue a delete if no new leader.
7219
0
      auto l = ns->LockForWrite();
7220
0
      SysNamespaceEntryPB& metadata = ns->mutable_metadata()->mutable_dirty()->pb;
7221
0
      metadata.set_state(SysNamespaceEntryPB::FAILED);
7222
0
      l.Commit();
7223
0
      return s;
7224
0
    }
7225
17
  }
7226
7227
2.63k
  if ((db_type == YQL_DATABASE_PGSQL && 
!pgsql_tables.empty()254
) ||
7228
2.63k
      
PREDICT_FALSE2.51k
(GetAtomicFlag(&FLAGS_TEST_hang_on_namespace_transition))) {
7229
    // Process the subsequent work in the background thread (normally PGSQL).
7230
125
    LOG(INFO) << "Keyspace create enqueued for later processing: " << ns->ToString();
7231
125
    RETURN_NOT_OK(background_tasks_thread_pool_->SubmitFunc(
7232
125
        std::bind(&CatalogManager::ProcessPendingNamespace, this, ns->id(), pgsql_tables, txn)));
7233
125
    return Status::OK();
7234
2.51k
  } else {
7235
    // All work is done, it's now safe to online the namespace (normally YQL).
7236
2.51k
    auto l = ns->LockForWrite();
7237
2.51k
    SysNamespaceEntryPB& metadata = ns->mutable_metadata()->mutable_dirty()->pb;
7238
2.51k
    if (metadata.state() == SysNamespaceEntryPB::PREPARING) {
7239
2.51k
      metadata.set_state(SysNamespaceEntryPB::RUNNING);
7240
2.51k
      return_status = sys_catalog_->Upsert(leader_ready_term(), ns);
7241
2.51k
      if (!return_status.ok()) {
7242
        // Diverging in-memory state from disk so the user can issue a delete if no new leader.
7243
4
        LOG(WARNING) << "Keyspace creation failed:" << return_status.ToString();
7244
4
        metadata.set_state(SysNamespaceEntryPB::FAILED);
7245
4
        return_status = CheckIfNoLongerLeaderAndSetupError(return_status, resp);
7246
2.51k
      } else {
7247
2.51k
        TRACE("Activated keyspace in sys-catalog");
7248
2.51k
        LOG(INFO) << "Activated keyspace: " << ns->ToString();
7249
2.51k
      }
7250
      // Commit the namespace in-memory state.
7251
2.51k
      l.Commit();
7252
2.51k
    } else {
7253
0
      LOG(WARNING) << "Keyspace has invalid state (" << metadata.state() << "), aborting create";
7254
0
    }
7255
2.51k
  }
7256
2.51k
  return return_status;
7257
2.63k
}
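
// ---------------------------------------------------------------------------
// When cloning a PGSQL database, CreateNamespace (above) keeps base tables
// ahead of their indexes by inserting tables at the front of the work list
// and indexes at the back, so each index's dependency is copied before the
// index itself. A standalone sketch of that front/back insertion trick
// (TableEntry is a hypothetical simplification of TableInfo):
// ---------------------------------------------------------------------------
#include <iostream>
#include <string>
#include <vector>

namespace {

struct TableEntry {
  std::string name;
  std::string indexed_table;  // empty => base table, non-empty => index
};

}  // namespace

int main() {
  const std::vector<TableEntry> scanned = {
      {"idx_a", "t_a"}, {"t_a", ""}, {"idx_b", "t_b"}, {"t_b", ""}};

  std::vector<TableEntry> ordered;
  for (const auto& entry : scanned) {
    const bool is_table = entry.indexed_table.empty();
    // Tables to the front, indexes to the back, mirroring pgsql_tables.
    ordered.insert(is_table ? ordered.begin() : ordered.end(), entry);
  }

  for (const auto& entry : ordered) {
    std::cout << entry.name << "\n";  // t_b, t_a, idx_a, idx_b
  }
  return 0;
}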

void CatalogManager::ProcessPendingNamespace(
    NamespaceId id,
    std::vector<scoped_refptr<TableInfo>> template_tables,
    TransactionMetadata txn) {
  LOG(INFO) << "ProcessPendingNamespace started for " << id;

  // Ensure that we are currently the Leader before handling DDL operations.
  {
    SCOPED_LEADER_SHARED_LOCK(l, this);
    if (!l.catalog_status().ok()) {
      LOG(WARNING) << "Catalog status failure: " << l.catalog_status().ToString();
      // Don't try again, we have to reset in-memory state after losing leader election.
      return;
    }
    if (!l.leader_status().ok()) {
      LOG(WARNING) << "Leader status failure: " << l.leader_status().ToString();
      // Don't try again, we have to reset in-memory state after losing leader election.
      return;
    }
  }

  if (PREDICT_FALSE(GetAtomicFlag(&FLAGS_TEST_hang_on_namespace_transition))) {
    LOG(INFO) << "Artificially waiting (" << FLAGS_catalog_manager_bg_task_wait_ms
              << "ms) on namespace creation for " << id;
    SleepFor(MonoDelta::FromMilliseconds(FLAGS_catalog_manager_bg_task_wait_ms));
    WARN_NOT_OK(background_tasks_thread_pool_->SubmitFunc(
        std::bind(&CatalogManager::ProcessPendingNamespace, this, id, template_tables, txn)),
        "Could not submit ProcessPendingNamespace to thread pool");
    return;
  }

  scoped_refptr<NamespaceInfo> ns;
  {
    LockGuard lock(mutex_);
    ns = FindPtrOrNull(namespace_ids_map_, id);
  }
  if (ns == nullptr) {
    LOG(WARNING) << "Pending Namespace not found to finish creation: " << id;
    return;
  }

  // Copy the system tables necessary to create this namespace.  This can be time-intensive.
  bool success = true;
  if (!template_tables.empty()) {
    auto s = CopyPgsqlSysTables(ns->id(), template_tables);
    WARN_NOT_OK(s, "Error Copying PGSQL System Tables for Pending Namespace");
    success = s.ok();
  }

  // All work is done, change the namespace state regardless of success or failure.
  {
    auto l = ns->LockForWrite();
    SysNamespaceEntryPB& metadata = ns->mutable_metadata()->mutable_dirty()->pb;
    if (metadata.state() == SysNamespaceEntryPB::PREPARING) {
      metadata.set_state(success ? SysNamespaceEntryPB::RUNNING : SysNamespaceEntryPB::FAILED);
      auto s = sys_catalog_->Upsert(leader_ready_term(), ns);
      if (s.ok()) {
        TRACE("Done processing keyspace");
        LOG(INFO) << (success ? "Processed" : "Failed") << " keyspace: " << ns->ToString();

        // Verify the Transaction gets committed, which occurs after namespace create finishes.
        if (success && metadata.has_transaction()) {
          LOG(INFO) << "Enqueuing keyspace for Transaction Verification: " << ns->ToString();
          std::function<Status(bool)> when_done =
              std::bind(&CatalogManager::VerifyNamespacePgLayer, this, ns, _1);
          WARN_NOT_OK(background_tasks_thread_pool_->SubmitFunc(
              std::bind(&YsqlTransactionDdl::VerifyTransaction, ysql_transaction_.get(),
                        txn, when_done)),
              "Could not submit VerifyTransaction to thread pool");
        }
      } else {
        metadata.set_state(SysNamespaceEntryPB::FAILED);
        if (s.IsIllegalState() || s.IsAborted()) {
          s = STATUS(ServiceUnavailable,
              "operation requested can only be executed on a leader master, but this"
              " master is no longer the leader", s.ToString());
        } else {
          s = s.CloneAndPrepend(Substitute(
              "An error occurred while modifying keyspace to $0 in sys-catalog: $1",
              metadata.state(), s.ToString()));
        }
        LOG(WARNING) << s.ToString();
      }
      // Commit the namespace in-memory state.
      l.Commit();
    } else {
      LOG(WARNING) << "Bad keyspace state (" << metadata.state()
                   << "), abandoning creation work for " << ns->ToString();
    }
  }
}
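
// ---------------------------------------------------------------------------
// ProcessPendingNamespace hands VerifyTransaction a bound "when_done"
// callback, so the verification result flows back into the catalog manager
// asynchronously. A standalone sketch of that bind-and-call shape, with plain
// functions standing in for the master and thread-pool machinery (all names
// here are hypothetical):
// ---------------------------------------------------------------------------
#include <functional>
#include <iostream>
#include <string>

namespace {

// Stands in for CatalogManager::VerifyNamespacePgLayer(ns, rpc_success).
int VerifyNamespace(const std::string& ns_id, bool rpc_success) {
  std::cout << "verifying " << ns_id << ", rpc_success=" << rpc_success << "\n";
  return 0;
}

// Stands in for YsqlTransactionDdl::VerifyTransaction: it resolves the
// transaction's fate, then invokes whatever completion callback it was given.
void VerifyTransaction(const std::function<int(bool)>& when_done) {
  const bool rpc_success = true;  // pretend the status probe succeeded
  when_done(rpc_success);
}

}  // namespace

int main() {
  using namespace std::placeholders;
  // Bind the namespace id now; the bool result slot is filled in later.
  std::function<int(bool)> when_done = std::bind(&VerifyNamespace, std::string("ns-123"), _1);
  VerifyTransaction(when_done);  // the real code runs this on a thread pool
  return 0;
}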

Status CatalogManager::VerifyNamespacePgLayer(
    scoped_refptr<NamespaceInfo> ns, bool rpc_success) {
  // Upon Transaction completion, check pg system table using OID to ensure SUCCESS.
  const auto pg_table_id = GetPgsqlTableId(atoi(kSystemNamespaceId), kPgDatabaseTableOid);
  auto entry_exists = VERIFY_RESULT(
      ysql_transaction_->PgEntryExists(pg_table_id, GetPgsqlDatabaseOid(ns->id())));
  auto l = ns->LockForWrite();
  SysNamespaceEntryPB& metadata = ns->mutable_metadata()->mutable_dirty()->pb;

  // #5981: Mark un-retryable rpc failures as pass to avoid infinite retry of GC'd txns.
  bool txn_check_passed = entry_exists || !rpc_success;

  if (txn_check_passed) {
    // Passed checks.  Remove the transaction from the entry since we're done processing it.
    SCHECK_EQ(metadata.state(), SysNamespaceEntryPB::RUNNING, Aborted,
              Substitute("Invalid Namespace state ($0), abandoning transaction GC work for $1",
                 SysNamespaceEntryPB_State_Name(metadata.state()), ns->ToString()));
    metadata.clear_transaction();
    RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), ns));
    if (entry_exists) {
      LOG(INFO) << "Namespace transaction succeeded: " << ns->ToString();
    } else {
      LOG(WARNING) << "Unknown RPC Failure, removing transaction on namespace: " << ns->ToString();
    }
    // Commit the namespace in-memory state.
    l.Commit();
  } else {
    // Transaction failed.  We need to delete this Database now.
    SCHECK(metadata.state() == SysNamespaceEntryPB::RUNNING ||
           metadata.state() == SysNamespaceEntryPB::FAILED, Aborted,
           Substitute("Invalid Namespace state ($0), aborting delete.",
                      SysNamespaceEntryPB_State_Name(metadata.state()), ns->ToString()));
    LOG(INFO) << "Namespace transaction failed, deleting: " << ns->ToString();
    metadata.set_state(SysNamespaceEntryPB::DELETING);
    metadata.clear_transaction();
    RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), ns));
    // Commit the namespace in-memory state.
    l.Commit();
    // Async enqueue delete.
    RETURN_NOT_OK(background_tasks_thread_pool_->SubmitFunc(
        std::bind(&CatalogManager::DeleteYsqlDatabaseAsync, this, ns)));
  }
  return Status::OK();
}

// Get the information about an in-progress create operation.
Status CatalogManager::IsCreateNamespaceDone(const IsCreateNamespaceDoneRequestPB* req,
                                             IsCreateNamespaceDoneResponsePB* resp) {
  auto ns_pb = req->namespace_();

  // 1. Lookup the namespace and verify it exists.
  TRACE("Looking up keyspace");
  auto ns = VERIFY_NAMESPACE_FOUND(FindNamespace(ns_pb), resp);

  TRACE("Locking keyspace");
  auto l = ns->LockForRead();
  auto metadata = l->pb;

  switch (metadata.state()) {
    // Success cases. Done and working.
    case SysNamespaceEntryPB::RUNNING:
      if (!ns->colocated()) {
        resp->set_done(true);
      } else {
        // Verify that the system table was created as well, if colocated.
        IsCreateTableDoneRequestPB table_req;
        IsCreateTableDoneResponsePB table_resp;
        const auto parent_table_id = ns->id() + kColocatedParentTableIdSuffix;
        table_req.mutable_table()->set_table_id(parent_table_id);
        auto s = IsCreateTableDone(&table_req, &table_resp);
        resp->set_done(table_resp.done());
        if (!s.ok()) {
          if (table_resp.has_error()) {
            resp->mutable_error()->Swap(table_resp.mutable_error());
          }
          return s;
        }
      }
      break;
    // These states indicate that a create completed but a subsequent remove was requested.
    case SysNamespaceEntryPB::DELETING:
    case SysNamespaceEntryPB::DELETED:
      resp->set_done(true);
      break;
    // Pending cases.  NOT DONE
    case SysNamespaceEntryPB::PREPARING:
      resp->set_done(false);
      break;
    // Failure cases.  Done, but we need to give the user an error message.
    case SysNamespaceEntryPB::FAILED:
      resp->set_done(true);
      return SetupError(resp->mutable_error(), MasterErrorPB::UNKNOWN_ERROR, STATUS(InternalError,
              "Namespace Create Failed: not onlined."));
    default:
      Status s = STATUS_SUBSTITUTE(IllegalState, "IsCreateNamespaceDone failure: state=$0",
                                   SysNamespaceEntryPB_State_Name(metadata.state()));
      LOG(WARNING) << s.ToString();
      resp->set_done(true);
      return SetupError(resp->mutable_error(), MasterErrorPB::UNKNOWN_ERROR, s);
  }

  return Status::OK();
}
7454
7455
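
The RPC above is meant to be polled: done=true with no error means the namespace reached RUNNING, done=true with an error means it reached FAILED, and done=false means it is still PREPARING. A minimal client-side sketch of that contract, using hypothetical stand-in types rather than the real YugabyteDB client API:

#include <chrono>
#include <functional>
#include <thread>

struct DoneResult {
  bool done = false;       // mirrors resp->done()
  bool has_error = false;  // mirrors resp->has_error()
};

// Polls check_done() until the create finishes or the deadline passes.
// FAILED namespaces report done=true *and* an error, so both fields matter.
inline bool WaitForNamespaceCreate(const std::function<DoneResult()>& check_done,
                                   std::chrono::milliseconds deadline) {
  const auto start = std::chrono::steady_clock::now();
  while (std::chrono::steady_clock::now() - start < deadline) {
    const DoneResult r = check_done();
    if (r.done) {
      return !r.has_error;  // done + error corresponds to the FAILED state
    }
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
  }
  return false;  // still PREPARING when the deadline expired
}
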
Status CatalogManager::DeleteNamespace(const DeleteNamespaceRequestPB* req,
7456
                                       DeleteNamespaceResponsePB* resp,
7457
1.64k
                                       rpc::RpcContext* rpc) {
7458
1.64k
  auto status = DoDeleteNamespace(req, resp, rpc);
7459
1.64k
  if (!status.ok()) {
7460
11
    return SetupError(resp->mutable_error(), status);
7461
11
  }
7462
1.63k
  return status;
7463
1.64k
}
7464
7465
Status CatalogManager::DoDeleteNamespace(const DeleteNamespaceRequestPB* req,
7466
                                         DeleteNamespaceResponsePB* resp,
7467
1.64k
                                         rpc::RpcContext* rpc) {
7468
1.64k
  LOG(INFO) << "Servicing DeleteNamespace request from " << RequestorString(rpc)
7469
1.64k
            << ": " << req->ShortDebugString();
7470
7471
  // Lookup the namespace and verify if it exists.
7472
1.64k
  TRACE("Looking up keyspace");
7473
1.64k
  auto ns = VERIFY_RESULT(FindNamespace(req->namespace_()));
7474
7475
1.64k
  if (req->has_database_type() && req->database_type() != ns->database_type()) {
7476
    // Could not find the right database to delete.
7477
0
    return STATUS(NotFound, "Keyspace not found", ns->name(),
7478
0
                  MasterError(MasterErrorPB::NAMESPACE_NOT_FOUND));
7479
0
  }
7480
1.64k
  {
7481
    // Don't allow deletion if the namespace is in a transient state.
7482
1.64k
    auto cur_state = ns->state();
7483
1.64k
    if (cur_state != SysNamespaceEntryPB::RUNNING && cur_state != SysNamespaceEntryPB::FAILED) {
7484
2
      if (cur_state == SysNamespaceEntryPB::DELETED) {
7485
1
        return STATUS(NotFound, "Keyspace already deleted", ns->name(),
7486
1
                      MasterError(MasterErrorPB::NAMESPACE_NOT_FOUND));
7487
1
      } else {
7488
1
        return STATUS_EC_FORMAT(
7489
1
            TryAgain, MasterError(MasterErrorPB::IN_TRANSITION_CAN_RETRY),
7490
1
            "Namespace deletion not allowed when State = $0",
7491
1
            SysNamespaceEntryPB::State_Name(cur_state));
7492
1
      }
7493
2
    }
7494
1.64k
  }
7495
7496
  // PGSQL has a completely forked implementation because it allows non-empty namespaces on delete.
7497
1.64k
  if (ns->database_type() == YQL_DATABASE_PGSQL) {
7498
101
    return DeleteYsqlDatabase(req, resp, rpc);
7499
101
  }
7500
7501
1.54k
  TRACE("Locking keyspace");
7502
1.54k
  auto l = ns->LockForWrite();
7503
7504
  // Only empty namespace can be deleted.
7505
1.54k
  TRACE("Looking for tables in the keyspace");
7506
1.54k
  {
7507
1.54k
    SharedLock lock(mutex_);
7508
1.54k
    VLOG_WITH_FUNC(3) << "Acquired the catalog manager lock";
7509
7510
32.5k
    for (const TableInfoMap::value_type& entry : *table_ids_map_) {
7511
32.5k
      auto ltm = entry.second->LockForRead();
7512
7513
32.5k
      if (!ltm->started_deleting() && ltm->namespace_id() == ns->id()) {
7514
3
        return STATUS_EC_FORMAT(
7515
3
            InvalidArgument, MasterError(MasterErrorPB::NAMESPACE_IS_NOT_EMPTY),
7516
3
            "Cannot delete keyspace which has $0: $1 [id=$2], request: $3",
7517
3
            IsTable(ltm->pb) ? "table" : "index", ltm->name(), entry.second->id(),
7518
3
            req->ShortDebugString());
7519
3
      }
7520
32.5k
    }
7521
7522
    // Only empty namespace can be deleted.
7523
1.53k
    TRACE("Looking for types in the keyspace");
7524
7525
1.53k
    for (const UDTypeInfoMap::value_type& entry : udtype_ids_map_) {
7526
0
      auto ltm = entry.second->LockForRead();
7527
7528
0
      if (ltm->namespace_id() == ns->id()) {
7529
0
        return STATUS_EC_FORMAT(
7530
0
            InvalidArgument, MasterError(MasterErrorPB::NAMESPACE_IS_NOT_EMPTY),
7531
0
            "Cannot delete keyspace which has type: $0 [id=$1], request: $2",
7532
0
            ltm->name(), entry.second->id(), req->ShortDebugString());
7533
0
      }
7534
0
    }
7535
1.53k
  }
7536
7537
  // Disallow deleting namespaces with snapshot schedules.
7538
1.53k
  auto map = VERIFY_RESULT(MakeSnapshotSchedulesToObjectIdsMap(SysRowEntryType::NAMESPACE));
7539
0
  for (const auto& schedule_and_objects : map) {
7540
0
    for (const auto& id : schedule_and_objects.second) {
7541
0
      if (id == ns->id()) {
7542
0
        return STATUS_EC_FORMAT(
7543
0
            InvalidArgument, MasterError(MasterErrorPB::NAMESPACE_IS_NOT_EMPTY),
7544
0
            "Cannot delete keyspace which has schedule: $0, request: $1",
7545
0
            schedule_and_objects.first, req->ShortDebugString());
7546
0
      }
7547
0
    }
7548
0
  }
7549
7550
  // [Delete]. Skip the DELETING->DELETED state, since no tables are present in this namespace.
7551
1.53k
  TRACE("Updating metadata on disk");
7552
  // Update sys-catalog.
7553
1.53k
  Status s = sys_catalog_->Delete(leader_ready_term(), ns);
7554
1.53k
  if (!s.ok()) {
7555
    // The mutation will be aborted when 'l' exits the scope on early return.
7556
0
    s = s.CloneAndPrepend("An error occurred while updating sys-catalog");
7557
0
    LOG(WARNING) << s;
7558
0
    return CheckIfNoLongerLeader(s);
7559
0
  }
7560
7561
  // Update the in-memory state.
7562
1.53k
  TRACE("Committing in-memory state");
7563
1.53k
  l.Commit();
7564
7565
  // Remove the namespace from all CatalogManager mappings.
7566
1.53k
  {
7567
1.53k
    LockGuard lock(mutex_);
7568
1.53k
    if (namespace_names_mapper_[ns->database_type()].erase(ns->name()) < 1) {
7569
0
      LOG(WARNING) << Format("Could not remove namespace from names map, id=$1", ns->id());
7570
0
    }
7571
1.53k
    if (namespace_ids_map_.erase(ns->id()) < 1) {
7572
0
      LOG(WARNING) << Format("Could not remove namespace from ids map, id=$1", ns->id());
7573
0
    }
7574
1.53k
  }
7575
7576
  // Delete any permissions granted on this keyspace to any role. See comment in DeleteTable() for
7577
  // more details.
7578
1.53k
  string canonical_resource = get_canonical_keyspace(req->namespace_().name());
7579
1.53k
  RETURN_NOT_OK(permissions_manager_->RemoveAllPermissionsForResource(canonical_resource, resp));
7580
7581
1.53k
  LOG(INFO) << "Successfully deleted keyspace " << ns->ToString()
7582
1.53k
            << " per request from " << RequestorString(rpc);
7583
1.53k
  return Status::OK();
7584
1.53k
}
7585
7586
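
DoDeleteNamespace follows the copy-on-write discipline described at the top of this file: mutate a private dirty copy under the write lock, persist it to the sys-catalog, and only then Commit() to publish it to readers. A minimal sketch of that pattern, assuming simplified stand-in types (CowObject, Metadata) rather than the real yb copy-on-write classes:

#include <mutex>
#include <utility>

struct Metadata { int state = 0; };

class CowObject {
 public:
  // Writer path: copy committed -> mutate dirty -> persist -> commit swap.
  template <typename Mutator>
  void Write(Mutator&& mutate) {
    std::lock_guard<std::mutex> write_guard(write_mutex_);  // one writer at a time
    Metadata dirty = committed_;        // writer's private copy
    mutate(dirty);                      // e.g. set state to DELETING
    // ... persist `dirty` durably here; abort (drop `dirty`) on failure ...
    std::lock_guard<std::mutex> commit_guard(commit_mutex_);  // brief swap
    committed_ = std::move(dirty);      // the Commit() step
  }

  // Reader path: only the short commit lock, so reads never wait on disk I/O.
  Metadata Read() const {
    std::lock_guard<std::mutex> commit_guard(commit_mutex_);
    return committed_;
  }

 private:
  mutable std::mutex commit_mutex_;
  std::mutex write_mutex_;
  Metadata committed_;
};
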
0
void CatalogManager::DeleteYcqlDatabaseAsync(scoped_refptr<NamespaceInfo> database) {
7587
0
  TRACE("Locking keyspace");
7588
0
  auto l = database->LockForWrite();
7589
7590
  // Only empty namespace can be deleted.
7591
0
  TRACE("Looking for tables in the keyspace");
7592
0
  {
7593
0
    SharedLock lock(mutex_);
7594
0
    VLOG_WITH_FUNC(3) << "Acquired the catalog manager lock";
7595
7596
0
    for (const TableInfoMap::value_type& entry : *table_ids_map_) {
7597
0
      auto ltm = entry.second->LockForRead();
7598
7599
0
      if (!ltm->started_deleting() && ltm->namespace_id() == database->id()) {
7600
0
        LOG(WARNING) << "Cannot delete keyspace which has " << ltm->name()
7601
0
          << " with id=" << entry.second->id();
7602
0
        return;
7603
0
      }
7604
0
    }
7605
0
  }
7606
7607
  // Only empty namespace can be deleted.
7608
0
  TRACE("Looking for types in the keyspace");
7609
0
  {
7610
0
    SharedLock lock(mutex_);
7611
0
    VLOG_WITH_FUNC(3) << "Acquired the catalog manager lock";
7612
7613
0
    for (const UDTypeInfoMap::value_type& entry : udtype_ids_map_) {
7614
0
      auto ltm = entry.second->LockForRead();
7615
7616
0
      if (ltm->namespace_id() == database->id()) {
7617
0
        LOG(WARNING) << "Cannot delete keyspace which has type: " << ltm->name()
7618
0
          << " with id=" << entry.second->id();
7619
0
        return;
7620
0
      }
7621
0
    }
7622
0
  }
7623
7624
  // [Delete]. Skip the DELETING->DELETED state, since no tables are present in this namespace.
7625
0
  TRACE("Updating metadata on disk");
7626
  // Update sys-catalog.
7627
0
  Status s = sys_catalog_->Delete(leader_ready_term(), database);
7628
0
  if (!s.ok()) {
7629
    // The mutation will be aborted when 'l' exits the scope on early return.
7630
0
    s = s.CloneAndPrepend(Substitute("An error occurred while updating sys-catalog: $0",
7631
0
                                     s.ToString()));
7632
0
    LOG(WARNING) << s.ToString();
7633
0
    return;
7634
0
  }
7635
7636
  // Update the in-memory state.
7637
0
  TRACE("Committing in-memory state");
7638
0
  l.Commit();
7639
7640
  // Remove the namespace from all CatalogManager mappings.
7641
0
  {
7642
0
    LockGuard lock(mutex_);
7643
0
    namespace_names_mapper_[database->database_type()].erase(database->name());
7644
0
    if (namespace_ids_map_.erase(database->id()) < 1) {
7645
0
      LOG(WARNING) << Format("Could not remove namespace from maps, id=$1", database->id());
7646
0
    }
7647
0
  }
7648
7649
  // Delete any permissions granted on this keyspace to any role. See comment in DeleteTable() for
7650
  // more details.
7651
0
  string canonical_resource = get_canonical_keyspace(database->name());
7652
0
  DeleteNamespaceResponsePB resp;
7653
0
  s = permissions_manager_->RemoveAllPermissionsForResource(canonical_resource, &resp);
7654
0
  if (s.ok()) {
7655
0
    LOG(INFO) << "Successfully deleted keyspace " << database->ToString();
7656
0
  } else {
7657
0
    LOG(WARNING) << "Error deleting keyspace " << database->ToString() << ": " << s;
7658
0
  }
7659
0
}
7660
7661
Status CatalogManager::DeleteYsqlDatabase(const DeleteNamespaceRequestPB* req,
7662
                                          DeleteNamespaceResponsePB* resp,
7663
101
                                          rpc::RpcContext* rpc) {
7664
  // Lookup database.
7665
101
  auto database = VERIFY_NAMESPACE_FOUND(FindNamespace(req->namespace_()), resp);
7666
7667
  // Make sure this is a YSQL database.
7668
101
  if (database->database_type() != YQL_DATABASE_PGSQL) {
7669
    // A non-YSQL namespace is found, but the rpc requests to drop a YSQL database.
7670
0
    Status s = STATUS(NotFound, "YSQL database not found", database->name());
7671
0
    return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s);
7672
0
  }
7673
7674
  // Set the Namespace to DELETING.
7675
101
  TRACE("Locking database");
7676
101
  auto l = database->LockForWrite();
7677
101
  SysNamespaceEntryPB &metadata = database->mutable_metadata()->mutable_dirty()->pb;
7678
101
  if (metadata.state() == SysNamespaceEntryPB::RUNNING ||
7679
101
      metadata.state() == SysNamespaceEntryPB::FAILED) {
7680
101
    metadata.set_state(SysNamespaceEntryPB::DELETING);
7681
101
    RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), database));
7682
96
    TRACE("Marked keyspace for deletion in sys-catalog");
7683
    // Commit the namespace in-memory state.
7684
96
    l.Commit();
7685
96
  } else {
7686
0
    Status s = STATUS_SUBSTITUTE(IllegalState,
7687
0
        "Keyspace ($0) has invalid state ($1), aborting delete",
7688
0
        database->name(), metadata.state());
7689
0
    return SetupError(resp->mutable_error(), MasterErrorPB::INTERNAL_ERROR, s);
7690
0
  }
7691
7692
96
  return background_tasks_thread_pool_->SubmitFunc(
7693
96
    std::bind(&CatalogManager::DeleteYsqlDatabaseAsync, this, database));
7694
101
}
7695
7696
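
DeleteYsqlDatabase persists the DELETING transition before enqueueing the asynchronous cleanup, so a leader failover loses no work: the new leader sees DELETING and can resume. A minimal sketch of that ordering, with a detached thread standing in for background_tasks_thread_pool_ and all types hypothetical:

#include <functional>
#include <thread>

enum class NsState { kRunning, kDeleting, kDeleted, kFailed };

inline void StartDelete(NsState& state,
                        const std::function<void(const NsState&)>& persist,
                        const std::function<void()>& async_delete) {
  state = NsState::kDeleting;          // transient state seen by IsDeleteNamespaceDone
  persist(state);                      // durable first: survives a master failover
  std::thread(async_delete).detach();  // only then hand off the tablet cleanup
}
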
100
void CatalogManager::DeleteYsqlDatabaseAsync(scoped_refptr<NamespaceInfo> database) {
7697
100
  TEST_PAUSE_IF_FLAG(TEST_hang_on_namespace_transition);
7698
7699
  // Lock database before removing content.
7700
100
  TRACE("Locking database");
7701
100
  auto l = database->LockForWrite();
7702
100
  SysNamespaceEntryPB &metadata = database->mutable_metadata()->mutable_dirty()->pb;
7703
7704
  // A DELETED namespace has finished deletion but was tombstoned so its ID is not immediately
7705
  // reused. A restart is considered enough elapsed time, so we just remove it from the sys-catalog.
7706
100
  if (metadata.state() == SysNamespaceEntryPB::DELETED) {
7707
0
    Status s = sys_catalog_->Delete(leader_ready_term(), database);
7708
0
    WARN_NOT_OK(s, "SysCatalog DeleteItem for Namespace");
7709
0
    if (!s.ok()) {
7710
0
      return;
7711
0
    }
7712
100
  } else if (metadata.state() == SysNamespaceEntryPB::DELETING) {
7713
    // Delete all tables in the database.
7714
99
    TRACE("Delete all tables in YSQL database");
7715
99
    Status s = DeleteYsqlDBTables(database);
7716
99
    WARN_NOT_OK(s, "DeleteYsqlDBTables failed");
7717
99
    if (!s.ok()) {
7718
      // Move to FAILED so DeleteNamespace can be reissued by the user.
7719
4
      metadata.set_state(SysNamespaceEntryPB::FAILED);
7720
4
      l.Commit();
7721
4
      return;
7722
4
    }
7723
7724
    // Once all user-facing data has been offlined, move the Namespace to DELETED state.
7725
95
    metadata.set_state(SysNamespaceEntryPB::DELETED);
7726
95
    s = sys_catalog_->Upsert(leader_ready_term(), database);
7727
95
    WARN_NOT_OK(s, "SysCatalog Update for Namespace");
7728
95
    if (!s.ok()) {
7729
      // Move to FAILED so DeleteNamespace can be reissued by the user.
7730
1
      metadata.set_state(SysNamespaceEntryPB::FAILED);
7731
1
      l.Commit();
7732
1
      return;
7733
1
    }
7734
94
    TRACE("Marked keyspace as deleted in sys-catalog");
7735
94
  } else {
7736
1
    LOG(WARNING) << "Keyspace (" << database->name() << ") has invalid state ("
7737
1
                 << metadata.state() << "), aborting delete";
7738
1
    return;
7739
1
  }
7740
7741
  // Remove namespace from CatalogManager name mapping.  Will remove ID map after all Tables gone.
7742
94
  {
7743
94
    LockGuard lock(mutex_);
7744
94
    if (namespace_names_mapper_[database->database_type()].erase(database->name()) < 1) {
7745
0
      LOG(WARNING) << Format("Could not remove namespace from maps, name=$0, id=$1",
7746
0
                             database->name(), database->id());
7747
0
    }
7748
94
  }
7749
7750
  // Update the in-memory state.
7751
94
  TRACE("Committing in-memory state");
7752
94
  l.Commit();
7753
7754
  // DROP completed. Return status.
7755
94
  LOG(INFO) << "Successfully deleted YSQL database " << database->ToString();
7756
94
}
7757
7758
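
The async path above is a small state machine: DELETING either advances to the DELETED tombstone or parks in FAILED so the user can reissue DeleteNamespace. The transition, distilled into an illustrative sketch (names are not from the source):

enum class DbState { kDeleting, kDeleted, kFailed };

// One step of the delete state machine; only DELETING moves.
inline DbState StepDelete(DbState s, bool work_succeeded) {
  if (s != DbState::kDeleting) return s;      // DELETED is a tombstone; FAILED awaits a retry
  return work_succeeded ? DbState::kDeleted   // tables offlined, sys-catalog updated
                        : DbState::kFailed;   // DeleteNamespace can be reissued
}
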
// IMPORTANT: If modifying, consider updating DeleteTable(), the singular deletion API.
7759
99
Status CatalogManager::DeleteYsqlDBTables(const scoped_refptr<NamespaceInfo>& database) {
7760
99
  TabletInfoPtr sys_tablet_info;
7761
99
  vector<pair<scoped_refptr<TableInfo>, TableInfo::WriteLock>> tables;
7762
99
  std::unordered_set<TableId> sys_table_ids;
7763
99
  {
7764
    // Lock the catalog to iterate over table_ids_map_.
7765
99
    SharedLock lock(mutex_);
7766
7767
99
    sys_tablet_info = tablet_map_->find(kSysCatalogTabletId)->second;
7768
7769
    // Populate tables and sys_table_ids.
7770
63.7k
    for (const TableInfoMap::value_type& entry : *table_ids_map_) {
7771
63.7k
      scoped_refptr<TableInfo> table = entry.second;
7772
63.7k
      if (table->namespace_id() != database->id()) {
7773
55.5k
        continue;
7774
55.5k
      }
7775
8.13k
      auto l = table->LockForWrite();
7776
8.13k
      if (l->started_deleting()) {
7777
49
        continue;
7778
49
      }
7779
8.13k
      RSTATUS_DCHECK(
7780
8.08k
          !l->pb.is_pg_shared_table(), Corruption, "Shared table found in database");
7781
7782
8.08k
      if (IsSystemTable(*table)) {
7783
8.00k
        sys_table_ids.insert(table->id());
7784
8.00k
      }
7785
7786
      // For a regular (indexed) table, insert the table info and lock at the front of the list;
7787
      // for an index table, append to the end. This ordering commits and deletes each indexed
7788
      // table before its indexes.
7789
8.08k
      if (IsTable(l->pb)) {
7790
4.51k
        tables.insert(tables.begin(), {table, std::move(l)});
7791
4.51k
      } else {
7792
3.57k
        tables.push_back({table, std::move(l)});
7793
3.57k
      }
7794
8.08k
    }
7795
99
  }
7796
  // Remove the system tables from RAFT.
7797
99
  TRACE("Sending system table delete RPCs");
7798
8.00k
  for (auto &table_id : sys_table_ids) {
7799
8.00k
    RETURN_NOT_OK(sys_catalog_->DeleteYsqlSystemTable(table_id));
7800
8.00k
  }
7801
  // Remove the system tables from the system catalog TabletInfo.
7802
99
  RETURN_NOT_OK(RemoveTableIdsFromTabletInfo(sys_tablet_info, sys_table_ids));
7803
7804
  // Set all table states to DELETING as one batch RPC call.
7805
95
  TRACE("Sending delete table batch RPC to sys catalog");
7806
95
  vector<TableInfo *> tables_rpc;
7807
95
  tables_rpc.reserve(tables.size());
7808
8.08k
  for (auto &table_and_lock : tables) {
7809
8.08k
    tables_rpc.push_back(table_and_lock.first.get());
7810
8.08k
    auto &l = table_and_lock.second;
7811
    // Mark the table state as DELETING tablets.
7812
8.08k
    l.mutable_data()->set_state(SysTablesEntryPB::DELETING,
7813
8.08k
        Substitute("Started deleting at $0", LocalTimeAsString()));
7814
8.08k
  }
7815
  // Update all the table states in raft in bulk.
7816
95
  Status s = sys_catalog_->Upsert(leader_ready_term(), tables_rpc);
7817
95
  if (!s.ok()) {
7818
    // The mutation will be aborted when 'l' exits the scope on early return.
7819
0
    s = s.CloneAndPrepend(Substitute("An error occurred while updating sys tables: $0",
7820
0
                                     s.ToString()));
7821
0
    LOG(WARNING) << s.ToString();
7822
0
    return CheckIfNoLongerLeader(s);
7823
0
  }
7824
8.08k
  for (auto &table_and_lock : tables) {
7825
8.08k
    auto &table = table_and_lock.first;
7826
8.08k
    auto &l = table_and_lock.second;
7827
    // Cancel all table busywork and commit the DELETING change.
7828
8.08k
    l.Commit();
7829
8.08k
    table->AbortTasks();
7830
8.08k
  }
7831
7832
  // Batch remove all relevant CDC streams, handle after releasing Table locks.
7833
95
  TRACE("Deleting CDC streams on table");
7834
95
  vector<TableId> id_list;
7835
95
  id_list.reserve(tables.size());
7836
8.08k
  for (auto &table_and_lock : tables) {
7837
8.08k
    id_list.push_back(table_and_lock.first->id());
7838
8.08k
  }
7839
95
  RETURN_NOT_OK(DeleteCDCStreamsForTables(id_list));
7840
7841
  // Send a DeleteTablet() RPC request to each tablet replica in the table.
7842
8.08k
  for (auto &table_and_lock : tables) {
7843
8.08k
    auto &table = table_and_lock.first;
7844
    // TODO(pitr) undelete for YSQL tables
7845
8.08k
    RETURN_NOT_OK(DeleteTabletsAndSendRequests(table, {}));
7846
8.08k
  }
7847
7848
  // Invoke any background tasks and return (notably, table cleanup).
7849
95
  background_tasks_->Wake();
7850
95
  return Status::OK();
7851
95
}
7852
7853
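
The front-insert/push_back trick above produces a deletion order in which every indexed table precedes its indexes. The same idea sketched with std::deque (hypothetical types), which makes the two-ended insertion explicit:

#include <deque>
#include <string>

struct TableRef { std::string id; bool is_index; };

inline std::deque<TableRef> OrderForDeletion(const std::deque<TableRef>& all) {
  std::deque<TableRef> ordered;
  for (const TableRef& t : all) {
    if (t.is_index) {
      ordered.push_back(t);   // indexes are deleted last
    } else {
      ordered.push_front(t);  // indexed (plain) tables are deleted first
    }
  }
  return ordered;
}
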
// Get the information about an in-progress delete operation.
7854
Status CatalogManager::IsDeleteNamespaceDone(const IsDeleteNamespaceDoneRequestPB* req,
7855
1.74k
                                             IsDeleteNamespaceDoneResponsePB* resp) {
7856
1.74k
  auto ns_pb = req->namespace_();
7857
7858
  // Lookup the namespace and verify it exists.
7859
1.74k
  TRACE("Looking up keyspace");
7860
1.74k
  auto ns = FindNamespace(ns_pb);
7861
1.74k
  if (!ns.ok()) {
7862
    // Namespace no longer exists means success.
7863
1.55k
    LOG(INFO) << "Servicing IsDeleteNamespaceDone request for "
7864
1.55k
              << ns_pb.DebugString() << ": deleted (not found)";
7865
1.55k
    resp->set_done(true);
7866
1.55k
    return Status::OK();
7867
1.55k
  }
7868
7869
192
  TRACE("Locking keyspace");
7870
192
  auto l = (**ns).LockForRead();
7871
192
  auto& metadata = l->pb;
7872
7873
192
  if (metadata.state() == SysNamespaceEntryPB::DELETED) {
7874
73
    resp->set_done(true);
7875
119
  } else if (metadata.state() == SysNamespaceEntryPB::DELETING) {
7876
114
    resp->set_done(false);
7877
114
  } else {
7878
5
    Status s = STATUS_SUBSTITUTE(IllegalState,
7879
5
        "Servicing IsDeleteNamespaceDone request for $0: NOT deleted (state=$1)",
7880
5
        ns_pb.DebugString(), metadata.state());
7881
5
    LOG(WARNING) << s.ToString();
7882
    // Done != Successful.  We just want to let the user know the delete has finished processing.
7883
5
    resp->set_done(true);
7884
5
    return SetupError(resp->mutable_error(), MasterErrorPB::INTERNAL_ERROR, s);
7885
5
  }
7886
187
  return Status::OK();
7887
192
}
7888
7889
Status CatalogManager::AlterNamespace(const AlterNamespaceRequestPB* req,
7890
                                      AlterNamespaceResponsePB* resp,
7891
7
                                      rpc::RpcContext* rpc) {
7892
7
  LOG(INFO) << "Servicing AlterNamespace request from " << RequestorString(rpc)
7893
7
            << ": " << req->ShortDebugString();
7894
7895
7
  auto database = VERIFY_NAMESPACE_FOUND(FindNamespace(req->namespace_()), resp);
7896
7897
7
  if (req->namespace_().has_database_type() &&
7898
7
      database->database_type() != req->namespace_().database_type()) {
7899
0
    Status s = STATUS(NotFound, "Database not found", database->name());
7900
0
    return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s);
7901
0
  }
7902
7903
7
  TRACE("Locking database");
7904
7
  auto l = database->LockForWrite();
7905
7906
  // Don't allow an alter if the namespace isn't running.
7907
7
  if (l->pb.state() != SysNamespaceEntryPB::RUNNING) {
7908
1
    Status s = STATUS_SUBSTITUTE(TryAgain, "Namespace not running.  State = $0",
7909
1
                                 SysNamespaceEntryPB::State_Name(l->pb.state()));
7910
1
    return SetupError(resp->mutable_error(), NamespaceMasterError(l->pb.state()), s);
7911
1
  }
7912
7913
6
  const string old_name = l->pb.name();
7914
7915
6
  if (req->has_new_name() && req->new_name() != old_name) {
7916
6
    const string new_name = req->new_name();
7917
7918
    // Verify that the new name does not exist.
7919
6
    NamespaceIdentifierPB ns_identifier;
7920
6
    ns_identifier.set_name(new_name);
7921
6
    if (req->namespace_().has_database_type()) {
7922
4
      ns_identifier.set_database_type(req->namespace_().database_type());
7923
4
    }
7924
    // TODO: This check will only work for YSQL once we add support for YSQL namespaces in
7925
    // namespace_name_map (#1476).
7926
6
    LockGuard lock(mutex_);
7927
6
    TRACE("Acquired catalog manager lock");
7928
6
    auto ns = FindNamespaceUnlocked(ns_identifier);
7929
6
    if (ns.ok() && req->namespace_().has_database_type() &&
7930
6
        (**ns).database_type() == req->namespace_().database_type()) {
7931
0
      Status s = STATUS_SUBSTITUTE(AlreadyPresent, "Keyspace '$0' already exists", (**ns).name());
7932
0
      LOG(WARNING) << "Found keyspace: " << (**ns).id() << ". Failed altering keyspace with error: "
7933
0
                   << s << " Request:\n" << req->DebugString();
7934
0
      return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s);
7935
0
    }
7936
7937
6
    namespace_names_mapper_[req->namespace_().database_type()][new_name] = database;
7938
6
    namespace_names_mapper_[req->namespace_().database_type()].erase(old_name);
7939
7940
6
    l.mutable_data()->pb.set_name(new_name);
7941
6
  }
7942
7943
6
  RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), database));
7944
7945
6
  TRACE("Committing in-memory state");
7946
6
  l.Commit();
7947
7948
6
  LOG(INFO) << "Successfully altered keyspace " << req->namespace_().name()
7949
6
            << " per request from " << RequestorString(rpc);
7950
6
  return Status::OK();
7951
6
}
7952
7953
Status CatalogManager::ListNamespaces(const ListNamespacesRequestPB* req,
7954
5.13k
                                      ListNamespacesResponsePB* resp) {
7955
5.13k
  NamespaceInfoMap namespace_ids_copy;
7956
5.13k
  {
7957
5.13k
    SharedLock lock(mutex_);
7958
5.13k
    namespace_ids_copy = namespace_ids_map_;
7959
5.13k
  }
7960
7961
24.3k
  for (const auto& entry : namespace_ids_copy) {
7962
24.3k
    const auto& namespace_info = *entry.second;
7963
    // If the request asks for namespaces for a specific database type, filter by the type.
7964
24.3k
    if (req->has_database_type() && namespace_info.database_type() != req->database_type()) {
7965
848
      continue;
7966
848
    }
7967
    // Only return RUNNING namespaces.
7968
23.4k
    if (namespace_info.state() != SysNamespaceEntryPB::RUNNING) {
7969
162
      continue;
7970
162
    }
7971
7972
23.3k
    NamespaceIdentifierPB *ns = resp->add_namespaces();
7973
23.3k
    ns->set_id(namespace_info.id());
7974
23.3k
    ns->set_name(namespace_info.name());
7975
23.3k
    ns->set_database_type(namespace_info.database_type());
7976
23.3k
  }
7977
5.13k
  return Status::OK();
7978
5.13k
}
7979
7980
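
ListNamespaces copies the whole id map while holding the shared lock and filters the copy afterwards, trading an O(n) copy for a minimal critical section. A sketch of the pattern, with std::shared_mutex standing in for the catalog manager's mutex_:

#include <map>
#include <shared_mutex>
#include <string>
#include <vector>

inline std::vector<std::string> ListRunning(
    const std::map<std::string, bool>& by_id,  // id -> is_running (stand-in)
    std::shared_mutex& mu) {
  std::map<std::string, bool> snapshot;
  {
    std::shared_lock<std::shared_mutex> lock(mu);  // readers don't block readers
    snapshot = by_id;                              // copy inside the lock
  }
  std::vector<std::string> out;
  for (const auto& [id, running] : snapshot) {     // filter outside the lock
    if (running) out.push_back(id);
  }
  return out;
}
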
Status CatalogManager::GetNamespaceInfo(const GetNamespaceInfoRequestPB* req,
7981
                                        GetNamespaceInfoResponsePB* resp,
7982
6.03k
                                        rpc::RpcContext* rpc) {
7983
6.03k
  LOG(INFO) << __func__ << " from " << RequestorString(rpc) << ": " << req->ShortDebugString();
7984
7985
  // Look up the namespace and verify if it exists.
7986
6.03k
  TRACE("Looking up namespace");
7987
6.03k
  auto ns = VERIFY_NAMESPACE_FOUND(FindNamespace(req->namespace_()), resp);
7988
7989
6.03k
  resp->mutable_namespace_()->set_id(ns->id());
7990
6.03k
  resp->mutable_namespace_()->set_name(ns->name());
7991
6.03k
  resp->mutable_namespace_()->set_database_type(ns->database_type());
7992
6.03k
  resp->set_colocated(ns->colocated());
7993
6.03k
  return Status::OK();
7994
6.03k
}
7995
7996
Status CatalogManager::RedisConfigSet(
7997
182
    const RedisConfigSetRequestPB* req, RedisConfigSetResponsePB* resp, rpc::RpcContext* rpc) {
7998
182
  DCHECK(req->has_keyword());
7999
182
  const auto& key = req->keyword();
8000
182
  SysRedisConfigEntryPB config_entry;
8001
182
  config_entry.set_key(key);
8002
182
  *config_entry.mutable_args() = req->args();
8003
182
  bool created = false;
8004
8005
182
  TRACE("Acquired catalog manager lock");
8006
182
  LockGuard lock(mutex_);
8007
182
  scoped_refptr<RedisConfigInfo> cfg = FindPtrOrNull(redis_config_map_, req->keyword());
8008
182
  if (cfg == nullptr) {
8009
182
    created = true;
8010
182
    cfg = new RedisConfigInfo(key);
8011
182
    redis_config_map_[key] = cfg;
8012
182
  }
8013
8014
182
  auto wl = cfg->LockForWrite();
8015
182
  wl.mutable_data()->pb = std::move(config_entry);
8016
182
  if (created) {
8017
182
    CHECK_OK(sys_catalog_->Upsert(leader_ready_term(), cfg));
8018
182
  } else {
8019
0
    CHECK_OK(sys_catalog_->Upsert(leader_ready_term(), cfg));
8020
0
  }
8021
182
  wl.Commit();
8022
182
  return Status::OK();
8023
182
}
8024
8025
Status CatalogManager::RedisConfigGet(
8026
1.17k
    const RedisConfigGetRequestPB* req, RedisConfigGetResponsePB* resp, rpc::RpcContext* rpc) {
8027
1.17k
  DCHECK(req->has_keyword());
8028
1.17k
  resp->set_keyword(req->keyword());
8029
1.17k
  TRACE("Acquired catalog manager lock");
8030
1.17k
  SharedLock lock(mutex_);
8031
1.17k
  scoped_refptr<RedisConfigInfo> cfg = FindPtrOrNull(redis_config_map_, req->keyword());
8032
1.17k
  if (cfg == nullptr) {
8033
821
    Status s = STATUS_SUBSTITUTE(NotFound, "Redis config for $0 does not exist", req->keyword());
8034
821
    return SetupError(resp->mutable_error(), MasterErrorPB::REDIS_CONFIG_NOT_FOUND, s);
8035
821
  }
8036
355
  auto rci = cfg->LockForRead();
8037
355
  resp->mutable_args()->CopyFrom(rci->pb.args());
8038
355
  return Status::OK();
8039
1.17k
}
8040
8041
Status CatalogManager::CreateUDType(const CreateUDTypeRequestPB* req,
8042
                                    CreateUDTypeResponsePB* resp,
8043
47
                                    rpc::RpcContext* rpc) {
8044
47
  LOG(INFO) << "CreateUDType from " << RequestorString(rpc)
8045
47
            << ": " << req->DebugString();
8046
8047
47
  Status s;
8048
47
  scoped_refptr<UDTypeInfo> tp;
8049
47
  scoped_refptr<NamespaceInfo> ns;
8050
8051
  // Lookup the namespace and verify if it exists.
8052
47
  if (req->has_namespace_()) {
8053
47
    TRACE("Looking up namespace");
8054
47
    ns = VERIFY_NAMESPACE_FOUND(FindNamespace(req->namespace_()), resp);
8055
47
    if (ns->database_type() != YQLDatabase::YQL_DATABASE_CQL) {
8056
0
      Status s = STATUS(NotFound, "Namespace not found");
8057
0
      return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s);
8058
0
    }
8059
47
  }
8060
8061
  // Get all the referenced types (if any).
8062
47
  std::vector<std::string> referenced_udts;
8063
89
  for (const QLTypePB& field_type : req->field_types()) {
8064
89
    QLType::GetUserDefinedTypeIds(field_type, /* transitive = */ true, &referenced_udts);
8065
89
  }
8066
8067
47
  {
8068
47
    TRACE("Acquired catalog manager lock");
8069
47
    LockGuard lock(mutex_);
8070
8071
    // Verify that the type does not exist.
8072
47
    tp = FindPtrOrNull(udtype_names_map_, std::make_pair(ns->id(), req->name()));
8073
8074
47
    if (tp != nullptr) {
8075
1
      s = STATUS_SUBSTITUTE(AlreadyPresent,
8076
1
          "Type '$0.$1' already exists", ns->name(), req->name());
8077
1
      LOG(WARNING) << "Found type: " << tp->id() << ". Failed creating type with error: "
8078
1
                   << s.ToString() << " Request:\n" << req->DebugString();
8079
1
      return SetupError(resp->mutable_error(), MasterErrorPB::TYPE_ALREADY_PRESENT, s);
8080
1
    }
8081
8082
    // Verify that all referenced types actually exist.
8083
46
    for (const auto& udt_id : referenced_udts) {
8084
11
      if (FindPtrOrNull(udtype_ids_map_, udt_id) == nullptr) {
8085
          // This may be caused by a stale cache (e.g. referenced type name resolves to an old,
8086
          // deleted type). Return InvalidArgument so query layer will clear cache and retry.
8087
0
        s = STATUS_SUBSTITUTE(InvalidArgument,
8088
0
            "Type id '$0' referenced by type '$1' does not exist", udt_id, req->name());
8089
0
        return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s);
8090
0
      }
8091
11
    }
8092
8093
    // Construct the new type (generate fresh name and set fields).
8094
46
    UDTypeId new_id = GenerateIdUnlocked(SysRowEntryType::UDTYPE);
8095
46
    tp = new UDTypeInfo(new_id);
8096
46
    tp->mutable_metadata()->StartMutation();
8097
46
    SysUDTypeEntryPB *metadata = &tp->mutable_metadata()->mutable_dirty()->pb;
8098
46
    metadata->set_name(req->name());
8099
46
    metadata->set_namespace_id(ns->id());
8100
88
    for (const string& field_name : req->field_names()) {
8101
88
      metadata->add_field_names(field_name);
8102
88
    }
8103
8104
88
    for (const QLTypePB& field_type : req->field_types()) {
8105
88
      metadata->add_field_types()->CopyFrom(field_type);
8106
88
    }
8107
8108
    // Add the type to the in-memory maps.
8109
46
    udtype_ids_map_[tp->id()] = tp;
8110
46
    udtype_names_map_[std::make_pair(ns->id(), req->name())] = tp;
8111
46
    resp->set_id(tp->id());
8112
46
  }
8113
46
  TRACE("Inserted new user-defined type info into CatalogManager maps");
8114
8115
  // Update the on-disk system catalog.
8116
46
  s = sys_catalog_->Upsert(leader_ready_term(), tp);
8117
46
  if (!s.ok()) {
8118
0
    s = s.CloneAndPrepend(Substitute(
8119
0
        "An error occurred while inserting user-defined type to sys-catalog: $0", s.ToString()));
8120
0
    LOG(WARNING) << s.ToString();
8121
0
    return CheckIfNoLongerLeaderAndSetupError(s, resp);
8122
0
  }
8123
46
  TRACE("Wrote user-defined type to sys-catalog");
8124
8125
  // Commit the in-memory state.
8126
46
  tp->mutable_metadata()->CommitMutation();
8127
46
  LOG(INFO) << "Created user-defined type " << tp->ToString();
8128
46
  return Status::OK();
8129
46
}
8130
8131
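
CreateUDType rejects a type whose fields reference (even transitively) a UDT id that is no longer registered, returning InvalidArgument so the query layer can clear a stale cache and retry. A minimal sketch of that validation, with a plain set standing in for udtype_ids_map_:

#include <string>
#include <unordered_set>
#include <vector>

inline bool AllReferencedTypesExist(
    const std::vector<std::string>& referenced_udt_ids,
    const std::unordered_set<std::string>& registered_ids) {
  for (const std::string& id : referenced_udt_ids) {
    if (registered_ids.count(id) == 0) {
      return false;  // stale reference: the caller returns InvalidArgument
    }
  }
  return true;
}
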
Status CatalogManager::DeleteUDType(const DeleteUDTypeRequestPB* req,
8132
                                    DeleteUDTypeResponsePB* resp,
8133
54
                                    rpc::RpcContext* rpc) {
8134
54
  LOG(INFO) << "Servicing DeleteUDType request from " << RequestorString(rpc)
8135
54
            << ": " << req->ShortDebugString();
8136
8137
54
  scoped_refptr<UDTypeInfo> tp;
8138
54
  scoped_refptr<NamespaceInfo> ns;
8139
8140
54
  if (!req->has_type()) {
8141
0
    Status s = STATUS(InvalidArgument, "No type given", req->DebugString());
8142
0
    return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s);
8143
0
  }
8144
8145
  // Validate namespace.
8146
54
  if (req->type().has_namespace_()) {
8147
    // Lookup the namespace and verify if it exists.
8148
54
    TRACE("Looking up namespace");
8149
54
    ns = VERIFY_NAMESPACE_FOUND(FindNamespace(req->type().namespace_()), resp);
8150
54
  }
8151
8152
54
  {
8153
54
    LockGuard lock(mutex_);
8154
54
    TRACE("Acquired catalog manager lock");
8155
8156
54
    if (req->type().has_type_id()) {
8157
0
      tp = FindPtrOrNull(udtype_ids_map_, req->type().type_id());
8158
54
    } else if (req->type().has_type_name()) {
8159
54
      tp = FindPtrOrNull(udtype_names_map_, {ns->id(), req->type().type_name()});
8160
54
    }
8161
8162
54
    if (tp == nullptr) {
8163
2
      Status s = STATUS(NotFound, "The type does not exist", req->DebugString());
8164
2
      return SetupError(resp->mutable_error(), MasterErrorPB::TYPE_NOT_FOUND, s);
8165
2
    }
8166
8167
    // Checking if any table uses this type.
8168
    // TODO: this could be more efficient.
8169
982
    for (const TableInfoMap::value_type& entry : *table_ids_map_) {
8170
982
      auto ltm = entry.second->LockForRead();
8171
982
      if (!ltm->started_deleting()) {
8172
7.08k
        for (const auto &col : ltm->schema().columns()) {
8173
7.08k
          if (col.type().main() == DataType::USER_DEFINED_TYPE &&
8174
7.08k
              col.type().udtype_info().id() == tp->id()) {
8175
2
            Status s = STATUS(QLError,
8176
2
                Substitute("Cannot delete type '$0.$1'. It is used in column $2 of table $3",
8177
2
                    ns->name(), tp->name(), col.name(), ltm->name()));
8178
2
            return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s);
8179
2
          }
8180
7.08k
        }
8181
869
      }
8182
982
    }
8183
8184
    // Checking if any other type uses this type (i.e. in the case of nested types).
8185
    // TODO: this could be more efficient.
8186
73
    for (const UDTypeInfoMap::value_type& entry : udtype_ids_map_) {
8187
73
      auto ltm = entry.second->LockForRead();
8188
8189
203
      for (int i = 0; i < ltm->field_types_size(); i++) {
8190
        // Only need to check direct (non-transitive) type dependencies here.
8191
        // This also means we report more precise errors for in-use types.
8192
134
        if (QLType::DoesUserDefinedTypeIdExist(ltm->field_types(i),
8193
134
                                      false /* transitive */,
8194
134
                                      tp->id())) {
8195
4
          Status s = STATUS(QLError,
8196
4
              Substitute("Cannot delete type '$0.$1'. It is used in field $2 of type '$3'",
8197
4
                  ns->name(), tp->name(), ltm->field_names(i), ltm->name()));
8198
4
          return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s);
8199
4
        }
8200
134
      }
8201
73
    }
8202
50
  }
8203
8204
46
  auto l = tp->LockForWrite();
8205
8206
46
  Status s = sys_catalog_->Delete(leader_ready_term(), tp);
8207
46
  if (!s.ok()) {
8208
    // The mutation will be aborted when 'l' exits the scope on early return.
8209
0
    s = s.CloneAndPrepend(Substitute("An error occurred while updating sys-catalog: $0",
8210
0
        s.ToString()));
8211
0
    LOG(WARNING) << s.ToString();
8212
0
    return CheckIfNoLongerLeaderAndSetupError(s, resp);
8213
0
  }
8214
8215
  // Remove it from the maps.
8216
46
  {
8217
46
    TRACE("Removing from maps");
8218
46
    LockGuard lock(mutex_);
8219
46
    if (udtype_ids_map_.erase(tp->id()) < 1) {
8220
0
      PANIC_RPC(rpc, "Could not remove user defined type from map, name=" + l->name());
8221
0
    }
8222
46
    if (udtype_names_map_.erase({ns->id(), tp->name()}) < 1) {
8223
0
      PANIC_RPC(rpc, "Could not remove user defined type from map, name=" + l->name());
8224
0
    }
8225
46
  }
8226
8227
  // Update the in-memory state.
8228
46
  TRACE("Committing in-memory state");
8229
46
  l.Commit();
8230
8231
46
  LOG(INFO) << "Successfully deleted user-defined type " << tp->ToString()
8232
46
            << " per request from " << RequestorString(rpc);
8233
8234
46
  return Status::OK();
8235
46
}
8236
8237
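
Note that DeleteUDType checks only direct (non-transitive) field references: any transitive dependency chain onto the target type necessarily ends in a direct edge onto it, so a direct scan still blocks the delete while naming the nearest dependent. A sketch over a hypothetical adjacency map:

#include <string>
#include <unordered_map>
#include <vector>

// type id -> ids of UDTs referenced directly by its fields (stand-in graph).
using TypeGraph = std::unordered_map<std::string, std::vector<std::string>>;

inline bool HasDirectDependent(const TypeGraph& uses, const std::string& target) {
  for (const auto& entry : uses) {
    for (const std::string& ref : entry.second) {
      if (ref == target) return true;  // entry.first uses target in one of its fields
    }
  }
  return false;
}
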
Status CatalogManager::GetUDTypeInfo(const GetUDTypeInfoRequestPB* req,
8238
                                     GetUDTypeInfoResponsePB* resp,
8239
56
                                     rpc::RpcContext* rpc) {
8240
56
  LOG(INFO) << "GetUDTypeInfo from " << RequestorString(rpc)
8241
56
            << ": " << req->DebugString();
8242
56
  Status s;
8243
56
  scoped_refptr<UDTypeInfo> tp;
8244
56
  scoped_refptr<NamespaceInfo> ns;
8245
8246
56
  if (!req->has_type()) {
8247
0
    s = STATUS(InvalidArgument, "Cannot get type, no type identifier given", req->DebugString());
8248
0
    return SetupError(resp->mutable_error(), MasterErrorPB::TYPE_NOT_FOUND, s);
8249
0
  }
8250
8251
56
  if (req->type().has_type_id()) {
8252
0
    tp = FindPtrOrNull(udtype_ids_map_, req->type().type_id());
8253
56
  } else if (req->type().has_type_name() && req->type().has_namespace_()) {
8254
    // Lookup the type and verify if it exists.
8255
56
    TRACE("Looking up namespace");
8256
56
    ns = VERIFY_NAMESPACE_FOUND(FindNamespace(req->type().namespace_()), resp);
8257
8258
56
    tp = FindPtrOrNull(udtype_names_map_, std::make_pair(ns->id(), req->type().type_name()));
8259
56
  }
8260
8261
56
  if (tp == nullptr) {
8262
7
    s = STATUS(InvalidArgument, "Couldn't find type", req->DebugString());
8263
7
    return SetupError(resp->mutable_error(), MasterErrorPB::TYPE_NOT_FOUND, s);
8264
7
  }
8265
8266
49
  {
8267
49
    auto type_lock = tp->LockForRead();
8268
8269
49
    UDTypeInfoPB* type_info = resp->mutable_udtype();
8270
8271
49
    type_info->set_name(tp->name());
8272
49
    type_info->set_id(tp->id());
8273
49
    type_info->mutable_namespace_()->set_id(type_lock->namespace_id());
8274
8275
144
    for (int i = 0; i < type_lock->field_names_size(); i++) {
8276
95
      type_info->add_field_names(type_lock->field_names(i));
8277
95
    }
8278
144
    for (int i = 0; i < type_lock->field_types_size(); i++) {
8279
95
      type_info->add_field_types()->CopyFrom(type_lock->field_types(i));
8280
95
    }
8281
8282
49
    LOG(INFO) << "Retrieved user-defined type " << tp->ToString();
8283
49
  }
8284
49
  return Status::OK();
8285
56
}
8286
8287
Status CatalogManager::ListUDTypes(const ListUDTypesRequestPB* req,
8288
0
                                   ListUDTypesResponsePB* resp) {
8289
0
  SharedLock lock(mutex_);
8290
8291
  // Lookup the namespace and verify that it exists.
8292
0
  auto ns = VERIFY_NAMESPACE_FOUND(FindNamespaceUnlocked(req->namespace_()), resp);
8293
8294
0
  for (const UDTypeInfoByNameMap::value_type& entry : udtype_names_map_) {
8295
0
    auto ltm = entry.second->LockForRead();
8296
8297
    // key is a pair <namespace_id, type_name>.
8298
0
    if (!ns->id().empty() && ns->id() != entry.first.first) {
8299
0
      continue; // Skip types from other namespaces.
8300
0
    }
8301
8302
0
    UDTypeInfoPB* udtype = resp->add_udtypes();
8303
0
    udtype->set_id(entry.second->id());
8304
0
    udtype->set_name(ltm->name());
8305
0
    for (int i = 0; i < ltm->field_names_size(); i++) {
8306
0
      udtype->add_field_names(ltm->field_names(i));
8307
0
    }
8308
0
    for (int i = 0; i < ltm->field_types_size(); i++) {
8309
0
      udtype->add_field_types()->CopyFrom(ltm->field_types(i));
8310
0
    }
8311
8312
0
    if (CHECK_NOTNULL(ns.get())) {
8313
0
      auto l = ns->LockForRead();
8314
0
      udtype->mutable_namespace_()->set_id(ns->id());
8315
0
      udtype->mutable_namespace_()->set_name(ns->name());
8316
0
    }
8317
0
  }
8318
0
  return Status::OK();
8319
0
}
8320
8321
Status CatalogManager::DisableTabletSplitting(
8322
    const DisableTabletSplittingRequestPB* req, DisableTabletSplittingResponsePB* resp,
8323
0
    rpc::RpcContext* rpc) {
8324
0
  const MonoDelta disable_duration = MonoDelta::FromMilliseconds(req->disable_duration_ms());
8325
0
  tablet_split_manager_.DisableSplittingFor(disable_duration);
8326
0
  return Status::OK();
8327
0
}
8328
8329
Status CatalogManager::IsTabletSplittingComplete(
8330
    const IsTabletSplittingCompleteRequestPB* req, IsTabletSplittingCompleteResponsePB* resp,
8331
0
    rpc::RpcContext* rpc) {
8332
0
  TableInfoMap table_info_map;
8333
0
  {
8334
0
    SharedLock lock(mutex_);
8335
0
    table_info_map = *table_ids_map_;
8336
0
  }
8337
0
  resp->set_is_tablet_splitting_complete(
8338
0
      tablet_split_manager_.IsTabletSplittingComplete(table_info_map));
8339
0
  return Status::OK();
8340
0
}
8341
8342
// For non-enterprise builds, this is a no-op.
8343
0
Status CatalogManager::DeleteCDCStreamsForTable(const TableId& table) {
8344
0
  return Status::OK();
8345
0
}
8346
8347
0
Status CatalogManager::DeleteCDCStreamsForTables(const vector<TableId>& table_ids) {
8348
0
  return Status::OK();
8349
0
}
8350
8351
8352
0
bool CatalogManager::CDCStreamExistsUnlocked(const CDCStreamId& stream_id) {
8353
0
  return false;
8354
0
}
8355
8356
14
Result<uint64_t> CatalogManager::IncrementYsqlCatalogVersion() {
8357
8358
14
  auto l = CHECK_NOTNULL(ysql_catalog_config_.get())->LockForWrite();
8359
14
  uint64_t new_version = l->pb.ysql_catalog_config().version() + 1;
8360
14
  l.mutable_data()->pb.mutable_ysql_catalog_config()->set_version(new_version);
8361
8362
  // Write to sys_catalog and in memory.
8363
14
  RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), ysql_catalog_config_));
8364
14
  l.Commit();
8365
8366
14
  if (FLAGS_log_ysql_catalog_versions) {
8367
0
    LOG_WITH_FUNC(WARNING) << "set catalog version: " << new_version
8368
0
                           << " (using old protobuf method)";
8369
0
  }
8370
8371
14
  return new_version;
8372
14
}
8373
8374
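
IncrementYsqlCatalogVersion (like IncrementTransactionTablesVersion below) bumps a persisted counter with durable-write-then-commit ordering, so the in-memory value never runs ahead of the sys-catalog. A minimal sketch, with the persist callback standing in for sys_catalog_->Upsert():

#include <cstdint>
#include <functional>
#include <mutex>

class VersionedConfig {
 public:
  // Returns the committed version, unchanged if the durable write failed.
  uint64_t Increment(const std::function<bool(uint64_t)>& persist) {
    std::lock_guard<std::mutex> guard(mu_);
    const uint64_t next = version_ + 1;
    if (!persist(next)) return version_;  // durable write failed: keep old value
    version_ = next;                      // the Commit() step
    return version_;
  }

 private:
  std::mutex mu_;
  uint64_t version_ = 0;
};
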
747
Status CatalogManager::InitDbFinished(Status initdb_status, int64_t term) {
8375
747
  if (initdb_status.ok()) {
8376
747
    LOG(INFO) << "initdb completed successfully";
8377
747
  } else {
8378
0
    LOG(ERROR) << "initdb failed: " << initdb_status;
8379
0
  }
8380
8381
747
  auto l = CHECK_NOTNULL(ysql_catalog_config_.get())->LockForWrite();
8382
747
  auto* mutable_ysql_catalog_config = l.mutable_data()->pb.mutable_ysql_catalog_config();
8383
747
  mutable_ysql_catalog_config->set_initdb_done(true);
8384
747
  if (!initdb_status.ok()) {
8385
0
    mutable_ysql_catalog_config->set_initdb_error(initdb_status.ToString());
8386
747
  } else {
8387
747
    mutable_ysql_catalog_config->clear_initdb_error();
8388
747
  }
8389
8390
747
  RETURN_NOT_OK(sys_catalog_->Upsert(term, ysql_catalog_config_));
8391
747
  l.Commit();
8392
747
  return Status::OK();
8393
747
}
8394
8395
CHECKED_STATUS CatalogManager::IsInitDbDone(
8396
    const IsInitDbDoneRequestPB* req,
8397
2.21k
    IsInitDbDoneResponsePB* resp) {
8398
2.21k
  auto l = CHECK_NOTNULL(ysql_catalog_config_.get())->LockForRead();
8399
2.21k
  const auto& ysql_catalog_config = l->pb.ysql_catalog_config();
8400
2.21k
  resp->set_pg_proc_exists(pg_proc_exists_.load(std::memory_order_acquire));
8401
2.21k
  resp->set_done(ysql_catalog_config.initdb_done());
8402
2.21k
  if (ysql_catalog_config.has_initdb_error() &&
8403
2.21k
      !ysql_catalog_config.initdb_error().empty()) {
8404
0
    resp->set_initdb_error(ysql_catalog_config.initdb_error());
8405
0
  }
8406
2.21k
  return Status::OK();
8407
2.21k
}
8408
8409
Status CatalogManager::GetYsqlCatalogVersion(uint64_t* catalog_version,
8410
4.81M
                                             uint64_t* last_breaking_version) {
8411
4.81M
  auto table_info = GetTableInfo(kPgYbCatalogVersionTableId);
8412
4.81M
  if (table_info != nullptr) {
8413
349k
    RETURN_NOT_OK(sys_catalog_->ReadYsqlCatalogVersion(kPgYbCatalogVersionTableId,
8414
349k
                                                       catalog_version,
8415
349k
                                                       last_breaking_version));
8416
    // If the version is properly initialized, we're done.
8417
349k
    if ((!catalog_version || *catalog_version > 0) &&
8418
349k
        (!last_breaking_version || *last_breaking_version > 0)) {
8419
346k
      return Status::OK();
8420
346k
    }
8421
    // However, it's possible for a table to have no entries mid-migration or if migration fails.
8422
    // In this case we'd like to fall back to the legacy approach.
8423
349k
  }
8424
8425
4.46M
  auto l = ysql_catalog_config_->LockForRead();
8426
  // last_breaking_version is the last version (change) that invalidated ongoing transactions.
8427
  // If using the old (protobuf-based) version method, we do not have any information about
8428
  // breaking changes, so we assume every change is a breaking change.
8429
4.46M
  if (catalog_version) {
8430
4.46M
    *catalog_version = l->pb.ysql_catalog_config().version();
8431
4.46M
  }
8432
4.46M
  if (last_breaking_version) {
8433
4.46M
    *last_breaking_version = l->pb.ysql_catalog_config().version();
8434
4.46M
  }
8435
4.46M
  return Status::OK();
8436
4.81M
}
8437
8438
2.91k
Status CatalogManager::InitializeTransactionTablesConfig(int64_t term) {
8439
2.91k
  SysTransactionTablesConfigEntryPB transaction_tables_config;
8440
2.91k
  transaction_tables_config.set_version(0);
8441
8442
  // Create in memory objects.
8443
2.91k
  transaction_tables_config_ = new SysConfigInfo(kTransactionTablesConfigType);
8444
8445
  // Prepare write.
8446
2.91k
  auto l = transaction_tables_config_->LockForWrite();
8447
2.91k
  *l.mutable_data()->pb.mutable_transaction_tables_config() = std::move(transaction_tables_config);
8448
8449
  // Write to sys_catalog and in memory.
8450
2.91k
  RETURN_NOT_OK(sys_catalog_->Upsert(term, transaction_tables_config_));
8451
2.91k
  l.Commit();
8452
8453
2.91k
  return Status::OK();
8454
2.91k
}
8455
8456
1.09k
Status CatalogManager::IncrementTransactionTablesVersion() {
8457
1.09k
  auto l = CHECK_NOTNULL(transaction_tables_config_.get())->LockForWrite();
8458
1.09k
  uint64_t new_version = l->pb.transaction_tables_config().version() + 1;
8459
1.09k
  l.mutable_data()->pb.mutable_transaction_tables_config()->set_version(new_version);
8460
8461
  // Write to sys_catalog and in memory.
8462
1.09k
  RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), transaction_tables_config_));
8463
1.09k
  l.Commit();
8464
8465
1.09k
  LOG(INFO) << "Set transaction tables version: " << new_version;
8466
8467
1.09k
  return Status::OK();
8468
1.09k
}
8469
8470
4.81M
uint64_t CatalogManager::GetTransactionTablesVersion() {
8471
4.81M
  auto l = CHECK_NOTNULL(transaction_tables_config_.get())->LockForRead();
8472
4.81M
  return l->pb.transaction_tables_config().version();
8473
4.81M
}
8474
8475
93
Status CatalogManager::RegisterTsFromRaftConfig(const consensus::RaftPeerPB& peer) {
8476
93
  NodeInstancePB instance_pb;
8477
93
  instance_pb.set_permanent_uuid(peer.permanent_uuid());
8478
93
  instance_pb.set_instance_seqno(0);
8479
8480
93
  TSRegistrationPB registration_pb;
8481
93
  auto* common = registration_pb.mutable_common();
8482
93
  *common->mutable_private_rpc_addresses() = peer.last_known_private_addr();
8483
93
  *common->mutable_broadcast_addresses() = peer.last_known_broadcast_addr();
8484
93
  *common->mutable_cloud_info() = peer.cloud_info();
8485
8486
  // Todo(Rahul) : May need to be changed when we implement table level overrides.
8487
93
  {
8488
93
    auto l = ClusterConfig()->LockForRead();
8489
    // If the config has no replication info, use empty string for the placement uuid, otherwise
8490
    // calculate it from the reported peer.
8491
93
    auto placement_uuid = l->pb.has_replication_info()
8492
93
        ? VERIFY_RESULT(CatalogManagerUtil::GetPlacementUuidFromRaftPeer(
8493
93
                            l->pb.replication_info(), peer))
8494
93
      : "";
8495
0
    common->set_placement_uuid(placement_uuid);
8496
93
  }
8497
0
  return master_->ts_manager()->RegisterTS(instance_pb, registration_pb, master_->MakeCloudInfoPB(),
8498
93
                                           &master_->proxy_cache(),
8499
93
                                           RegisteredThroughHeartbeat::kFalse);
8500
93
}
8501
8502
void CatalogManager::ReconcileTabletReplicasInLocalMemoryWithReport(
8503
    const scoped_refptr<TabletInfo>& tablet,
8504
    const std::string& sender_uuid,
8505
    const ConsensusStatePB& consensus_state,
8506
113k
    const ReportedTabletPB& report) {
8507
113k
  auto replica_locations = std::make_shared<TabletReplicaMap>();
8508
113k
  auto prev_rl = tablet->GetReplicaLocations();
8509
8510
336k
  for (const consensus::RaftPeerPB& peer : consensus_state.config().peers()) {
8511
336k
    shared_ptr<TSDescriptor> ts_desc;
8512
336k
    if (!peer.has_permanent_uuid()) {
8513
0
      LOG_WITH_PREFIX(WARNING) << "Missing UUID for peer" << peer.ShortDebugString();
8514
0
      continue;
8515
0
    }
8516
336k
    if (!master_->ts_manager()->LookupTSByUUID(peer.permanent_uuid(), &ts_desc)) {
8517
93
      if (!GetAtomicFlag(&FLAGS_enable_register_ts_from_raft)) {
8518
0
        LOG_WITH_PREFIX(WARNING) << "Tablet server has never reported in. "
8519
0
        << "Not including in replica locations map yet. Peer: " << peer.ShortDebugString()
8520
0
        << "; Tablet: " << tablet->ToString();
8521
0
        continue;
8522
0
      }
8523
8524
93
      LOG_WITH_PREFIX(INFO) << "Tablet server has never reported in. Registering the ts using "
8525
93
                            << "the raft config. Peer: " << peer.ShortDebugString()
8526
93
                            << "; Tablet: " << tablet->ToString();
8527
93
      Status s = RegisterTsFromRaftConfig(peer);
8528
93
      if (!s.ok()) {
8529
9
        LOG_WITH_PREFIX(WARNING) << "Could not register ts from raft config: " << s
8530
9
                                 << " Skip updating the replica map.";
8531
9
        continue;
8532
9
      }
8533
8534
      // Guaranteed to find the ts since we just registered.
8535
84
      master_->ts_manager()->LookupTSByUUID(peer.permanent_uuid(), &ts_desc);
8536
84
      if (!ts_desc.get()) {
8537
0
        LOG_WITH_PREFIX(WARNING) << "Could not find ts with uuid " << peer.permanent_uuid()
8538
0
                                 << " after registering from raft config. Skip updating the replica"
8539
0
                                 << " map.";
8540
0
        continue;
8541
0
      }
8542
84
    }
8543
8544
    // Do not update replicas in the NOT_STARTED or BOOTSTRAPPING state (unless they are stale).
8545
336k
    bool use_existing = false;
8546
336k
    const TabletReplica* existing_replica = nullptr;
8547
336k
    auto it = prev_rl->find(ts_desc->permanent_uuid());
8548
336k
    if (it != prev_rl->end()) {
8549
192k
      existing_replica = &it->second;
8550
192k
    }
8551
336k
    if (existing_replica && 
peer.permanent_uuid() != sender_uuid192k
) {
8552
      // IsStarting returns true if state == NOT_STARTED or state == BOOTSTRAPPING.
8553
127k
      use_existing = existing_replica->IsStarting() && 
!existing_replica->IsStale()713
;
8554
127k
    }
8555
336k
    if (use_existing) {
8556
713
      InsertOrDie(replica_locations.get(), existing_replica->ts_desc->permanent_uuid(),
8557
713
          *existing_replica);
8558
336k
    } else {
8559
336k
      TabletReplica replica;
8560
336k
      CreateNewReplicaForLocalMemory(ts_desc.get(), &consensus_state, report, &replica);
8561
336k
      auto result = replica_locations.get()->insert({replica.ts_desc->permanent_uuid(), replica});
8562
336k
      LOG_IF(FATAL, !result.second) << "duplicate uuid: " << replica.ts_desc->permanent_uuid();
8563
336k
      if (existing_replica) {
8564
191k
        result.first->second.UpdateDriveInfo(existing_replica->drive_info);
8565
191k
      }
8566
336k
    }
8567
336k
  }
8568
8569
  // Update the local tablet replica set. This deviates from persistent state during bootstrapping.
8570
113k
  tablet->SetReplicaLocations(replica_locations);
8571
113k
  tablet_locations_version_.fetch_add(1, std::memory_order_acq_rel);
8572
113k
}
8573
8574
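
The reconcile loop above reuses an existing replica entry only when the report came from a different peer and the replica is still starting up and not stale; in every other case the freshly reported state wins. The decision, distilled into a standalone predicate (the booleans mirror IsStarting(), IsStale(), and the sender-uuid check):

inline bool UseExistingReplica(bool have_existing,
                               bool reported_by_same_peer,
                               bool is_starting,
                               bool is_stale) {
  if (!have_existing || reported_by_same_peer) {
    return false;  // no prior entry, or the peer reported about itself
  }
  return is_starting && !is_stale;  // don't clobber NOT_STARTED/BOOTSTRAPPING state
}
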
void CatalogManager::UpdateTabletReplicaInLocalMemory(TSDescriptor* ts_desc,
8575
                                                      const ConsensusStatePB* consensus_state,
8576
                                                      const ReportedTabletPB& report,
8577
327k
                                                      const scoped_refptr<TabletInfo>& tablet) {
8578
327k
  TabletReplica replica;
8579
327k
  CreateNewReplicaForLocalMemory(ts_desc, consensus_state, report, &replica);
8580
327k
  tablet->UpdateReplicaLocations(replica);
8581
327k
  tablet_locations_version_.fetch_add(1, std::memory_order_acq_rel);
8582
327k
}
8583
8584
void CatalogManager::CreateNewReplicaForLocalMemory(TSDescriptor* ts_desc,
8585
                                                    const ConsensusStatePB* consensus_state,
8586
                                                    const ReportedTabletPB& report,
8587
664k
                                                    TabletReplica* new_replica) {
8588
  // Tablets in state NOT_STARTED or BOOTSTRAPPING don't have a consensus.
8589
664k
  if (consensus_state == nullptr) {
8590
3.41k
    new_replica->role = PeerRole::NON_PARTICIPANT;
8591
3.41k
    new_replica->member_type = PeerMemberType::UNKNOWN_MEMBER_TYPE;
8592
660k
  } else {
8593
660k
    CHECK(consensus_state != nullptr) << "No cstate: " << ts_desc->permanent_uuid()
8594
2
                                      << " - " << report.state();
8595
660k
    new_replica->role = GetConsensusRole(ts_desc->permanent_uuid(), *consensus_state);
8596
660k
    new_replica->member_type = GetConsensusMemberType(ts_desc->permanent_uuid(), *consensus_state);
8597
660k
  }
8598
664k
  if (report.has_should_disable_lb_move()) {
8599
660k
    new_replica->should_disable_lb_move = report.should_disable_lb_move();
8600
660k
  }
8601
664k
  if (report.has_fs_data_dir()) {
8602
664k
    new_replica->fs_data_dir = report.fs_data_dir();
8603
664k
  }
8604
664k
  new_replica->state = report.state();
8605
664k
  new_replica->ts_desc = ts_desc;
8606
664k
  if (!ts_desc->registered_through_heartbeat()) {
8607
5.05k
    new_replica->time_updated = MonoTime::Now() - ts_desc->TimeSinceHeartbeat();
8608
5.05k
  }
8609
664k
}
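// Summary of the mapping above: a report with no consensus state (the replica
// is NOT_STARTED or BOOTSTRAPPING) is recorded as NON_PARTICIPANT with
// UNKNOWN_MEMBER_TYPE; otherwise role and member type are derived from the
// reported Raft config. For tservers registered from a Raft config rather than
// a heartbeat, time_updated is back-dated by TimeSinceHeartbeat(), presumably
// so the registration itself does not reset staleness tracking.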
8610
8611
Status CatalogManager::GetTabletPeer(const TabletId& tablet_id,
8612
2.79M
                                     std::shared_ptr<TabletPeer>* ret_tablet_peer) const {
8613
  // Note: CatalogManager has only one table, 'sys_catalog', with only
8614
  // one tablet.
8615
8616
2.79M
  if (PREDICT_FALSE(!IsInitialized())) {
8617
    // Master puts up the consensus service first and then initiates catalog manager's creation
8618
    // asynchronously. So this case is possible, but harmless. The RPC will simply be retried.
8619
    // Previously, because we weren't checking for this condition, we would fatal downstream.
8620
112
    const string& reason = "CatalogManager is not yet initialized";
8621
112
    YB_LOG_EVERY_N(WARNING, 1000) << reason;
8622
112
    return STATUS(ServiceUnavailable, reason);
8623
112
  }
8624
8625
18.4E
  CHECK(sys_catalog_) << "sys_catalog_ must be initialized!";
8626
8627
2.79M
  if (master_->opts().IsShellMode()) {
8628
181
    return STATUS_SUBSTITUTE(NotFound,
8629
181
        "In shell mode: no tablet_id $0 exists in CatalogManager.", tablet_id);
8630
181
  }
8631
8632
2.79M
  if (sys_catalog_->tablet_id() == tablet_id && sys_catalog_->tablet_peer().get() != nullptr &&
8633
2.79M
      sys_catalog_->tablet_peer()->CheckRunning().ok()) {
8634
2.79M
    *ret_tablet_peer = tablet_peer();
8635
18.4E
  } else {
8636
18.4E
    return STATUS_SUBSTITUTE(NotFound,
8637
18.4E
        "no SysTable in the RUNNING state exists with tablet_id $0 in CatalogManager", tablet_id);
8638
18.4E
  }
8639
2.79M
  return Status::OK();
8640
2.79M
}
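// As the comment above notes, the catalog manager serves exactly one tablet
// (the sys catalog), so the lookup reduces to an id comparison plus a running
// check. Callers see a retryable ServiceUnavailable while the master is still
// initializing, and NotFound in shell mode or when the sys catalog peer is not
// RUNNING.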
8641
8642
2.84M
const NodeInstancePB& CatalogManager::NodeInstance() const {
8643
2.84M
  return master_->instance_pb();
8644
2.84M
}
8645
8646
28.9k
Status CatalogManager::GetRegistration(ServerRegistrationPB* reg) const {
8647
28.9k
  return master_->GetRegistration(reg, server::RpcOnly::kTrue);
8648
28.9k
}
8649
8650
57
Status CatalogManager::UpdateMastersListInMemoryAndDisk() {
8651
57
  DCHECK(master_->opts().IsShellMode());
8652
8653
57
  if (!master_->opts().IsShellMode()) {
8654
0
    return STATUS(IllegalState, "Cannot update master's info when process is not in shell mode.");
8655
0
  }
8656
8657
57
  consensus::ConsensusStatePB consensus_state;
8658
57
  RETURN_NOT_OK(GetCurrentConfig(&consensus_state));
8659
8660
57
  if (!consensus_state.has_config()) {
8661
0
    return STATUS(NotFound, "No Raft config found.");
8662
0
  }
8663
8664
57
  RETURN_NOT_OK(sys_catalog_->ConvertConfigToMasterAddresses(consensus_state.config()));
8665
57
  RETURN_NOT_OK(sys_catalog_->CreateAndFlushConsensusMeta(master_->fs_manager(),
8666
57
                                                          consensus_state.config(),
8667
57
                                                          consensus_state.current_term()));
8668
8669
57
  return Status::OK();
8670
57
}
8671
8672
7.94k
Status CatalogManager::EnableBgTasks() {
8673
7.94k
  LockGuard lock(mutex_);
8674
  // Initialize refresh_ysql_tablespace_info_task_. This will be used to
8675
  // manage the background task that refreshes tablespace info. This task
8676
  // will be started by the CatalogManagerBgTasks below.
8677
7.94k
  refresh_ysql_tablespace_info_task_.Bind(&master_->messenger()->scheduler());
8678
8679
7.94k
  background_tasks_.reset(new CatalogManagerBgTasks(this));
8680
7.94k
  RETURN_NOT_OK_PREPEND(background_tasks_->Init(),
8681
7.94k
                        "Failed to initialize catalog manager background tasks");
8682
8683
  // Add bg thread to rebuild yql system partitions.
8684
7.94k
  refresh_yql_partitions_task_.Bind(&master_->messenger()->scheduler());
8685
8686
7.94k
  RETURN_NOT_OK(background_tasks_thread_pool_->SubmitFunc(
8687
7.94k
      [this]() { RebuildYQLSystemPartitions(); }));
8688
8689
7.94k
  return Status::OK();
8690
7.94k
}
8691
8692
163
Status CatalogManager::StartRemoteBootstrap(const StartRemoteBootstrapRequestPB& req) {
8693
163
  const TabletId& tablet_id = req.tablet_id();
8694
163
  std::unique_lock<std::mutex> l(remote_bootstrap_mtx_, std::try_to_lock);
8695
163
  if (!l.owns_lock()) {
8696
105
    return STATUS_SUBSTITUTE(AlreadyPresent,
8697
105
        "Remote bootstrap of tablet $0 already in progress", tablet_id);
8698
105
  }
8699
8700
58
  if (!master_->opts().IsShellMode()) {
8701
0
    return STATUS(IllegalState, "Cannot bootstrap a master which is not in shell mode.");
8702
0
  }
8703
8704
58
  LOG(INFO) << "Starting remote bootstrap: " << req.ShortDebugString();
8705
8706
58
  HostPort bootstrap_peer_addr = HostPortFromPB(DesiredHostPort(
8707
58
      req.source_broadcast_addr(), req.source_private_addr(), req.source_cloud_info(),
8708
58
      master_->MakeCloudInfoPB()));
8709
8710
58
  const string& bootstrap_peer_uuid = req.bootstrap_peer_uuid();
8711
58
  int64_t leader_term = req.caller_term();
8712
8713
58
  std::shared_ptr<TabletPeer> old_tablet_peer;
8714
58
  RaftGroupMetadataPtr meta;
8715
58
  bool replacing_tablet = false;
8716
8717
58
  if (tablet_exists_) {
8718
0
    old_tablet_peer = tablet_peer();
8719
    // Nothing to recover if the remote bootstrap client start failed the last time.
8720
0
    if (old_tablet_peer) {
8721
0
      meta = old_tablet_peer->tablet_metadata();
8722
0
      replacing_tablet = true;
8723
0
    }
8724
0
  }
8725
8726
58
  if (replacing_tablet) {
8727
    // Make sure the existing tablet peer is shut down and tombstoned.
8728
0
    RETURN_NOT_OK(tserver::HandleReplacingStaleTablet(meta,
8729
0
                                                      old_tablet_peer,
8730
0
                                                      tablet_id,
8731
0
                                                      master_->fs_manager()->uuid(),
8732
0
                                                      leader_term));
8733
0
  }
8734
8735
58
  LOG_WITH_PREFIX(INFO) << " Initiating remote bootstrap from peer " << bootstrap_peer_uuid
8736
58
            << " (" << bootstrap_peer_addr.ToString() << ").";
8737
8738
58
  auto rb_client = std::make_unique<tserver::RemoteBootstrapClient>(
8739
58
      tablet_id, master_->fs_manager());
8740
8741
  // Download and persist the remote superblock in TABLET_DATA_COPYING state.
8742
58
  if (replacing_tablet) {
8743
0
    RETURN_NOT_OK(rb_client->SetTabletToReplace(meta, leader_term));
8744
0
  }
8745
58
  RETURN_NOT_OK(rb_client->Start(
8746
58
      bootstrap_peer_uuid, &master_->proxy_cache(), bootstrap_peer_addr, &meta));
8747
  // This SetupTabletPeer is needed by rb_client to perform the remote bootstrap/fetch.
8748
  // And the SetupTablet below to perform "local bootstrap" cannot be done until the remote fetch
8749
  // has succeeded. So keeping them separate for now.
8750
58
  sys_catalog_->SetupTabletPeer(meta);
8751
58
  if (PREDICT_FALSE(FLAGS_TEST_inject_latency_during_remote_bootstrap_secs)) {
8752
1
    LOG(INFO) << "Injecting " << FLAGS_TEST_inject_latency_during_remote_bootstrap_secs
8753
1
              << " seconds of latency for test";
8754
1
    SleepFor(MonoDelta::FromSeconds(FLAGS_TEST_inject_latency_during_remote_bootstrap_secs));
8755
1
  }
8756
8757
  // From this point onward, the superblock is persisted in TABLET_DATA_COPYING
8758
  // state, and we need to tombstone the tablet if additional steps prior to
8759
  // getting to a TABLET_DATA_READY state fail.
8760
58
  tablet_exists_ = true;
8761
8762
  // Download all of the remote files.
8763
58
  TOMBSTONE_NOT_OK(rb_client->FetchAll(tablet_peer()->status_listener()),
8764
58
                   meta,
8765
58
                   master_->fs_manager()->uuid(),
8766
58
                   Substitute("Remote bootstrap: Unable to fetch data from remote peer $0 ($1)",
8767
58
                              bootstrap_peer_uuid, bootstrap_peer_addr.ToString()),
8768
58
                   nullptr);
8769
8770
  // Write out the last files to make the new replica visible and update the
8771
  // TabletDataState in the superblock to TABLET_DATA_READY.
8772
  // Finish() will call EndRemoteSession() and wait for the leader to successfully submit a
8773
  // ChangeConfig request (to change this master's role from PRE_VOTER or PRE_OBSERVER to VOTER or
8774
  // OBSERVER respectively). If the RPC times out, we will ignore the error (since the leader could
8775
  // have successfully submitted the ChangeConfig request and failed to respond in time)
8776
  // and check the committed config until we find that this master's role has changed, or until we
8777
  // time out which will cause us to tombstone the tablet.
8778
58
  TOMBSTONE_NOT_OK(rb_client->Finish(),
8779
58
                   meta,
8780
58
                   master_->fs_manager()->uuid(),
8781
58
                   "Remote bootstrap: Failed calling Finish()",
8782
58
                   nullptr);
8783
8784
  // Synchronous tablet open for "local bootstrap".
8785
58
  RETURN_NOT_OK(tserver::ShutdownAndTombstoneTabletPeerNotOk(
8786
58
      sys_catalog_->OpenTablet(meta), sys_catalog_->tablet_peer(), meta,
8787
58
      master_->fs_manager()->uuid(), "Remote bootstrap: Failed opening sys catalog"));
8788
8789
  // Set up the in-memory master list and also flush the cmeta.
8790
58
  RETURN_NOT_OK(UpdateMastersListInMemoryAndDisk());
8791
8792
58
  master_->SetShellMode(false);
8793
8794
  // Call VerifyChangeRoleSucceeded only after we have set shell mode to false. Otherwise,
8795
  // CatalogManager::GetTabletPeer will always return an error, and the consensus will never get
8796
  // updated.
8797
58
  auto status = rb_client->VerifyChangeRoleSucceeded(
8798
58
      sys_catalog_->tablet_peer()->shared_consensus());
8799
8800
58
  if (!status.ok()) {
8801
0
    LOG_WITH_PREFIX(WARNING) << "Remote bootstrap finished. "
8802
0
                             << "Failed calling VerifyChangeRoleSucceeded: "
8803
0
                             << status.ToString();
8804
58
  } else {
8805
58
    LOG_WITH_PREFIX(INFO) << "Remote bootstrap finished successfully";
8806
58
  }
8807
8808
58
  LOG(INFO) << "Master completed remote bootstrap and is out of shell mode.";
8809
8810
58
  RETURN_NOT_OK(EnableBgTasks());
8811
8812
58
  return Status::OK();
8813
58
}
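// The remote bootstrap sequence above, in order:
//   1. rb_client->Start()    - fetch and persist the superblock (TABLET_DATA_COPYING).
//   2. rb_client->FetchAll() - download data files; a failure tombstones the tablet.
//   3. rb_client->Finish()   - end the remote session and mark TABLET_DATA_READY.
//   4. sys_catalog_->OpenTablet(meta) - synchronous "local bootstrap".
//   5. UpdateMastersListInMemoryAndDisk() and SetShellMode(false).
//   6. VerifyChangeRoleSucceeded() - confirm the leader promoted this peer from
//      PRE_VOTER/PRE_OBSERVER to VOTER/OBSERVER; a failure here is only logged,
//      since the leader may still have committed the role change.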
8814
8815
CHECKED_STATUS CatalogManager::SendAlterTableRequest(const scoped_refptr<TableInfo>& table,
8816
9.83k
                                                     const AlterTableRequestPB* req) {
8817
9.83k
  auto tablets = table->GetTablets();
8818
8819
9.83k
  bool is_ysql_table_with_transaction_metadata =
8820
9.83k
      table->GetTableType() == TableType::PGSQL_TABLE_TYPE &&
8821
9.83k
      req != nullptr &&
8822
9.83k
      req->has_transaction() &&
8823
9.83k
      req->transaction().has_transaction_id();
8824
8825
9.83k
  bool alter_table_has_add_or_drop_column_step = false;
8826
9.83k
  if (req && (req->alter_schema_steps_size() || req->has_alter_properties())) {
8827
578
    for (const AlterTableRequestPB::Step& step : req->alter_schema_steps()) {
8828
571
      if (step.type() == AlterTableRequestPB::ADD_COLUMN ||
8829
571
          step.type() == AlterTableRequestPB::DROP_COLUMN) {
8830
542
        alter_table_has_add_or_drop_column_step = true;
8831
542
        break;
8832
542
      }
8833
571
    }
8834
578
  }
8835
8836
9.83k
  TransactionId txn_id = TransactionId::Nil();
8837
9.83k
  if (is_ysql_table_with_transaction_metadata && alter_table_has_add_or_drop_column_step) {
8838
390
    {
8839
390
      LOG(INFO) << "Persist transaction metadata into SysTableEntryPB for table ID " << table->id();
8840
390
      TRACE("Locking table");
8841
390
      auto l = table->LockForWrite();
8842
390
      auto& tablet_data = *l.mutable_data();
8843
390
      auto& table_pb = tablet_data.pb;
8844
390
      table_pb.mutable_transaction()->CopyFrom(req->transaction());
8845
8846
      // Update sys-catalog with the transaction ID.
8847
390
      TRACE("Updating table metadata on disk");
8848
390
      RETURN_NOT_OK(master_->catalog_manager_impl()->sys_catalog_->Upsert(
8849
390
          master_->catalog_manager()->leader_ready_term(), table.get()));
8850
8851
      // Update the in-memory state.
8852
390
      TRACE("Committing in-memory state");
8853
390
      l.Commit();
8854
390
    }
8855
390
    txn_id = VERIFY_RESULT(FullyDecodeTransactionId(req->transaction().transaction_id()));
8856
390
  }
8857
8858
27.7k
  for (const scoped_refptr<TabletInfo>& tablet : tablets) {
8859
27.7k
    auto call = std::make_shared<AsyncAlterTable>(master_, AsyncTaskPool(), tablet, table, txn_id);
8860
27.7k
    tablet->table()->AddTask(call);
8861
27.7k
    if (PREDICT_FALSE(FLAGS_TEST_slowdown_alter_table_rpcs_ms > 0)) {
8862
0
      LOG(INFO) << "Sleeping for " << tablet->id() << " "
8863
0
                << FLAGS_TEST_slowdown_alter_table_rpcs_ms
8864
0
                << "ms before sending async alter table request";
8865
0
      SleepFor(MonoDelta::FromMilliseconds(FLAGS_TEST_slowdown_alter_table_rpcs_ms));
8866
0
    }
8867
27.7k
    RETURN_NOT_OK(ScheduleTask(call));
8868
27.7k
  }
8869
9.83k
  return Status::OK();
8870
9.83k
}
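// Two-phase structure above: for YSQL alters that carry a transaction ID and
// an ADD_COLUMN/DROP_COLUMN step, the transaction metadata is first persisted
// into SysTableEntryPB via a sys-catalog Upsert (presumably so the pending DDL
// transaction can be resolved later), and only then is an AsyncAlterTable task
// fanned out per tablet, each tagged with the decoded txn_id (Nil otherwise).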
8871
8872
void CatalogManager::SendCopartitionTabletRequest(const scoped_refptr<TabletInfo>& tablet,
8873
0
                                                  const scoped_refptr<TableInfo>& table) {
8874
0
  auto call = std::make_shared<AsyncCopartitionTable>(master_, AsyncTaskPool(), tablet, table);
8875
0
  table->AddTask(call);
8876
0
  WARN_NOT_OK(ScheduleTask(call), "Failed to send copartition table request");
8877
0
}
8878
8879
Status CatalogManager::SendSplitTabletRequest(
8880
    const scoped_refptr<TabletInfo>& tablet, std::array<TabletId, kNumSplitParts> new_tablet_ids,
8881
140
    const std::string& split_encoded_key, const std::string& split_partition_key) {
8882
140
  VLOG(2) << "Scheduling SplitTablet request to leader tserver for source tablet ID: "
8883
0
          << tablet->tablet_id() << ", after-split tablet IDs: " << AsString(new_tablet_ids);
8884
140
  auto call = std::make_shared<AsyncSplitTablet>(
8885
140
      master_, AsyncTaskPool(), tablet, new_tablet_ids, split_encoded_key, split_partition_key,
8886
140
      &tablet_split_manager_);
8887
140
  tablet->table()->AddTask(call);
8888
140
  return ScheduleTask(call);
8889
140
}
8890
8891
void CatalogManager::DeleteTabletReplicas(
8892
24.9k
    TabletInfo* tablet, const std::string& msg, HideOnly hide_only) {
8893
24.9k
  auto locations = tablet->GetReplicaLocations();
8894
24.9k
  LOG(INFO) << "Sending DeleteTablet for " << locations->size()
8895
24.9k
            << " replicas of tablet " << tablet->tablet_id();
8896
73.9k
  for (const auto& r : *locations) {
8897
73.9k
    SendDeleteTabletRequest(tablet->tablet_id(), TABLET_DATA_DELETED, boost::none, tablet->table(),
8898
73.9k
                            r.second.ts_desc, msg, hide_only);
8899
73.9k
  }
8900
24.9k
}
8901
8902
13.9k
Status CatalogManager::CheckIfForbiddenToDeleteTabletOf(const scoped_refptr<TableInfo>& table) {
8903
  // Do not delete the system catalog tablet.
8904
13.9k
  if (IsSystemTable(*table)) {
8905
8.00k
    return STATUS(InvalidArgument, "It is not allowed to delete system tables");
8906
8.00k
  }
8907
  // Do not delete the tablet of a colocated table.
8908
5.97k
  if (table->IsColocatedUserTable()) {
8909
117
    return STATUS(InvalidArgument, "It is not allowed to delete tablets of the colocated tables.");
8910
117
  }
8911
5.85k
  return Status::OK();
8912
5.97k
}
8913
8914
Status CatalogManager::DeleteTabletsAndSendRequests(
8915
13.9k
    const TableInfoPtr& table, const RepeatedBytes& retained_by_snapshot_schedules) {
8916
  // Silently fail if tablet deletion is forbidden so table deletion can continue executing.
8917
13.9k
  if (!CheckIfForbiddenToDeleteTabletOf(table).ok()) {
8918
8.11k
    return Status::OK();
8919
8.11k
  }
8920
8921
5.85k
  auto tablets = table->GetTablets(IncludeInactive::kTrue);
8922
8923
46.7k
  std::sort(tablets.begin(), tablets.end(), [](const auto& lhs, const auto& rhs) {
8924
46.7k
    return lhs->tablet_id() < rhs->tablet_id();
8925
46.7k
  });
8926
8927
5.85k
  string deletion_msg = "Table deleted at " + LocalTimeAsString();
8928
5.85k
  RETURN_NOT_OK(DeleteTabletListAndSendRequests(
8929
5.85k
      tablets, deletion_msg, retained_by_snapshot_schedules));
8930
8931
5.85k
  if (table->IsColocatedParentTable()) {
8932
8
    LockGuard lock(mutex_);
8933
8
    colocated_tablet_ids_map_.erase(table->namespace_id());
8934
5.84k
  } else if (table->IsTablegroupParentTable()) {
8935
    // In the case of dropped tablegroup parent table, need to delete tablegroup info.
8936
52
    LockGuard lock(mutex_);
8937
52
    const auto& tablegroup_id = table_tablegroup_ids_map_[table->id()];
8938
52
    tablegroup_ids_map_.erase(tablegroup_id);
8939
52
    tablegroup_tablet_ids_map_[table->namespace_id()].erase(tablegroup_id);
8940
52
    table_tablegroup_ids_map_.erase(table->id());
8941
52
  }
8942
5.85k
  return Status::OK();
8943
5.85k
}
8944
8945
Status CatalogManager::DeleteTabletListAndSendRequests(
8946
    const std::vector<scoped_refptr<TabletInfo>>& tablets, const std::string& deletion_msg,
8947
5.85k
    const google::protobuf::RepeatedPtrField<std::string>& retained_by_snapshot_schedules) {
8948
5.85k
  struct TabletData {
8949
5.85k
    TabletInfoPtr tablet;
8950
5.85k
    TabletInfo::WriteLock lock;
8951
5.85k
    HideOnly hide_only;
8952
5.85k
  };
8953
5.85k
  std::vector<TabletData> tablets_data;
8954
5.85k
  tablets_data.reserve(tablets.size());
8955
5.85k
  std::vector<TabletInfo*> tablet_infos;
8956
5.85k
  tablet_infos.reserve(tablets_data.size());
8957
5.85k
  std::vector<TabletInfoPtr> marked_as_hidden;
8958
8959
  // Grab tablets and tablet write locks. The list should already be in tablet_id sorted order.
8960
5.85k
  {
8961
5.85k
    SharedLock read_lock(mutex_);
8962
24.9k
    for (const auto& tablet : tablets) {
8963
24.9k
      tablets_data.push_back(TabletData {
8964
24.9k
        .tablet = tablet,
8965
24.9k
        .lock = tablet->LockForWrite(),
8966
        // Hide tablet if it is retained by snapshot schedule, or is part of a cdc stream.
8967
24.9k
        .hide_only = HideOnly(!retained_by_snapshot_schedules.empty()),
8968
24.9k
      });
8969
24.9k
      if (!tablets_data.back().hide_only) {
8970
        // Also check if this tablet is part of a cdc stream and is not already hidden. If this is
8971
        // a cdc stream producer and is already hidden, then we should delete this tablet.
8972
24.9k
        tablets_data.back().hide_only = HideOnly(
8973
24.9k
            IsTableCdcProducer(*tablet->table()) && !tablets_data.back().lock->ListedAsHidden());
8974
24.9k
      }
8975
8976
24.9k
      tablet_infos.emplace_back(tablet.get());
8977
24.9k
    }
8978
5.85k
  }
8979
8980
  // Use the same hybrid time for all hidden tablets.
8981
5.85k
  HybridTime hide_hybrid_time = master_->clock()->Now();
8982
8983
  // Mark the tablets as deleted.
8984
24.9k
  for (auto& tablet_data : tablets_data) {
8985
24.9k
    auto& tablet = tablet_data.tablet;
8986
24.9k
    auto& tablet_lock = tablet_data.lock;
8987
8988
24.9k
    bool was_hidden = tablet_lock->ListedAsHidden();
8989
    // Inactive tablet now, so remove it from partitions_.
8990
    // After all the tablets have been deleted from the tservers, we remove it from tablets_.
8991
24.9k
    tablet->table()->RemoveTablet(tablet->id(), DeactivateOnly::kTrue);
8992
8993
24.9k
    if (tablet_data.hide_only) {
8994
12
      LOG(INFO) << "Hiding tablet " << tablet->tablet_id();
8995
12
      tablet_lock.mutable_data()->pb.set_hide_hybrid_time(hide_hybrid_time.ToUint64());
8996
12
      *tablet_lock.mutable_data()->pb.mutable_retained_by_snapshot_schedules() =
8997
12
          retained_by_snapshot_schedules;
8998
24.9k
    } else {
8999
24.9k
      LOG(INFO) << "Deleting tablet " << tablet->tablet_id();
9000
24.9k
      tablet_lock.mutable_data()->set_state(SysTabletsEntryPB::DELETED, deletion_msg);
9001
24.9k
    }
9002
24.9k
    if (tablet_lock->ListedAsHidden() && !was_hidden) {
9003
12
      marked_as_hidden.push_back(tablet);
9004
12
    }
9005
24.9k
  }
9006
9007
  // Update all the tablet states in raft in bulk.
9008
5.85k
  RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), tablet_infos));
9009
9010
  // Commit the change.
9011
24.9k
  for (auto& tablet_data : tablets_data) {
9012
24.9k
    auto& tablet = tablet_data.tablet;
9013
24.9k
    auto& tablet_lock = tablet_data.lock;
9014
9015
24.9k
    tablet_lock.Commit();
9016
24.9k
    LOG(INFO) << (tablet_data.hide_only ? "Hid tablet " : "Deleted tablet ") << tablet->tablet_id();
9017
9018
24.9k
    DeleteTabletReplicas(tablet.get(), deletion_msg, tablet_data.hide_only);
9019
24.9k
  }
9020
9021
5.85k
  if (!marked_as_hidden.empty()) {
9022
4
    LockGuard lock(mutex_);
9023
4
    hidden_tablets_.insert(hidden_tablets_.end(), marked_as_hidden.begin(), marked_as_hidden.end());
9024
4
  }
9025
9026
5.85k
  return Status::OK();
9027
5.85k
}
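// Ordering notes for the function above: tablet write locks are taken under a
// shared mutex_ in tablet_id-sorted order, the DELETED/hidden state is first
// persisted through a single bulk sys-catalog Upsert, and only after each lock
// commits are the per-replica DeleteTablet RPCs issued. Hide-only tablets keep
// their data (hide_hybrid_time plus the retaining snapshot schedules) instead
// of being deleted, and are appended to hidden_tablets_ at the end.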
9028
9029
void CatalogManager::SendDeleteTabletRequest(
9030
    const TabletId& tablet_id,
9031
    TabletDataState delete_type,
9032
    const boost::optional<int64_t>& cas_config_opid_index_less_or_equal,
9033
    const scoped_refptr<TableInfo>& table,
9034
    TSDescriptor* ts_desc,
9035
    const string& reason,
9036
73.9k
    bool hide_only) {
9037
73.9k
  if (PREDICT_FALSE(GetAtomicFlag(&FLAGS_TEST_disable_tablet_deletion))) {
9038
3
    return;
9039
3
  }
9040
73.9k
  LOG_WITH_PREFIX(INFO)
9041
73.9k
      << (hide_only ? "Hiding" : "Deleting") << " tablet " << tablet_id << " on peer "
9042
73.9k
      << ts_desc->permanent_uuid() << " with delete type "
9043
73.9k
      << TabletDataState_Name(delete_type) << " (" << reason << ")";
9044
73.9k
  auto call = std::make_shared<AsyncDeleteReplica>(master_, AsyncTaskPool(),
9045
73.9k
      ts_desc->permanent_uuid(), table, tablet_id, delete_type,
9046
73.9k
      cas_config_opid_index_less_or_equal, reason);
9047
73.9k
  if (hide_only) {
9048
36
    call->set_hide_only(hide_only);
9049
36
  }
9050
73.9k
  if (table != nullptr) {
9051
73.9k
    table->AddTask(call);
9052
73.9k
  }
9053
9054
73.9k
  auto status = ScheduleTask(call);
9055
73.9k
  WARN_NOT_OK(status, Substitute("Failed to send delete request for tablet $0", tablet_id));
9056
  // TODO(bogdan): does the pending delete semantics need to change?
9057
73.9k
  if (status.ok()) {
9058
73.9k
    ts_desc->AddPendingTabletDelete(tablet_id);
9059
73.9k
  }
9060
73.9k
}
9061
9062
void CatalogManager::SendLeaderStepDownRequest(
9063
    const scoped_refptr<TabletInfo>& tablet, const ConsensusStatePB& cstate,
9064
    const string& change_config_ts_uuid, bool should_remove,
9065
54.0k
    const string& new_leader_uuid) {
9066
54.0k
  auto task = std::make_shared<AsyncTryStepDown>(
9067
54.0k
      master_, AsyncTaskPool(), tablet, cstate, change_config_ts_uuid, should_remove,
9068
54.0k
      new_leader_uuid);
9069
54.0k
  tablet->table()->AddTask(task);
9070
54.0k
  Status status = ScheduleTask(task);
9071
54.0k
  WARN_NOT_OK(status, Substitute("Failed to send new $0 request", task->type_name()));
9072
54.0k
}
9073
9074
// TODO: refactor this into a joint method with the add one.
9075
void CatalogManager::SendRemoveServerRequest(
9076
    const scoped_refptr<TabletInfo>& tablet, const ConsensusStatePB& cstate,
9077
1.71k
    const string& change_config_ts_uuid) {
9078
  // Check if the user wants the leader to be stepped down.
9079
1.71k
  auto task = std::make_shared<AsyncRemoveServerTask>(
9080
1.71k
      master_, AsyncTaskPool(), tablet, cstate, change_config_ts_uuid);
9081
1.71k
  tablet->table()->AddTask(task);
9082
1.71k
  WARN_NOT_OK(ScheduleTask(task), Substitute("Failed to send new $0 request", task->type_name()));
9083
1.71k
}
9084
9085
void CatalogManager::SendAddServerRequest(
9086
    const scoped_refptr<TabletInfo>& tablet, PeerMemberType member_type,
9087
2.17k
    const ConsensusStatePB& cstate, const string& change_config_ts_uuid) {
9088
2.17k
  auto task = std::make_shared<AsyncAddServerTask>(master_, AsyncTaskPool(), tablet, member_type,
9089
2.17k
      cstate, change_config_ts_uuid);
9090
2.17k
  tablet->table()->AddTask(task);
9091
2.17k
  WARN_NOT_OK(
9092
2.17k
      ScheduleTask(task),
9093
2.17k
      Substitute("Failed to send AddServer of tserver $0 to tablet $1",
9094
2.17k
                 change_config_ts_uuid, tablet.get()->ToString()));
9095
2.17k
}
9096
9097
void CatalogManager::GetPendingServerTasksUnlocked(
9098
    const TableId &table_uuid,
9099
    TabletToTabletServerMap *add_replica_tasks_map,
9100
    TabletToTabletServerMap *remove_replica_tasks_map,
9101
891k
    TabletToTabletServerMap *stepdown_leader_tasks_map) {
9102
9103
891k
  auto table = GetTableInfoUnlocked(table_uuid);
9104
891k
  for (const auto& task : table->GetTasks()) {
9105
128k
    TabletToTabletServerMap* outputMap = nullptr;
9106
128k
    if (task->type() == MonitoredTask::ASYNC_ADD_SERVER) {
9107
415
      outputMap = add_replica_tasks_map;
9108
127k
    } else if (task->type() == MonitoredTask::ASYNC_REMOVE_SERVER) {
9109
662
      outputMap = remove_replica_tasks_map;
9110
127k
    } else if (task->type() == MonitoredTask::ASYNC_TRY_STEP_DOWN) {
9111
      // Store new_leader_uuid instead of change_config_ts_uuid.
9112
569
      auto raft_task = static_cast<AsyncTryStepDown*>(task.get());
9113
569
      (*stepdown_leader_tasks_map)[raft_task->tablet_id()] = raft_task->new_leader_uuid();
9114
569
      continue;
9115
569
    }
9116
127k
    if (outputMap) {
9117
1.07k
      auto raft_task = static_cast<CommonInfoForRaftTask*>(task.get());
9118
1.07k
      (*outputMap)[raft_task->tablet_id()] = raft_task->change_config_ts_uuid();
9119
1.07k
    }
9120
127k
  }
9121
891k
}
9122
9123
void CatalogManager::ExtractTabletsToProcess(
9124
    TabletInfos *tablets_to_delete,
9125
1.56M
    TableToTabletInfos *tablets_to_process) {
9126
1.56M
  SharedLock lock(mutex_);
9127
9128
  // TODO: At the moment we loop through all the tablets
9129
  //       we can keep a set of tablets waiting for "assignment"
9130
  //       or just a counter to avoid taking the lock and looping through the tablets
9131
  //       if everything is "stable".
9132
9133
34.3M
  for (const TabletInfoMap::value_type& entry : *tablet_map_) {
9134
34.3M
    scoped_refptr<TabletInfo> tablet = entry.second;
9135
34.3M
    auto table = tablet->table();
9136
34.3M
    if (!table) {
9137
      // Tablet is orphaned or in preparing state, continue.
9138
0
      continue;
9139
0
    }
9140
9141
    // Acquire the table lock before the tablet lock.
9142
34.3M
    auto table_lock = table->LockForRead();
9143
34.3M
    auto tablet_lock = tablet->LockForRead();
9144
9145
    // If the table is deleted or the tablet was replaced at table creation time.
9146
34.3M
    if (tablet_lock->is_deleted() || table_lock->started_deleting()) {
9147
      // Process this table deletion only once (tombstones for table may remain longer).
9148
2.04M
      if (table_ids_map_->find(tablet->table()->id()) != table_ids_map_->end()) {
9149
2.04M
        tablets_to_delete->push_back(tablet);
9150
2.04M
      }
9151
      // Don't process deleted tables regardless.
9152
2.04M
      continue;
9153
2.04M
    }
9154
9155
    // Running tablets.
9156
32.2M
    if (tablet_lock->is_running()) {
9157
      // TODO: handle last update > not responding timeout?
9158
32.1M
      continue;
9159
32.1M
    }
9160
9161
    // Tablets not yet assigned or with a report just received.
9162
86.9k
    (*tablets_to_process)[tablet->table()->id()].push_back(tablet);
9163
86.9k
  }
9164
1.56M
}
9165
9166
1.51M
bool CatalogManager::AreTablesDeleting() {
9167
1.51M
  SharedLock lock(mutex_);
9168
9169
79.7M
  for (const TableInfoMap::value_type& entry : *table_ids_map_) {
9170
79.7M
    scoped_refptr<TableInfo> table(entry.second);
9171
79.7M
    auto table_lock = table->LockForRead();
9172
    // TODO(jason): possibly change this to started_deleting when we begin removing DELETED tables
9173
    // from table_ids_map_ (see CleanUpDeletedTables).
9174
79.7M
    if (table_lock->is_deleting()) {
9175
48
      return true;
9176
48
    }
9177
79.7M
  }
9178
1.51M
  return false;
9179
1.51M
}
9180
9181
struct DeferredAssignmentActions {
9182
  std::vector<TabletInfo*> modified_tablets;
9183
  std::vector<TabletInfo*> needs_create_rpc;
9184
};
9185
9186
void CatalogManager::HandleAssignPreparingTablet(TabletInfo* tablet,
9187
48.5k
                                                 DeferredAssignmentActions* deferred) {
9188
  // The tablet was just created (probably by a CreateTable RPC).
9189
  // Update the state to "creating" to be ready for the creation request.
9190
48.5k
  tablet->mutable_metadata()->mutable_dirty()->set_state(
9191
48.5k
    SysTabletsEntryPB::CREATING, "Sending initial creation of tablet");
9192
48.5k
  deferred->modified_tablets.push_back(tablet);
9193
48.5k
  deferred->needs_create_rpc.push_back(tablet);
9194
48.5k
  VLOG(1) << "Assign new tablet " << tablet->ToString();
9195
48.5k
}
9196
9197
void CatalogManager::HandleAssignCreatingTablet(TabletInfo* tablet,
9198
                                                DeferredAssignmentActions* deferred,
9199
38.3k
                                                vector<scoped_refptr<TabletInfo>>* new_tablets) {
9200
38.3k
  MonoDelta time_since_updated =
9201
38.3k
      MonoTime::Now().GetDeltaSince(tablet->last_update_time());
9202
38.3k
  int64_t remaining_timeout_ms =
9203
38.3k
      FLAGS_tablet_creation_timeout_ms - time_since_updated.ToMilliseconds();
9204
9205
38.3k
  if (tablet->LockForRead()->pb.has_split_parent_tablet_id()) {
9206
    // No need to recreate post-split tablets, since this is always done on source tablet replicas.
9207
405
    VLOG(2) << "Post-split tablet " << AsString(tablet) << " still being created.";
9208
405
    return;
9209
405
  }
9210
  // Skip the tablet if the assignment timeout is not yet expired.
9211
37.9k
  if (remaining_timeout_ms > 0) {
9212
37.9k
    VLOG(2) << "Tablet " << tablet->ToString() << " still being created. "
9213
0
            << remaining_timeout_ms << "ms remain until timeout.";
9214
37.9k
    return;
9215
37.9k
  }
9216
9217
10
  const PersistentTabletInfo& old_info = tablet->metadata().state();
9218
9219
  // The "tablet creation" was already sent, but we didn't receive an answer
9220
  // within the timeout. So the tablet will be replaced by a new one.
9221
10
  TabletInfoPtr replacement;
9222
10
  {
9223
10
    LockGuard lock(mutex_);
9224
10
    replacement = CreateTabletInfo(tablet->table().get(), old_info.pb.partition());
9225
10
  }
9226
10
  LOG(WARNING) << "Tablet " << tablet->ToString() << " was not created within "
9227
10
               << "the allowed timeout. Replacing with a new tablet "
9228
10
               << replacement->tablet_id();
9229
9230
10
  tablet->table()->ReplaceTablet(tablet, replacement);
9231
10
  {
9232
10
    LockGuard lock(mutex_);
9233
10
    auto tablet_map_checkout = tablet_map_.CheckOut();
9234
10
    (*tablet_map_checkout)[replacement->tablet_id()] = replacement;
9235
10
  }
9236
9237
  // Mark old tablet as replaced.
9238
10
  tablet->mutable_metadata()->mutable_dirty()->set_state(
9239
10
    SysTabletsEntryPB::REPLACED,
9240
10
    Substitute("Replaced by $0 at $1",
9241
10
               replacement->tablet_id(), LocalTimeAsString()));
9242
9243
  // Mark new tablet as being created.
9244
10
  replacement->mutable_metadata()->mutable_dirty()->set_state(
9245
10
    SysTabletsEntryPB::CREATING,
9246
10
    Substitute("Replacement for $0", tablet->tablet_id()));
9247
9248
10
  deferred->modified_tablets.push_back(tablet);
9249
10
  deferred->modified_tablets.push_back(replacement.get());
9250
10
  deferred->needs_create_rpc.push_back(replacement.get());
9251
10
  VLOG(1) << "Replaced tablet " << tablet->tablet_id()
9252
0
          << " with " << replacement->tablet_id()
9253
0
          << " (table " << tablet->table()->ToString() << ")";
9254
9255
10
  new_tablets->push_back(replacement);
9256
10
}
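// Replacement flow above: if a CREATING tablet has not reported back within
// FLAGS_tablet_creation_timeout_ms (and is not a post-split tablet), a new
// TabletInfo with the same partition is created, the old one is marked
// REPLACED and the new one CREATING, and both are queued in 'deferred' so the
// state change is persisted and a fresh CreateTablet RPC goes out.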
9257
9258
// TODO: we could batch the IO onto a background thread.
9259
Status CatalogManager::HandleTabletSchemaVersionReport(
9260
97.4k
    TabletInfo *tablet, uint32_t version, const scoped_refptr<TableInfo>& table_info) {
9261
97.4k
  scoped_refptr<TableInfo> table;
9262
97.4k
  if (table_info) {
9263
31.5k
    table = table_info;
9264
65.9k
  } else {
9265
65.9k
    table = tablet->table();
9266
65.9k
  }
9267
9268
  // Update the schema version if it's the latest.
9269
97.4k
  tablet->set_reported_schema_version(table->id(), version);
9270
97.4k
  VLOG_WITH_PREFIX_AND_FUNC(1)
9271
156
      << "Tablet " << tablet->tablet_id() << " reported version " << version;
9272
9273
  // Verify if it's the last tablet report, and the alter completed.
9274
97.4k
  {
9275
97.4k
    auto l = table->LockForRead();
9276
97.4k
    if (l->pb.state() != SysTablesEntryPB::ALTERING) {
9277
69.7k
      VLOG_WITH_PREFIX_AND_FUNC(2) << "Table " << table->ToString() << " is not altering";
9278
69.7k
      return Status::OK();
9279
69.7k
    }
9280
9281
27.6k
    uint32_t current_version = l->pb.version();
9282
27.6k
    if (table->IsAlterInProgress(current_version)) {
9283
17.1k
      VLOG_WITH_PREFIX_AND_FUNC(2) << "Table " << table->ToString() << " has IsAlterInProgress ("
9284
0
                                   << current_version << ")";
9285
17.1k
      return Status::OK();
9286
17.1k
    }
9287
27.6k
  }
9288
9289
10.5k
  return MultiStageAlterTable::LaunchNextTableInfoVersionIfNecessary(this, table, version);
9290
27.6k
}
9291
9292
Status CatalogManager::ProcessPendingAssignmentsPerTable(
9293
21.4k
    const TableId& table_id, const TabletInfos& tablets, CMGlobalLoadState* global_load_state) {
9294
21.4k
  VLOG(1) << "Processing pending assignments";
9295
9296
21.4k
  TSDescriptorVector ts_descs = GetAllLiveNotBlacklistedTServers();
9297
9298
  // Initialize this table load state.
9299
21.4k
  CMPerTableLoadState table_load_state(global_load_state);
9300
21.4k
  InitializeTableLoadState(table_id, ts_descs, &table_load_state);
9301
21.4k
  table_load_state.SortLoad();
9302
9303
  // Take write locks on all tablets to be processed, and ensure that they are
9304
  // unlocked at the end of this scope.
9305
86.9k
  for (const scoped_refptr<TabletInfo>& tablet : tablets) {
9306
86.9k
    tablet->mutable_metadata()->StartMutation();
9307
86.9k
  }
9308
21.4k
  ScopedInfoCommitter<TabletInfo> unlocker_in(&tablets);
9309
9310
  // Any tablets created by the helper functions will also be created in a
9311
  // locked state, so we must ensure they are unlocked before we return to
9312
  // avoid deadlocks.
9313
21.4k
  TabletInfos new_tablets;
9314
21.4k
  ScopedInfoCommitter<TabletInfo> unlocker_out(&new_tablets);
9315
9316
21.4k
  DeferredAssignmentActions deferred;
9317
9318
  // Iterate over each of the tablets and handle it, whatever state
9319
  // it may be in. The actions required for the tablet are collected
9320
  // into 'deferred'.
9321
86.9k
  for (const scoped_refptr<TabletInfo>& tablet : tablets) {
9322
86.9k
    SysTabletsEntryPB::State t_state = tablet->metadata().state().pb.state();
9323
9324
86.9k
    switch (t_state) {
9325
48.5k
      case SysTabletsEntryPB::PREPARING:
9326
48.5k
        HandleAssignPreparingTablet(tablet.get(), &deferred);
9327
48.5k
        break;
9328
9329
38.3k
      case SysTabletsEntryPB::CREATING:
9330
38.3k
        HandleAssignCreatingTablet(tablet.get(), &deferred, &new_tablets);
9331
38.3k
        break;
9332
9333
144
      default:
9334
144
        VLOG(2) << "Nothing to do for tablet " << tablet->tablet_id() << ": state = "
9335
0
                << SysTabletsEntryPB_State_Name(t_state);
9336
144
        break;
9337
86.9k
    }
9338
86.9k
  }
9339
9340
  // Nothing to do.
9341
21.4k
  if (deferred.modified_tablets.empty() &&
9342
21.4k
      deferred.needs_create_rpc.empty()) {
9343
13.1k
    return Status::OK();
9344
13.1k
  }
9345
9346
  // For those tablets which need to be created in this round, assign replicas.
9347
8.30k
  Status s;
9348
8.30k
  std::unordered_set<TableInfo*> ok_status_tables;
9349
48.4k
  for (TabletInfo *tablet : deferred.needs_create_rpc) {
9350
    // NOTE: if we fail to select replicas on the first pass (due to
9351
    // insufficient Tablet Servers being online), we will still try
9352
    // again unless the tablet/table creation is cancelled.
9353
48.4k
    LOG(INFO) << "Selecting replicas for tablet " << tablet->id();
9354
48.4k
    s = SelectReplicasForTablet(ts_descs, tablet, &table_load_state, global_load_state);
9355
48.4k
    if (!s.ok()) {
9356
21
      s = s.CloneAndPrepend(Substitute(
9357
21
          "An error occurred while selecting replicas for tablet $0: $1",
9358
21
          tablet->tablet_id(), s.ToString()));
9359
21
      tablet->table()->SetCreateTableErrorStatus(s);
9360
21
      break;
9361
48.4k
    } else {
9362
48.4k
      ok_status_tables.emplace(tablet->table().get());
9363
48.4k
    }
9364
48.4k
  }
9365
9366
  // Update the sys catalog with the new set of tablets/metadata.
9367
8.30k
  if (s.ok()) {
9368
    // If any of the ok_status_tables had an error in the previous iterations, we
9369
    // need to clear up the error status to reflect that all the create tablets have now
9370
    // succeeded.
9371
8.28k
    for (TableInfo* table : ok_status_tables) {
9372
8.28k
      table->SetCreateTableErrorStatus(Status::OK());
9373
8.28k
    }
9374
9375
8.28k
    s = sys_catalog_->Upsert(leader_ready_term(), deferred.modified_tablets);
9376
8.28k
    if (!s.ok()) {
9377
1
      s = s.CloneAndPrepend("An error occurred while persisting the updated tablet metadata");
9378
1
    }
9379
8.28k
  }
9380
9381
8.30k
  if (!s.ok()) {
9382
22
    LOG(WARNING) << "Aborting the current task due to error: " << s.ToString();
9383
    // If there was an error, abort any mutations started by the current task.
9384
    // NOTE: Lock order should be lock_ -> table -> tablet.
9385
    // We currently have a bunch of tablets locked and need to unlock first to ensure this holds.
9386
9387
22
    std::sort(new_tablets.begin(), new_tablets.end(), [](const auto& lhs, const auto& rhs) {
9388
0
      return lhs->table().get() < rhs->table().get();
9389
0
    });
9390
22
    {
9391
22
      std::string current_table_name;
9392
22
      TableInfoPtr current_table;
9393
22
      for (auto& tablet_to_remove : new_tablets) {
9394
0
        if (tablet_to_remove->table()->RemoveTablet(tablet_to_remove->tablet_id())) {
9395
0
          if (VLOG_IS_ON(1)) {
9396
0
            if (current_table != tablet_to_remove->table()) {
9397
0
              current_table = tablet_to_remove->table();
9398
0
              current_table_name = current_table->name();
9399
0
            }
9400
0
            LOG(INFO) << "Removed tablet " << tablet_to_remove->tablet_id() << " from table "
9401
0
                      << current_table_name;
9402
0
          }
9403
0
        }
9404
0
      }
9405
22
    }
9406
9407
22
    unlocker_out.Abort();  // tablet.unlock
9408
22
    unlocker_in.Abort();
9409
9410
22
    {
9411
22
      LockGuard lock(mutex_); // lock_.lock
9412
22
      auto tablet_map_checkout = tablet_map_.CheckOut();
9413
22
      for (auto& tablet_to_remove : new_tablets) {
9414
        // Potential race condition above, but it's okay if a background thread deleted this.
9415
0
        tablet_map_checkout->erase(tablet_to_remove->tablet_id());
9416
0
      }
9417
22
    }
9418
22
    return s;
9419
22
  }
9420
9421
  // Send DeleteTablet requests to tablet servers serving deleted tablets.
9422
  // This is asynchronous / non-blocking.
9423
48.4k
  for (auto* tablet : deferred.modified_tablets) {
9424
48.4k
    if (tablet->metadata().dirty().is_deleted()) {
9425
      // Actual delete, because we delete tablet replica.
9426
10
      DeleteTabletReplicas(tablet, tablet->metadata().dirty().pb.state_msg(), HideOnly::kFalse);
9427
10
    }
9428
48.4k
  }
9429
  // Send the CreateTablet() requests to the servers. This is asynchronous / non-blocking.
9430
8.28k
  return SendCreateTabletRequests(deferred.needs_create_rpc);
9431
8.30k
}
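// Failure handling above follows a two-phase pattern: all tablet mutations are
// staged under ScopedInfoCommitter guards, replica selection and the bulk
// sys-catalog Upsert run while the locks are held, and on any error the guards
// Abort() (respecting the lock_ -> table -> tablet lock order), newly created
// replacement tablets are unlinked from their tables and erased from
// tablet_map_, and the error is returned. Only on success are the DeleteTablet
// and CreateTablet RPCs sent, both asynchronously.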
9432
9433
Status CatalogManager::SelectReplicasForTablet(
9434
    const TSDescriptorVector& ts_descs, TabletInfo* tablet,
9435
48.4k
    CMPerTableLoadState* per_table_state, CMGlobalLoadState* global_state) {
9436
48.4k
  auto table_guard = tablet->table()->LockForRead();
9437
9438
48.4k
  if (!table_guard->pb.IsInitialized()) {
9439
0
    return STATUS_SUBSTITUTE(InvalidArgument,
9440
0
        "TableInfo for tablet $0 is not initialized (aborted CreateTable attempt?)",
9441
0
        tablet->tablet_id());
9442
0
  }
9443
9444
48.4k
  const auto& replication_info =
9445
48.4k
    VERIFY_RESULT(GetTableReplicationInfo(table_guard->pb.replication_info(),
9446
48.4k
          tablet->table()->TablespaceIdForTableCreation()));
9447
9448
  // Select the set of replicas for the tablet.
9449
0
  ConsensusStatePB* cstate = tablet->mutable_metadata()->mutable_dirty()
9450
48.4k
          ->pb.mutable_committed_consensus_state();
9451
48.4k
  VLOG_WITH_FUNC(3) << "Committed consensus state: " << AsString(cstate);
9452
48.4k
  cstate->set_current_term(kMinimumTerm);
9453
48.4k
  consensus::RaftConfigPB *config = cstate->mutable_config();
9454
48.4k
  config->set_opid_index(consensus::kInvalidOpIdIndex);
9455
9456
48.4k
  Status s = HandlePlacementUsingReplicationInfo(
9457
48.4k
      replication_info, ts_descs, config, per_table_state, global_state);
9458
48.4k
  if (!s.ok()) {
9459
21
    return s;
9460
21
  }
9461
9462
48.4k
  std::ostringstream out;
9463
48.4k
  out << "Initial tserver uuids for tablet " << tablet->tablet_id() << ": ";
9464
140k
  for (const RaftPeerPB& peer : config->peers()) {
9465
140k
    out << peer.permanent_uuid() << " ";
9466
140k
  }
9467
9468
48.4k
  if (VLOG_IS_ON(0)) {
9469
48.4k
    out.str();
9470
48.4k
  }
9471
9472
48.4k
  VLOG_WITH_FUNC(3) << "Committed consensus state has been updated to: " << AsString(cstate);
9473
9474
48.4k
  return Status::OK();
9475
48.4k
}
9476
9477
void CatalogManager::GetTsDescsFromPlacementInfo(const PlacementInfoPB& placement_info,
9478
                                                 const TSDescriptorVector& all_ts_descs,
9479
105k
                                                 TSDescriptorVector* ts_descs) {
9480
105k
  ts_descs->clear();
9481
312k
  for (const auto& ts_desc : all_ts_descs) {
9482
312k
    if (placement_info.has_placement_uuid()) {
9483
5.47k
      string placement_uuid = placement_info.placement_uuid();
9484
5.47k
      if (ts_desc->placement_uuid() == placement_uuid) {
9485
3.51k
        ts_descs->push_back(ts_desc);
9486
3.51k
      }
9487
306k
    } else if (ts_desc->placement_uuid() == "") {
9488
      // Since the placement info has no placement uuid, it refers to the live cluster, so we add
      // this ts.
9489
306k
      ts_descs->push_back(ts_desc);
9490
306k
    }
9491
312k
  }
9492
105k
}
9493
9494
Status CatalogManager::HandlePlacementUsingReplicationInfo(
9495
    const ReplicationInfoPB& replication_info,
9496
    const TSDescriptorVector& all_ts_descs,
9497
    consensus::RaftConfigPB* config,
9498
    CMPerTableLoadState* per_table_state,
9499
48.4k
    CMGlobalLoadState* global_state) {
9500
  // Validate if we have enough tservers to put the replicas.
9501
48.4k
  ValidateReplicationInfoRequestPB req;
9502
48.4k
  req.mutable_replication_info()->CopyFrom(replication_info);
9503
48.4k
  ValidateReplicationInfoResponsePB resp;
9504
48.4k
  RETURN_NOT_OK(ValidateReplicationInfo(&req, &resp));
9505
9506
48.4k
  TSDescriptorVector ts_descs;
9507
48.4k
  GetTsDescsFromPlacementInfo(replication_info.live_replicas(), all_ts_descs, &ts_descs);
9508
48.4k
  RETURN_NOT_OK(HandlePlacementUsingPlacementInfo(
9509
48.4k
      replication_info.live_replicas(), ts_descs, PeerMemberType::VOTER,
9510
48.4k
      config, per_table_state, global_state));
9511
48.5k
  for (int i = 0; i < replication_info.read_replicas_size(); i++) {
9512
148
    GetTsDescsFromPlacementInfo(replication_info.read_replicas(i), all_ts_descs, &ts_descs);
9513
148
    RETURN_NOT_OK(HandlePlacementUsingPlacementInfo(
9514
148
        replication_info.read_replicas(i), ts_descs, PeerMemberType::OBSERVER,
9515
148
        config, per_table_state, global_state));
9516
148
  }
9517
48.4k
  return Status::OK();
9518
48.4k
}
9519
9520
Status CatalogManager::HandlePlacementUsingPlacementInfo(const PlacementInfoPB& placement_info,
9521
                                                         const TSDescriptorVector& ts_descs,
9522
                                                         PeerMemberType member_type,
9523
                                                         consensus::RaftConfigPB* config,
9524
                                                         CMPerTableLoadState* per_table_state,
9525
48.5k
                                                         CMGlobalLoadState* global_state) {
9526
48.5k
  size_t nreplicas = GetNumReplicasFromPlacementInfo(placement_info);
9527
48.5k
  size_t ntservers = ts_descs.size();
9528
  // Keep track of servers we've already selected, so that we don't attempt to
9529
  // put two replicas on the same host.
9530
48.5k
  set<TabletServerId> already_selected_ts;
9531
48.5k
  if (placement_info.placement_blocks().empty()) {
9532
    // If we don't have placement info, just place the replicas as before, distributed across the
9533
    // whole cluster.
9534
    // We cannot put more than ntservers replicas.
9535
47.7k
    nreplicas = min(nreplicas, ntservers);
9536
47.7k
    SelectReplicas(ts_descs, nreplicas, config, &already_selected_ts, member_type,
9537
47.7k
                   per_table_state, global_state);
9538
47.7k
  } else {
9539
    // TODO(bogdan): move to separate function
9540
    //
9541
    // If we do have placement info, we'll try to use the same power of two algorithm, but also
9542
    // match the requested policies. We'll assign the minimum requested replicas in each combination
9543
    // of cloud.region.zone and then if we still have leftover replicas, we'll assign those
9544
    // in any of the allowed areas.
9545
799
    auto all_allowed_ts = VERIFY_RESULT(FindTServersForPlacementInfo(placement_info, ts_descs));
9546
9547
    // Loop through placements and assign to respective available TSs.
9548
0
    size_t min_replica_count_sum = 0;
9549
1.05k
    for (const auto& pb : placement_info.placement_blocks()) {
9550
      // This works because currently we don't allow placement blocks to overlap.
9551
1.05k
      auto available_ts_descs = VERIFY_RESULT(FindTServersForPlacementBlock(pb, ts_descs));
9552
0
      size_t available_ts_descs_size = available_ts_descs.size();
9553
1.05k
      size_t min_num_replicas = pb.min_num_replicas();
9554
      // We cannot put more than the available tablet servers in that placement block.
9555
1.05k
      size_t num_replicas = min(min_num_replicas, available_ts_descs_size);
9556
1.05k
      min_replica_count_sum += min_num_replicas;
9557
1.05k
      SelectReplicas(available_ts_descs, num_replicas, config, &already_selected_ts, member_type,
9558
1.05k
                     per_table_state, global_state);
9559
1.05k
    }
9560
9561
799
    size_t replicas_left = nreplicas - min_replica_count_sum;
9562
799
    size_t max_tservers_left = all_allowed_ts.size() - already_selected_ts.size();
9563
    // Upper bounded by the tservers left.
9564
799
    replicas_left = min(replicas_left, max_tservers_left);
9565
799
    DCHECK_GE(replicas_left, 0);
9566
799
    if (replicas_left > 0) {
9567
      // No need to do an extra check here, as we checked early if we have enough to cover all
9568
      // requested placements and checked individually per placement info, if we could cover the
9569
      // minimums.
9570
12
      SelectReplicas(all_allowed_ts, replicas_left, config, &already_selected_ts, member_type,
9571
12
                     per_table_state, global_state);
9572
12
    }
9573
799
  }
9574
48.5k
  return Status::OK();
9575
48.5k
}
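// A worked example of the placement logic above, assuming RF = 5 and two
// placement blocks A (min_num_replicas = 2) and B (min_num_replicas = 2):
// each block first receives its minimum (4 replicas total, capped by the
// tservers actually available in that block), leaving replicas_left = 5 - 4 = 1,
// which is then placed on any not-yet-selected tserver matching one of the
// blocks. With no placement blocks at all, nreplicas is simply capped at the
// tserver count and the replicas are spread across the whole cluster.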
9576
9577
Result<vector<shared_ptr<TSDescriptor>>> CatalogManager::FindTServersForPlacementInfo(
9578
    const PlacementInfoPB& placement_info,
9579
1.67k
    const TSDescriptorVector& ts_descs) const {
9580
9581
1.67k
  vector<shared_ptr<TSDescriptor>> all_allowed_ts;
9582
6.38k
  for (const auto& ts : ts_descs) {
9583
7.84k
    for (const auto& pb : placement_info.placement_blocks()) {
9584
7.84k
      if (ts->MatchesCloudInfo(pb.cloud_info())) {
9585
3.32k
        all_allowed_ts.push_back(ts);
9586
3.32k
        break;
9587
3.32k
      }
9588
7.84k
    }
9589
6.38k
  }
9590
9591
1.67k
  return all_allowed_ts;
9592
1.67k
}
9593
9594
Result<vector<shared_ptr<TSDescriptor>>> CatalogManager::FindTServersForPlacementBlock(
9595
    const PlacementBlockPB& placement_block,
9596
2.19k
    const TSDescriptorVector& ts_descs) {
9597
9598
2.19k
  vector<shared_ptr<TSDescriptor>> allowed_ts;
9599
2.19k
  const auto& cloud_info = placement_block.cloud_info();
9600
8.85k
  for (const auto& ts : ts_descs) {
9601
8.85k
    if (ts->MatchesCloudInfo(cloud_info)) {
9602
3.29k
      allowed_ts.push_back(ts);
9603
3.29k
    }
9604
8.85k
  }
9605
9606
2.19k
  return allowed_ts;
9607
2.19k
}
9608
9609
8.27k
Status CatalogManager::SendCreateTabletRequests(const vector<TabletInfo*>& tablets) {
9610
8.27k
  auto schedules_to_tablets_map = VERIFY_RESULT(MakeSnapshotSchedulesToObjectIdsMap(
9611
8.27k
      SysRowEntryType::TABLET));
9612
48.3k
  for (TabletInfo *tablet : tablets) {
9613
48.3k
    const consensus::RaftConfigPB& config =
9614
48.3k
        tablet->metadata().dirty().pb.committed_consensus_state().config();
9615
48.3k
    tablet->set_last_update_time(MonoTime::Now());
9616
48.3k
    std::vector<SnapshotScheduleId> schedules;
9617
48.3k
    for (const auto& pair : schedules_to_tablets_map) {
9618
39
      if (std::binary_search(pair.second.begin(), pair.second.end(), tablet->id())) {
9619
24
        schedules.push_back(pair.first);
9620
24
      }
9621
39
    }
9622
140k
    for (const RaftPeerPB& peer : config.peers()) {
9623
140k
      auto task = std::make_shared<AsyncCreateReplica>(master_, AsyncTaskPool(),
9624
140k
          peer.permanent_uuid(), tablet, schedules);
9625
140k
      tablet->table()->AddTask(task);
9626
140k
      WARN_NOT_OK(ScheduleTask(task), "Failed to send new tablet request");
9627
140k
    }
9628
48.3k
  }
9629
9630
8.27k
  return Status::OK();
9631
8.27k
}
9632
9633
// If responses have been received from sufficient replicas (including hinted leader),
9634
// pick proposed leader and start election.
9635
void CatalogManager::StartElectionIfReady(
9636
153k
    const consensus::ConsensusStatePB& cstate, TabletInfo* tablet) {
9637
153k
  auto replicas = tablet->GetReplicaLocations();
9638
153k
  int num_voters = 0;
9639
459k
  for (const auto& peer : cstate.config().peers()) {
9640
459k
    if (peer.member_type() == PeerMemberType::VOTER) {
9641
457k
      ++num_voters;
9642
457k
    }
9643
459k
  }
9644
153k
  int majority_size = num_voters / 2 + 1;
9645
153k
  int running_voters = 0;
9646
459k
  for (const auto& replica : *replicas) {
9647
459k
    if (replica.second.member_type == PeerMemberType::VOTER) {
9648
457k
      ++running_voters;
9649
457k
    }
9650
459k
  }
9651
9652
153k
  VLOG_WITH_PREFIX(4)
9653
0
      << __func__ << ": T " << tablet->tablet_id() << ": " << AsString(*replicas) << ", voters: "
9654
0
      << running_voters << "/" << majority_size;
9655
9656
153k
  if (running_voters < majority_size) {
9657
0
    VLOG_WITH_PREFIX(4) << __func__ << ": Not enough voters";
9658
0
    return;
9659
0
  }
9660
9661
153k
  ReplicationInfoPB replication_info;
9662
153k
  {
9663
153k
    auto l = ClusterConfig()->LockForRead();
9664
153k
    replication_info = l->pb.replication_info();
9665
153k
  }
9666
9667
  // Find tservers that can be leaders for a tablet.
9668
153k
  TSDescriptorVector ts_descs = GetAllLiveNotBlacklistedTServers();
9669
9670
153k
  std::vector<std::string> possible_leaders;
9671
459k
  for (const auto& replica : *replicas) {
9672
935k
    for (const auto& ts_desc : ts_descs) {
9673
935k
      if (ts_desc->permanent_uuid() == replica.first) {
9674
459k
        if (ts_desc->IsAcceptingLeaderLoad(replication_info)) {
9675
457k
          possible_leaders.push_back(replica.first);
9676
457k
        }
9677
459k
        break;
9678
459k
      }
9679
935k
    }
9680
459k
  }
9681
9682
153k
  if (FLAGS_TEST_create_table_leader_hint_min_lexicographic) {
9683
6
    std::string min_lexicographic;
9684
18
    for (const auto& peer : cstate.config().peers()) {
9685
18
      if (peer.member_type() == PeerMemberType::VOTER) {
9686
18
        if (min_lexicographic.empty() || peer.permanent_uuid() < min_lexicographic) {
9687
6
          min_lexicographic = peer.permanent_uuid();
9688
6
        }
9689
18
      }
9690
18
    }
9691
6
    if (min_lexicographic.empty() || !replicas->count(min_lexicographic)) {
9692
0
      LOG_WITH_PREFIX(INFO)
9693
0
          << __func__ << ": Min lexicographic is not yet ready: " << min_lexicographic;
9694
0
      return;
9695
0
    }
9696
6
    possible_leaders = { min_lexicographic };
9697
6
  }
9698
9699
153k
  if (possible_leaders.empty()) {
9700
124
    VLOG_WITH_PREFIX(4) << __func__ << ": Cannot pick candidate";
9701
124
    return;
9702
124
  }
9703
9704
153k
  if (!tablet->InitiateElection()) {
9705
106k
    VLOG_WITH_PREFIX(4) << __func__ << ": Already initiated";
9706
106k
    return;
9707
106k
  }
9708
9709
47.8k
  const auto& protege = RandomElement(possible_leaders);
9710
9711
47.8k
  LOG_WITH_PREFIX(INFO)
9712
47.8k
      << "Starting election at " << tablet->tablet_id() << " in favor of " << protege;
9713
9714
47.8k
  auto task = std::make_shared<AsyncStartElection>(master_, AsyncTaskPool(), protege, tablet);
9715
47.8k
  tablet->table()->AddTask(task);
9716
47.8k
  WARN_NOT_OK(task->Run(), "Failed to send new tablet start election request");
9717
47.8k
}
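// Majority arithmetic used above: majority_size = num_voters / 2 + 1 with
// integer division, e.g. 3 voters -> 2, 4 -> 3, 5 -> 3. An election is started
// only once at least a majority of voters are running, at least one replica is
// accepting leader load (or the test hint picked one), and
// tablet->InitiateElection() has not already fired for this tablet; the
// protege is then chosen at random (RandomElement) from the candidates.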
9718
9719
shared_ptr<TSDescriptor> CatalogManager::SelectReplica(
9720
    const TSDescriptorVector& ts_descs,
9721
    set<TabletServerId>* excluded,
9722
140k
    CMPerTableLoadState* per_table_state, CMGlobalLoadState* global_state) {
9723
140k
  shared_ptr<TSDescriptor> found_ts;
9724
144k
  for (const auto& sorted_load : per_table_state->sorted_load_) {
9725
    // Don't consider a tserver that has already been considered for this tablet.
9726
144k
    if (excluded->count(sorted_load)) {
9727
3
      continue;
9728
3
    }
9729
    // Only choose from the set of allowed tservers for this tablet.
9730
290k
    
auto it = std::find_if(ts_descs.begin(), ts_descs.end(), [&sorted_load](const auto& ts) 144k
{
9731
290k
      return ts->permanent_uuid() == sorted_load;
9732
290k
    });
9733
9734
144k
    if (it != ts_descs.end()) {
9735
140k
      found_ts = *it;
9736
140k
      break;
9737
140k
    }
9738
144k
  }
9739
9740
140k
  return found_ts;
9741
140k
}
9742
9743
void CatalogManager::SelectReplicas(
9744
    const TSDescriptorVector& ts_descs, size_t nreplicas, consensus::RaftConfigPB* config,
9745
    set<TabletServerId>* already_selected_ts, PeerMemberType member_type,
9746
48.8k
    CMPerTableLoadState* per_table_state, CMGlobalLoadState* global_state) {
9747
48.8k
  DCHECK_LE(nreplicas, ts_descs.size());
9748
9749
189k
  for (size_t i = 0; i < nreplicas; ++i) {
9750
140k
    shared_ptr<TSDescriptor> ts = SelectReplica(
9751
140k
        ts_descs, already_selected_ts, per_table_state, global_state);
9752
140k
    InsertOrDie(already_selected_ts, ts->permanent_uuid());
9753
    // Update the load state at global and table level.
9754
140k
    per_table_state->per_ts_load_[ts->permanent_uuid()]++;
9755
140k
    global_state->per_ts_load_[ts->permanent_uuid()]++;
9756
140k
    per_table_state->SortLoad();
9757
9758
    // Increment the number of pending replicas so that we take this selection into
9759
    // account when assigning replicas for other tablets of the same table. This
9760
    // value decays back to 0 over time.
9761
140k
    ts->IncrementRecentReplicaCreations();
9762
9763
140k
    TSRegistrationPB reg = ts->GetRegistration();
9764
9765
140k
    RaftPeerPB *peer = config->add_peers();
9766
140k
    peer->set_permanent_uuid(ts->permanent_uuid());
9767
9768
    // TODO: This is temporary, we will use only UUIDs.
9769
140k
    TakeRegistration(reg.mutable_common(), peer);
9770
140k
    peer->set_member_type(member_type);
9771
140k
  }
9772
48.8k
}
9773
9774
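SelectReplicas then repeats that pick once per requested replica, and after each pick it records the choice, bumps the chosen tserver's per-table and global load counters, and re-sorts the load list, so subsequent picks naturally drift toward less-loaded tservers. A toy version of that feedback loop, under the same simplified stand-ins as the sketch above:

#include <algorithm>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

int main() {
  std::map<std::string, int> load = {{"ts-1", 0}, {"ts-2", 0}, {"ts-3", 0}};
  std::vector<std::string> sorted = {"ts-1", "ts-2", "ts-3"};
  std::set<std::string> already_selected;

  const size_t nreplicas = 3;
  for (size_t i = 0; i < nreplicas; ++i) {
    // Take the first not-yet-selected entry in the load-sorted list.
    std::string pick;
    for (const auto& ts : sorted) {
      if (!already_selected.count(ts)) { pick = ts; break; }
    }
    already_selected.insert(pick);
    ++load[pick];
    // Re-sort so later picks (including for other tablets) see updated load.
    std::sort(sorted.begin(), sorted.end(), [&load](const auto& a, const auto& b) {
      return load.at(a) < load.at(b);
    });
    std::cout << "replica " << i << " -> " << pick << "\n";
  }
}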
Status CatalogManager::ConsensusStateToTabletLocations(const consensus::ConsensusStatePB& cstate,
9775
216k
                                                       TabletLocationsPB* locs_pb) {
9776
559k
  for (const consensus::RaftPeerPB& peer : cstate.config().peers()) {
9777
559k
    TabletLocationsPB_ReplicaPB* replica_pb = locs_pb->add_replicas();
9778
559k
    if (!peer.has_permanent_uuid()) {
9779
0
      return STATUS_SUBSTITUTE(IllegalState, "Missing UUID $0", peer.ShortDebugString());
9780
0
    }
9781
559k
    replica_pb->set_role(GetConsensusRole(peer.permanent_uuid(), cstate));
9782
559k
    if (peer.has_member_type()) {
9783
559k
      replica_pb->set_member_type(peer.member_type());
9784
18.4E
    } else {
9785
18.4E
      replica_pb->set_member_type(PeerMemberType::UNKNOWN_MEMBER_TYPE);
9786
18.4E
    }
9787
559k
    TSInfoPB* tsinfo_pb = replica_pb->mutable_ts_info();
9788
559k
    tsinfo_pb->set_permanent_uuid(peer.permanent_uuid());
9789
559k
    CopyRegistration(peer, tsinfo_pb);
9790
559k
  }
9791
216k
  return Status::OK();
9792
216k
}
9793
9794
Status CatalogManager::BuildLocationsForTablet(const scoped_refptr<TabletInfo>& tablet,
9795
                                               TabletLocationsPB* locs_pb,
9796
516k
                                               IncludeInactive include_inactive) {
9797
516k
  {
9798
516k
    auto l_tablet = tablet->LockForRead();
9799
516k
    if (l_tablet->is_hidden() && !include_inactive) {
9800
0
      return STATUS_FORMAT(NotFound, "Tablet hidden", tablet->id());
9801
0
    }
9802
516k
    locs_pb->set_table_id(l_tablet->pb.table_id());
9803
516k
    *locs_pb->mutable_table_ids() = l_tablet->pb.table_ids();
9804
516k
  }
9805
9806
  // For system tables, the set of replicas is always the set of masters.
9807
516k
  if (system_tablets_.find(tablet->id()) != system_tablets_.end()) {
9808
216k
    consensus::ConsensusStatePB master_consensus;
9809
216k
    RETURN_NOT_OK(GetCurrentConfig(&master_consensus));
9810
216k
    locs_pb->set_tablet_id(tablet->tablet_id());
9811
216k
    locs_pb->set_stale(false);
9812
216k
    const auto initial_size = locs_pb->replicas_size();
9813
216k
    RETURN_NOT_OK(ConsensusStateToTabletLocations(master_consensus, locs_pb));
9814
216k
    const auto capabilities = Capabilities();
9815
    // Set capabilities of master node for all newly created system table locations.
9816
216k
    for (auto i = locs_pb->mutable_replicas()->begin() + initial_size,
9817
775k
        end = locs_pb->mutable_replicas()->end(); i != end; ++i) {
9818
558k
      *i->mutable_ts_info()->mutable_capabilities() = google::protobuf::RepeatedField<CapabilityId>(
9819
558k
          capabilities.begin(), capabilities.end());
9820
558k
    }
9821
216k
    return Status::OK();
9822
216k
  }
9823
9824
300k
  TSRegistrationPB reg;
9825
9826
300k
  std::shared_ptr<const TabletReplicaMap> locs;
9827
300k
  consensus::ConsensusStatePB cstate;
9828
300k
  {
9829
300k
    auto l_tablet = tablet->LockForRead();
9830
300k
    if (PREDICT_FALSE(l_tablet->is_deleted())) {
9831
358
      std::vector<TabletId> split_tablet_ids;
9832
358
      for (const auto& split_tablet_id : l_tablet->pb.split_tablet_ids()) {
9833
4
        split_tablet_ids.push_back(split_tablet_id);
9834
4
      }
9835
358
      return STATUS(
9836
358
          NotFound, "Tablet deleted", l_tablet->pb.state_msg(),
9837
358
          SplitChildTabletIdsData(split_tablet_ids));
9838
358
    }
9839
9840
299k
    if (PREDICT_FALSE(!l_tablet->is_running())) {
9841
9.69k
      return STATUS_FORMAT(ServiceUnavailable, "Tablet $0 not running", tablet->id());
9842
9.69k
    }
9843
9844
290k
    locs = tablet->GetReplicaLocations();
9845
290k
    if (locs->empty() && l_tablet->pb.has_committed_consensus_state()) {
9846
398
      cstate = l_tablet->pb.committed_consensus_state();
9847
398
    }
9848
9849
290k
    const auto& metadata = tablet->metadata().state().pb;
9850
290k
    locs_pb->mutable_partition()->CopyFrom(metadata.partition());
9851
290k
    locs_pb->set_split_depth(metadata.split_depth());
9852
290k
    locs_pb->set_split_parent_tablet_id(metadata.split_parent_tablet_id());
9853
290k
    for (const auto& split_tablet_id : metadata.split_tablet_ids()) {
9854
148
      *locs_pb->add_split_tablet_ids() = split_tablet_id;
9855
148
    }
9856
290k
  }
9857
9858
0
  locs_pb->set_tablet_id(tablet->tablet_id());
9859
290k
  locs_pb->set_stale(locs->empty());
9860
9861
  // If the locations are cached.
9862
290k
  if (!locs->empty()) {
9863
289k
    if (cstate.IsInitialized() &&
9864
289k
            locs->size() != implicit_cast<size_t>(cstate.config().peers_size())) {
9865
0
      LOG(WARNING) << "Cached tablet replicas " << locs->size() << " does not match consensus "
9866
0
                   << cstate.config().peers_size();
9867
0
    }
9868
9869
853k
    for (const auto& replica : *locs) {
9870
853k
      TabletLocationsPB_ReplicaPB* replica_pb = locs_pb->add_replicas();
9871
853k
      replica_pb->set_role(replica.second.role);
9872
853k
      replica_pb->set_member_type(replica.second.member_type);
9873
853k
      auto tsinfo_pb = replica.second.ts_desc->GetTSInformationPB();
9874
9875
853k
      TSInfoPB* out_ts_info = replica_pb->mutable_ts_info();
9876
853k
      out_ts_info->set_permanent_uuid(tsinfo_pb->tserver_instance().permanent_uuid());
9877
853k
      CopyRegistration(tsinfo_pb->registration().common(), out_ts_info);
9878
853k
      out_ts_info->set_placement_uuid(tsinfo_pb->registration().common().placement_uuid());
9879
853k
      *out_ts_info->mutable_capabilities() = tsinfo_pb->registration().capabilities();
9880
853k
    }
9881
289k
    return Status::OK();
9882
289k
  }
9883
9884
  // If the locations were not cached.
9885
  // TODO: Why would this ever happen? See KUDU-759.
9886
480
  if (cstate.IsInitialized()) {
9887
398
    RETURN_NOT_OK(ConsensusStateToTabletLocations(cstate, locs_pb));
9888
398
  }
9889
9890
480
  return Status::OK();
9891
480
}
9892
9893
1.51M
Result<shared_ptr<tablet::AbstractTablet>> CatalogManager::GetSystemTablet(const TabletId& id) {
9894
1.51M
  const auto iter = system_tablets_.find(id);
9895
1.51M
  if (iter == system_tablets_.end()) {
9896
0
    return STATUS_SUBSTITUTE(InvalidArgument, "$0 is not a valid system tablet id", id);
9897
0
  }
9898
1.51M
  return iter->second;
9899
1.51M
}
9900
9901
Status CatalogManager::GetTabletLocations(
9902
19.7k
    const TabletId& tablet_id, TabletLocationsPB* locs_pb, IncludeInactive include_inactive) {
9903
19.7k
  scoped_refptr<TabletInfo> tablet_info;
9904
19.7k
  {
9905
19.7k
    SharedLock lock(mutex_);
9906
19.7k
    if (!FindCopy(*tablet_map_, tablet_id, &tablet_info)) {
9907
3.35k
      return STATUS_SUBSTITUTE(NotFound, "Unknown tablet $0", tablet_id);
9908
3.35k
    }
9909
19.7k
  }
9910
16.3k
  Status s = GetTabletLocations(tablet_info, locs_pb, include_inactive);
9911
9912
16.3k
  auto num_replicas = GetReplicationFactorForTablet(tablet_info);
9913
16.3k
  if (num_replicas.ok() && *num_replicas > 0 &&
9914
16.3k
      implicit_cast<size_t>(locs_pb->replicas().size()) != *num_replicas) {
9915
1.05k
    YB_LOG_EVERY_N_SECS(WARNING, 1)
9916
405
        << "Expected replicas " << num_replicas << " but found "
9917
405
        << locs_pb->replicas().size() << " for tablet " << tablet_info->id() << ": "
9918
405
        << locs_pb->ShortDebugString() << THROTTLE_MSG;
9919
1.05k
  }
9920
16.3k
  return s;
9921
19.7k
}
9922
9923
Status CatalogManager::GetTabletLocations(
9924
    scoped_refptr<TabletInfo> tablet_info,
9925
    TabletLocationsPB* locs_pb,
9926
86.0k
    IncludeInactive include_inactive) {
9927
86.0k
  DCHECK_EQ(locs_pb->replicas().size(), 0);
9928
86.0k
  locs_pb->mutable_replicas()->Clear();
9929
86.0k
  return BuildLocationsForTablet(tablet_info, locs_pb, include_inactive);
9930
86.0k
}
9931
9932
Status CatalogManager::GetTableLocations(
9933
    const GetTableLocationsRequestPB* req,
9934
228k
    GetTableLocationsResponsePB* resp) {
9935
228k
  VLOG(4) << "GetTableLocations: " << req->ShortDebugString();
9936
9937
  // If start-key > end-key, report an error instead of swapping the two,
9938
  // since there is probably something wrong app-side.
9939
228k
  if (req->has_partition_key_start() && req->has_partition_key_end()
9940
228k
      && req->partition_key_start() > req->partition_key_end()) {
9941
1
    return STATUS(InvalidArgument, "start partition key is greater than the end partition key");
9942
1
  }
9943
9944
228k
  if (req->max_returned_locations() <= 0) {
9945
0
    return STATUS(InvalidArgument, "max_returned_locations must be greater than 0");
9946
0
  }
9947
9948
228k
  scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table()));
9949
9950
228k
  if (table->IsCreateInProgress()) {
9951
7.31k
    resp->set_creating(true);
9952
7.31k
  }
9953
9954
228k
  auto l = table->LockForRead();
9955
228k
  RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp));
9956
9957
228k
  vector<scoped_refptr<TabletInfo>> tablets;
9958
228k
  table->GetTabletsInRange(req, &tablets);
9959
9960
228k
  IncludeInactive include_inactive(req->has_include_inactive() && req->include_inactive());
9961
228k
  bool require_tablets_runnings = req->require_tablets_running();
9962
9963
228k
  int expected_live_replicas = 0;
9964
228k
  int expected_read_replicas = 0;
9965
228k
  GetExpectedNumberOfReplicas(&expected_live_replicas, &expected_read_replicas);
9966
366k
  for (const scoped_refptr<TabletInfo>& tablet : tablets) {
9967
366k
    TabletLocationsPB* locs_pb = resp->add_tablet_locations();
9968
366k
    locs_pb->set_expected_live_replicas(expected_live_replicas);
9969
366k
    locs_pb->set_expected_read_replicas(expected_read_replicas);
9970
366k
    auto status = BuildLocationsForTablet(tablet, locs_pb, include_inactive);
9971
366k
    if (!status.ok()) {
9972
      // Not running.
9973
9.67k
      if (require_tablets_runnings) {
9974
9.53k
        resp->mutable_tablet_locations()->Clear();
9975
9.53k
        return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, status);
9976
9.53k
      }
9977
138
      resp->mutable_tablet_locations()->RemoveLast();
9978
138
    }
9979
366k
  }
9980
9981
219k
  resp->set_table_type(l->pb.table_type());
9982
219k
  resp->set_partition_list_version(l->pb.partition_list_version());
9983
9984
219k
  return Status::OK();
9985
228k
}
9986
9987
5.43M
Status CatalogManager::GetCurrentConfig(consensus::ConsensusStatePB* cpb) const {
9988
5.43M
  auto tablet_peer = sys_catalog_->tablet_peer();
9989
5.43M
  auto consensus = tablet_peer ? tablet_peer->shared_consensus() : nullptr;
9990
5.43M
  if (!consensus) {
9991
17.1k
    std::string uuid = master_->fs_manager()->uuid();
9992
17.1k
    return STATUS_FORMAT(IllegalState, "Node $0 peer not initialized.", uuid);
9993
17.1k
  }
9994
9995
5.41M
  *cpb = consensus->ConsensusState(CONSENSUS_CONFIG_COMMITTED);
9996
9997
5.41M
  return Status::OK();
9998
5.43M
}
9999
10000
0
void CatalogManager::DumpState(std::ostream* out, bool on_disk_dump) const {
10001
0
  NamespaceInfoMap namespace_ids_copy;
10002
0
  TableInfoMap ids_copy;
10003
0
  TableInfoByNameMap names_copy;
10004
0
  TabletInfoMap tablets_copy;
10005
10006
  // Copy the internal state so that, if the output stream blocks,
10007
  // we don't end up holding the lock for a long time.
10008
0
  {
10009
0
    SharedLock lock(mutex_);
10010
0
    namespace_ids_copy = namespace_ids_map_;
10011
0
    ids_copy = *table_ids_map_;
10012
0
    names_copy = table_names_map_;
10013
0
    tablets_copy = *tablet_map_;
10014
0
  }
10015
10016
0
  *out << "Dumping current state of master.\nNamespaces:\n";
10017
0
  for (const NamespaceInfoMap::value_type& e : namespace_ids_copy) {
10018
0
    NamespaceInfo* t = e.second.get();
10019
0
    auto l = t->LockForRead();
10020
0
    const NamespaceName& name = l->name();
10021
10022
0
    *out << t->id() << ":\n";
10023
0
    *out << "  name: \"" << strings::CHexEscape(name) << "\"\n";
10024
0
    *out << "  metadata: " << l->pb.ShortDebugString() << "\n";
10025
0
  }
10026
10027
0
  *out << "Tables:\n";
10028
0
  for (const TableInfoMap::value_type& e : ids_copy) {
10029
0
    TableInfo* t = e.second.get();
10030
0
    TabletInfos table_tablets;
10031
0
    {
10032
0
      auto l = t->LockForRead();
10033
0
      const TableName& name = l->name();
10034
0
      const NamespaceId& namespace_id = l->namespace_id();
10035
      // Find namespace by its ID.
10036
0
      scoped_refptr<NamespaceInfo> ns = FindPtrOrNull(namespace_ids_copy, namespace_id);
10037
10038
0
      *out << t->id() << ":\n";
10039
0
      *out << "  namespace id: \"" << strings::CHexEscape(namespace_id) << "\"\n";
10040
10041
0
      if (ns != nullptr) {
10042
0
        *out << "  namespace name: \"" << strings::CHexEscape(ns->name()) << "\"\n";
10043
0
      }
10044
10045
0
      *out << "  name: \"" << strings::CHexEscape(name) << "\"\n";
10046
      // Erase from the map, so later we can check that we don't have
10047
      // any orphaned tables in the by-name map that aren't in the
10048
      // by-id map.
10049
0
      if (names_copy.erase({namespace_id, name}) != 1) {
10050
0
        *out << "  [not present in by-name map]\n";
10051
0
      }
10052
0
      *out << "  metadata: " << l->pb.ShortDebugString() << "\n";
10053
10054
0
      *out << "  tablets:\n";
10055
0
      table_tablets = t->GetTablets();
10056
0
    }
10057
0
    for (const scoped_refptr<TabletInfo>& tablet : table_tablets) {
10058
0
      auto l_tablet = tablet->LockForRead();
10059
0
      *out << "    " << tablet->tablet_id() << ": "
10060
0
           << l_tablet->pb.ShortDebugString() << "\n";
10061
10062
0
      if (tablets_copy.erase(tablet->tablet_id()) != 1) {
10063
0
        *out << "  [ERROR: not present in CM tablet map!]\n";
10064
0
      }
10065
0
    }
10066
0
  }
10067
10068
0
  if (!tablets_copy.empty()) {
10069
0
    *out << "Orphaned tablets (not referenced by any table):\n";
10070
0
    for (const TabletInfoMap::value_type& entry : tablets_copy) {
10071
0
      const scoped_refptr<TabletInfo>& tablet = entry.second;
10072
0
      auto l_tablet = tablet->LockForRead();
10073
0
      *out << "    " << tablet->tablet_id() << ": "
10074
0
           << l_tablet->pb.ShortDebugString() << "\n";
10075
0
    }
10076
0
  }
10077
10078
0
  if (!names_copy.empty()) {
10079
0
    *out << "Orphaned tables (in by-name map, but not id map):\n";
10080
0
    for (const TableInfoByNameMap::value_type& e : names_copy) {
10081
0
      *out << e.second->id() << ":\n";
10082
0
      *out << "  namespace id: \"" << strings::CHexEscape(e.first.first) << "\"\n";
10083
0
      *out << "  name: \"" << CHexEscape(e.first.second) << "\"\n";
10084
0
    }
10085
0
  }
10086
10087
0
  master_->DumpMasterOptionsInfo(out);
10088
10089
0
  if (on_disk_dump) {
10090
0
    consensus::ConsensusStatePB cur_consensus_state;
10091
    // TODO: proper error handling below.
10092
0
    CHECK_OK(GetCurrentConfig(&cur_consensus_state));
10093
0
    *out << "Current raft config: " << cur_consensus_state.ShortDebugString() << "\n";
10094
0
  }
10095
0
}
10096
10097
Status CatalogManager::PeerStateDump(const vector<RaftPeerPB>& peers,
10098
                                     const DumpMasterStateRequestPB* req,
10099
0
                                     DumpMasterStateResponsePB* resp) {
10100
0
  std::unique_ptr<MasterClusterProxy> peer_proxy;
10101
0
  Endpoint sockaddr;
10102
0
  MonoTime timeout = MonoTime::Now();
10103
0
  DumpMasterStateRequestPB peer_req;
10104
0
  rpc::RpcController rpc;
10105
10106
0
  timeout.AddDelta(MonoDelta::FromMilliseconds(FLAGS_master_ts_rpc_timeout_ms));
10107
0
  rpc.set_deadline(timeout);
10108
0
  peer_req.set_on_disk(req->on_disk());
10109
0
  peer_req.set_return_dump_as_string(req->return_dump_as_string());
10110
0
  string dump;
10111
10112
0
  for (const RaftPeerPB& peer : peers) {
10113
0
    HostPort hostport = HostPortFromPB(DesiredHostPort(peer, master_->MakeCloudInfoPB()));
10114
0
    peer_proxy = std::make_unique<MasterClusterProxy>(&master_->proxy_cache(), hostport);
10115
10116
0
    DumpMasterStateResponsePB peer_resp;
10117
0
    rpc.Reset();
10118
10119
0
    RETURN_NOT_OK(peer_proxy->DumpState(peer_req, &peer_resp, &rpc));
10120
10121
0
    if (peer_resp.has_error()) {
10122
0
      LOG(WARNING) << "Hit err " << peer_resp.ShortDebugString() << " during peer "
10123
0
        << peer.ShortDebugString() << " state dump.";
10124
0
      return StatusFromPB(peer_resp.error().status());
10125
0
    } else if (req->return_dump_as_string()) {
10126
0
      dump += peer_resp.dump();
10127
0
    }
10128
0
  }
10129
10130
0
  if (req->return_dump_as_string()) {
10131
0
    resp->set_dump(resp->dump() + dump);
10132
0
  }
10133
0
  return Status::OK();
10134
0
}
10135
10136
1.56M
void CatalogManager::ReportMetrics() {
10137
  // Report metrics on how many tservers are alive.
10138
1.56M
  TSDescriptorVector ts_descs;
10139
1.56M
  master_->ts_manager()->GetAllLiveDescriptors(&ts_descs);
10140
1.56M
  const auto num_live_servers = ts_descs.size();
10141
1.56M
  metric_num_tablet_servers_live_->set_value(narrow_cast<uint32_t>(num_live_servers));
10142
10143
1.56M
  master_->ts_manager()->GetAllDescriptors(&ts_descs);
10144
1.56M
  metric_num_tablet_servers_dead_->set_value(
10145
1.56M
      narrow_cast<uint32_t>(ts_descs.size() - num_live_servers));
10146
1.56M
}
10147
10148
994k
void CatalogManager::ResetMetrics() {
10149
994k
  metric_num_tablet_servers_live_->set_value(0);
10150
994k
  metric_num_tablet_servers_dead_->set_value(0);
10151
994k
}
10152
10153
10154
377k
std::string CatalogManager::LogPrefix() const {
10155
377k
  if (tablet_peer()) {
10156
377k
    return consensus::MakeTabletLogPrefix(
10157
377k
        tablet_peer()->tablet_id(), tablet_peer()->permanent_uuid());
10158
377k
  } else {
10159
58
    return consensus::MakeTabletLogPrefix(
10160
58
        kSysCatalogTabletId, master_->fs_manager()->uuid());
10161
58
  }
10162
377k
}
10163
10164
0
void CatalogManager::SetLoadBalancerEnabled(bool is_enabled) {
10165
0
  load_balance_policy_->SetLoadBalancerEnabled(is_enabled);
10166
0
}
10167
10168
1
bool CatalogManager::IsLoadBalancerEnabled() {
10169
1
  return load_balance_policy_->IsLoadBalancerEnabled();
10170
1
}
10171
10172
1.54M
MonoDelta CatalogManager::TimeSinceElectedLeader() {
10173
1.54M
  return MonoTime::Now() - time_elected_leader_.load();
10174
1.54M
}
10175
10176
37
Status CatalogManager::GoIntoShellMode() {
10177
37
  if (master_->IsShellMode()) {
10178
0
    return STATUS(IllegalState, "Master is already in shell mode.");
10179
0
  }
10180
10181
37
  LOG(INFO) << "Starting going into shell mode.";
10182
37
  master_->SetShellMode(true);
10183
10184
37
  {
10185
37
    LockGuard lock(mutex_);
10186
37
    RETURN_NOT_OK(sys_catalog_->GoIntoShellMode());
10187
37
    background_tasks_->Shutdown();
10188
37
    background_tasks_.reset();
10189
37
  }
10190
0
  {
10191
37
    std::lock_guard<std::mutex> l(remote_bootstrap_mtx_);
10192
37
    tablet_exists_ = false;
10193
37
  }
10194
10195
37
  LOG(INFO) << "Done going into shell mode.";
10196
10197
37
  return Status::OK();
10198
37
}
10199
10200
334
Status CatalogManager::GetClusterConfig(GetMasterClusterConfigResponsePB* resp) {
10201
334
  return GetClusterConfig(resp->mutable_cluster_config());
10202
334
}
10203
10204
7.14M
Status CatalogManager::GetClusterConfig(SysClusterConfigEntryPB* config) {
10205
7.14M
  auto cluster_config = ClusterConfig();
10206
7.14M
  DCHECK(cluster_config) << "Missing cluster config for master!";
10207
7.14M
  auto l = cluster_config->LockForRead();
10208
7.14M
  *config = l->pb;
10209
7.14M
  return Status::OK();
10210
7.14M
}
10211
10212
Status CatalogManager::SetClusterConfig(
10213
172
    const ChangeMasterClusterConfigRequestPB* req, ChangeMasterClusterConfigResponsePB* resp) {
10214
172
  SysClusterConfigEntryPB config(req->cluster_config());
10215
10216
172
  if (config.has_server_blacklist()) {
10217
27
    config.mutable_server_blacklist()->set_initial_replica_load(narrow_cast<int32_t>(
10218
27
        GetNumRelevantReplicas(config.server_blacklist(), false /* leaders_only */)));
10219
27
    LOG(INFO) << Format("Set blacklist of total tservers: $0, with initial load: $1",
10220
27
                    config.server_blacklist().hosts().size(),
10221
27
                    config.server_blacklist().initial_replica_load());
10222
27
  }
10223
172
  if (config.has_leader_blacklist()) {
10224
22
    config.mutable_leader_blacklist()->set_initial_leader_load(narrow_cast<int32_t>(
10225
22
        GetNumRelevantReplicas(config.leader_blacklist(), true /* leaders_only */)));
10226
22
    LOG(INFO) << Format("Set leader blacklist of total tservers: $0, with initial load: $1",
10227
22
                    config.leader_blacklist().hosts().size(),
10228
22
                    config.leader_blacklist().initial_leader_load());
10229
22
  }
10230
10231
172
  auto cluster_config = ClusterConfig();
10232
172
  auto l = cluster_config->LockForWrite();
10233
  // We should only set the config, if the caller provided us with a valid update to the
10234
  // existing config.
10235
172
  if (l->pb.version() != config.version()) {
10236
0
    Status s = STATUS_SUBSTITUTE(IllegalState,
10237
0
      "Config version does not match, got $0, but most recent one is $1. Should call Get again",
10238
0
      config.version(), l->pb.version());
10239
0
    return SetupError(resp->mutable_error(), MasterErrorPB::CONFIG_VERSION_MISMATCH, s);
10240
0
  }
10241
10242
172
  if (config.cluster_uuid() != l->pb.cluster_uuid()) {
10243
1
    Status s = STATUS(InvalidArgument, "Config cluster UUID cannot be updated");
10244
1
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s);
10245
1
  }
10246
10247
  // TODO(bogdan): should this live here?
10248
171
  const ReplicationInfoPB& replication_info = config.replication_info();
10249
188
  for (int i = 0; i < replication_info.read_replicas_size(); i++) {
10250
17
    if (!replication_info.read_replicas(i).has_placement_uuid()) {
10251
0
      Status s = STATUS(IllegalState,
10252
0
                        "All read-only clusters must have a placement uuid specified");
10253
0
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s);
10254
0
    }
10255
17
  }
10256
10257
  // Validate placement information according to rules defined.
10258
171
  if (replication_info.has_live_replicas()) {
10259
134
    Status s = CatalogManagerUtil::IsPlacementInfoValid(replication_info.live_replicas());
10260
134
    if (!s.ok()) {
10261
0
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s);
10262
0
    }
10263
134
  }
10264
10265
171
  l.mutable_data()->pb.CopyFrom(config);
10266
  // Bump the config version, to indicate an update.
10267
171
  l.mutable_data()->pb.set_version(config.version() + 1);
10268
10269
171
  LOG(INFO) << "Updating cluster config to " << config.version() + 1;
10270
10271
171
  RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), cluster_config.get()));
10272
10273
171
  l.Commit();
10274
10275
171
  return Status::OK();
10276
171
}
10277
10278
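SetClusterConfig guards the cluster config with an optimistic-concurrency check: the caller must echo back the version it read, a mismatch is rejected with CONFIG_VERSION_MISMATCH, and a successful write bumps the version. A minimal sketch of that compare-then-bump pattern, with a plain struct standing in for SysClusterConfigEntryPB:

#include <iostream>
#include <string>

struct ClusterConfig {
  int version = 0;
  std::string payload;
};

// Apply the update only if the caller saw the latest version; bump on success.
bool TrySetConfig(ClusterConfig* current, const ClusterConfig& update) {
  if (update.version != current->version) {
    return false;  // Stale read: caller must Get again and retry.
  }
  current->payload = update.payload;
  current->version = update.version + 1;  // Signal that the config changed.
  return true;
}

int main() {
  ClusterConfig cfg{3, "old"};
  std::cout << TrySetConfig(&cfg, {2, "lost update"}) << "\n";  // 0: rejected
  std::cout << TrySetConfig(&cfg, {3, "new"}) << "\n";          // 1: accepted
  std::cout << cfg.version << " " << cfg.payload << "\n";       // 4 new
}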
Status CatalogManager::ValidateReplicationInfo(
10279
56.8k
    const ValidateReplicationInfoRequestPB* req, ValidateReplicationInfoResponsePB* resp) {
10280
56.8k
  TSDescriptorVector all_ts_descs;
10281
56.8k
  {
10282
56.8k
    BlacklistSet blacklist = VERIFY_RESULT(BlacklistSetFromPB());
10283
0
    master_->ts_manager()->GetAllLiveDescriptors(&all_ts_descs, blacklist);
10284
56.8k
  }
10285
  // We don't need any validation checks for read replica placements
10286
  // because they aren't a part of any raft quorum underneath.
10287
  // Technically, it is ok to have even 0 read replica nodes for them upfront.
10288
  // We only need it for the primary cluster replicas.
10289
0
  TSDescriptorVector ts_descs;
10290
56.8k
  GetTsDescsFromPlacementInfo(req->replication_info().live_replicas(), all_ts_descs, &ts_descs);
10291
56.8k
  Status s = CheckValidPlacementInfo(req->replication_info().live_replicas(), all_ts_descs, resp);
10292
56.8k
  if (!s.ok()) {
10293
34
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_TABLE_REPLICATION_INFO, s);
10294
34
  }
10295
56.8k
  return Status::OK();
10296
56.8k
}
10297
10298
Status CatalogManager::SetPreferredZones(
10299
3
    const SetPreferredZonesRequestPB* req, SetPreferredZonesResponsePB* resp) {
10300
3
  auto cluster_config = ClusterConfig();
10301
3
  auto l = cluster_config->LockForWrite();
10302
3
  auto replication_info = l.mutable_data()->pb.mutable_replication_info();
10303
3
  replication_info->clear_affinitized_leaders();
10304
10305
5
  for (const auto& cloud_info : req->preferred_zones()) {
10306
5
    const auto& placement_info = replication_info->live_replicas();
10307
5
    if (!CatalogManagerUtil::DoesPlacementInfoContainCloudInfo(placement_info, cloud_info)) {
10308
0
      Status s = STATUS_FORMAT(InvalidArgument, "Placement info $0 does not contain cloud info $1",
10309
0
                               placement_info, TSDescriptor::generate_placement_id(cloud_info));
10310
0
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s);
10311
0
    }
10312
5
    *replication_info->add_affinitized_leaders() = cloud_info;
10313
5
  }
10314
10315
3
  l.mutable_data()->pb.set_version(l.mutable_data()->pb.version() + 1);
10316
10317
3
  LOG(INFO) << "Updating cluster config to " << l.mutable_data()->pb.version();
10318
10319
3
  Status s = sys_catalog_->Upsert(leader_ready_term(), cluster_config.get());
10320
3
  if (!s.ok()) {
10321
0
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s);
10322
0
  }
10323
10324
3
  l.Commit();
10325
10326
3
  return Status::OK();
10327
3
}
10328
10329
55.7k
Result<size_t> CatalogManager::GetReplicationFactor() {
10330
55.7k
  auto cluster_config = ClusterConfig();
10331
55.7k
  DCHECK(cluster_config) << "Missing cluster config for master!";
10332
55.7k
  auto l = cluster_config->LockForRead();
10333
55.7k
  const ReplicationInfoPB& replication_info = l->pb.replication_info();
10334
55.7k
  return GetNumReplicasFromPlacementInfo(replication_info.live_replicas());
10335
55.7k
}
10336
10337
Result<size_t> CatalogManager::GetReplicationFactorForTablet(
10338
16.3k
    const scoped_refptr<TabletInfo>& tablet) {
10339
  // For system tables, the set of replicas is always the set of masters.
10340
16.3k
  if (system_tablets_.find(tablet->id()) != system_tablets_.end()) {
10341
481
    consensus::ConsensusStatePB master_consensus;
10342
481
    RETURN_NOT_OK(GetCurrentConfig(&master_consensus));
10343
481
    return master_consensus.config().peers().size();
10344
481
  }
10345
15.9k
  int num_live_replicas = 0, num_read_replicas = 0;
10346
15.9k
  GetExpectedNumberOfReplicas(&num_live_replicas, &num_read_replicas);
10347
15.9k
  return num_live_replicas + num_read_replicas;
10348
16.3k
}
10349
10350
259k
void CatalogManager::GetExpectedNumberOfReplicas(int* num_live_replicas, int* num_read_replicas) {
10351
259k
  auto l = ClusterConfig()->LockForRead();
10352
259k
  const ReplicationInfoPB& replication_info = l->pb.replication_info();
10353
259k
  *num_live_replicas = narrow_cast<int>(GetNumReplicasFromPlacementInfo(
10354
259k
      replication_info.live_replicas()));
10355
259k
  for (const auto& read_replica_placement_info : replication_info.read_replicas()) {
10356
1.93k
    *num_read_replicas += read_replica_placement_info.num_replicas();
10357
1.93k
  }
10358
259k
}
10359
10360
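GetExpectedNumberOfReplicas above splits the expectation into the live-replica count from the primary placement plus the sum of num_replicas over every read-replica placement; GetReplicationFactorForTablet adds the two (system tablets instead use the master peer count). A small worked sketch of the arithmetic:

#include <iostream>
#include <vector>

int main() {
  int num_live_replicas = 3;  // From the live_replicas placement info.
  std::vector<int> read_replica_placements = {1, 2};  // num_replicas per read-replica PB.

  int num_read_replicas = 0;
  for (int n : read_replica_placements) num_read_replicas += n;

  // Replication factor reported for a regular (non-system) tablet: 3 + 3 = 6.
  std::cout << num_live_replicas + num_read_replicas << "\n";
}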
6.81k
Result<string> CatalogManager::placement_uuid() const {
10361
6.81k
  auto cluster_config = ClusterConfig();
10362
6.81k
  if (!cluster_config) {
10363
0
    return STATUS(IllegalState, "Missing cluster config for master!");
10364
0
  }
10365
6.81k
  auto l = cluster_config->LockForRead();
10366
6.81k
  const ReplicationInfoPB& replication_info = l->pb.replication_info();
10367
6.81k
  return replication_info.live_replicas().placement_uuid();
10368
6.81k
}
10369
10370
Status CatalogManager::IsLoadBalanced(const IsLoadBalancedRequestPB* req,
10371
239
                                      IsLoadBalancedResponsePB* resp) {
10372
239
  if (req->has_expected_num_servers()) {
10373
238
    TSDescriptorVector ts_descs;
10374
238
    master_->ts_manager()->GetAllLiveDescriptors(&ts_descs);
10375
10376
238
    if (implicit_cast<size_t>(req->expected_num_servers()) > ts_descs.size()) {
10377
9
      Status s = STATUS_SUBSTITUTE(IllegalState,
10378
9
          "Found $0, which is below the expected number of servers $1.",
10379
9
          ts_descs.size(), req->expected_num_servers());
10380
9
      return SetupError(resp->mutable_error(), MasterErrorPB::CAN_RETRY_LOAD_BALANCE_CHECK, s);
10381
9
    }
10382
238
  }
10383
10384
230
  Status s = load_balance_policy_->IsIdle();
10385
230
  if (!s.ok()) {
10386
194
    return SetupError(resp->mutable_error(), MasterErrorPB::CAN_RETRY_LOAD_BALANCE_CHECK, s);
10387
194
  }
10388
10389
36
  return Status::OK();
10390
230
}
10391
10392
Status CatalogManager::IsLoadBalancerIdle(const IsLoadBalancerIdleRequestPB* req,
10393
2.48k
                                          IsLoadBalancerIdleResponsePB* resp) {
10394
2.48k
  Status s = load_balance_policy_->IsIdle();
10395
2.48k
  if (!s.ok()) {
10396
1.98k
    return SetupError(resp->mutable_error(), MasterErrorPB::LOAD_BALANCER_RECENTLY_ACTIVE, s);
10397
1.98k
  }
10398
10399
502
  return Status::OK();
10400
2.48k
}
10401
10402
Status CatalogManager::AreLeadersOnPreferredOnly(const AreLeadersOnPreferredOnlyRequestPB* req,
10403
147
                                                 AreLeadersOnPreferredOnlyResponsePB* resp) {
10404
  // If we have cluster replication info, then only fetch live tservers (ignore read replicas).
10405
147
  TSDescriptorVector ts_descs;
10406
147
  string live_replicas_placement_uuid = "";
10407
147
  {
10408
147
    auto l = ClusterConfig()->LockForRead();
10409
147
    const ReplicationInfoPB& cluster_replication_info = l->pb.replication_info();
10410
147
    if (cluster_replication_info.has_live_replicas()) {
10411
109
      live_replicas_placement_uuid = cluster_replication_info.live_replicas().placement_uuid();
10412
109
    }
10413
147
  }
10414
10415
147
  {
10416
147
    BlacklistSet blacklist = VERIFY_RESULT(BlacklistSetFromPB());
10417
147
    if (live_replicas_placement_uuid.empty()) {
10418
146
      master_->ts_manager()->GetAllLiveDescriptors(&ts_descs, blacklist);
10419
146
    } else {
10420
1
      master_->ts_manager()->GetAllLiveDescriptorsInCluster(
10421
1
          &ts_descs, live_replicas_placement_uuid,
10422
1
          blacklist);
10423
1
    }
10424
147
  }
10425
10426
  // Only need to fetch if txn tables are not using preferred zones.
10427
0
  vector<TableInfoPtr> tables;
10428
147
  if (!FLAGS_transaction_tables_use_preferred_zones) {
10429
147
    tables = master_->catalog_manager()->GetTables(GetTablesMode::kRunning);
10430
147
  }
10431
10432
147
  auto l = ClusterConfig()->LockForRead();
10433
147
  Status s = CatalogManagerUtil::AreLeadersOnPreferredOnly(
10434
147
      ts_descs, l->pb.replication_info(), tables);
10435
147
  if (!s.ok()) {
10436
132
    return SetupError(
10437
132
        resp->mutable_error(), MasterErrorPB::CAN_RETRY_ARE_LEADERS_ON_PREFERRED_ONLY_CHECK, s);
10438
132
  }
10439
10440
15
  return Status::OK();
10441
147
}
10442
10443
2.49k
int64_t CatalogManager::GetNumRelevantReplicas(const BlacklistPB& blacklist, bool leaders_only) {
10444
2.49k
  int64_t res = 0;
10445
2.49k
  SharedLock lock(mutex_);
10446
87.1k
  for (const TabletInfoMap::value_type& entry : *tablet_map_) {
10447
87.1k
    scoped_refptr<TabletInfo> tablet = entry.second;
10448
87.1k
    auto l = tablet->LockForRead();
10449
    // Deliberately not checking whether the tablet is being created, as we do not want the initial load to be under-accounted.
10450
87.1k
    if (!tablet->table() ||
10451
87.1k
        PREDICT_FALSE(l->is_deleted())) {
10452
0
      continue;
10453
0
    }
10454
10455
87.1k
    auto locs = tablet->GetReplicaLocations();
10456
134k
    for (const auto& replica : *locs) {
10457
134k
      if (leaders_only && replica.second.role != PeerRole::LEADER) {
10458
14.6k
        continue;
10459
14.6k
      }
10460
368k
      for (int i = 0; i < blacklist.hosts_size(); i++) {
10461
295k
        if (replica.second.ts_desc->IsRunningOn(blacklist.hosts(i))) {
10462
46.9k
          ++res;
10463
46.9k
          break;
10464
46.9k
        }
10465
295k
      }
10466
119k
    }
10467
87.1k
  }
10468
10469
2.49k
  return res;
10470
2.49k
}
10471
10472
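GetNumRelevantReplicas above walks every tablet's replica map and counts the replicas (or, with leaders_only, just the leaders) running on a blacklisted host; the inner break ensures a replica matching several blacklist entries is counted only once. A reduced sketch of that counting, using a host set in place of the BlacklistPB host list:

#include <cstdint>
#include <iostream>
#include <set>
#include <string>
#include <vector>

struct Replica {
  std::string host;
  bool is_leader;
};

int64_t CountRelevantReplicas(const std::vector<std::vector<Replica>>& tablets,
                              const std::set<std::string>& blacklist,
                              bool leaders_only) {
  int64_t res = 0;
  for (const auto& replicas : tablets) {  // One entry per tablet.
    for (const auto& r : replicas) {
      if (leaders_only && !r.is_leader) continue;
      if (blacklist.count(r.host)) ++res;  // Each replica counts at most once.
    }
  }
  return res;
}

int main() {
  std::vector<std::vector<Replica>> tablets = {
      {{"n1", true}, {"n2", false}, {"n3", false}},
      {{"n2", true}, {"n3", false}, {"n4", false}}};
  std::set<std::string> blacklist = {"n2"};
  std::cout << CountRelevantReplicas(tablets, blacklist, false) << "\n";  // 2
  std::cout << CountRelevantReplicas(tablets, blacklist, true) << "\n";   // 1
}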
Status CatalogManager::FillHeartbeatResponse(const TSHeartbeatRequestPB* req,
10473
0
                                             TSHeartbeatResponsePB* resp) {
10474
0
  return Status::OK();
10475
0
}
10476
10477
2.06k
Status CatalogManager::GetLoadMoveCompletionPercent(GetLoadMovePercentResponsePB* resp) {
10478
2.06k
  return GetLoadMoveCompletionPercent(resp, false);
10479
2.06k
}
10480
10481
388
Status CatalogManager::GetLeaderBlacklistCompletionPercent(GetLoadMovePercentResponsePB* resp) {
10482
388
  return GetLoadMoveCompletionPercent(resp, true);
10483
388
}
10484
10485
Status CatalogManager::GetLoadMoveCompletionPercent(GetLoadMovePercentResponsePB* resp,
10486
2.44k
                                                    bool blacklist_leader) {
10487
2.44k
  auto l = ClusterConfig()->LockForRead();
10488
10489
  // Fine to pass in empty defaults if server_blacklist or leader_blacklist is not filled.
10490
2.44k
  const BlacklistPB& state = blacklist_leader ? l->pb.leader_blacklist() : l->pb.server_blacklist();
10491
2.44k
  int64_t blacklist_replicas = GetNumRelevantReplicas(state, blacklist_leader);
10492
2.44k
  int64_t initial_load = (blacklist_leader) ?
10493
2.06k
                                state.initial_leader_load() : state.initial_replica_load();
10494
  // If we are starting up and don't find any load on the tservers, return progress as 0.
10495
  // We expect this to go away within blacklist_progress_initial_delay_secs; if the
10496
  // load on the blacklisted tservers is still reported as 0 after that time, then
10497
  // the transfer has successfully completed.
10498
2.44k
  if (blacklist_replicas == 0 &&
10499
2.44k
      TimeSinceElectedLeader() <= MonoDelta::FromSeconds(FLAGS_blacklist_progress_initial_delay_secs)) {
10500
970
      LOG(INFO) << "Master leadership has changed. Reporting progress as 0 until the catalog " <<
10501
970
                   "manager gets the correct estimates of the remaining load on the blacklisted" <<
10502
970
                   "tservers.";
10503
970
      resp->set_percent(0);
10504
970
      resp->set_total(initial_load);
10505
970
      resp->set_remaining(initial_load);
10506
970
      return Status::OK();
10507
970
  }
10508
10509
  // On change of master leader, initial_load_ information may be lost temporarily. Reset to
10510
  // current value to avoid reporting progress percent as 100. Note that doing so will report
10511
  // progress percent as 0 instead.
10512
  // TODO(Sanket): This might be no longer relevant after we persist and load the initial load
10513
  // on failover. Need to investigate.
10514
1.47k
  if (initial_load < blacklist_replicas) {
10515
0
    LOG(INFO) << Format("Initial load: $0, current load: $1."
10516
0
              " Initial load is less than the current load. Probably a master leader change."
10517
0
              " Reporting progress as 0", state.initial_replica_load(),
10518
0
              blacklist_replicas);
10519
0
    initial_load = blacklist_replicas;
10520
0
  }
10521
10522
1.47k
  LOG(INFO) << "Blacklisted count " << blacklist_replicas
10523
1.47k
            << " across " << state.hosts_size()
10524
1.47k
            << " servers, with initial load " << initial_load;
10525
10526
  // Case when the blacklisted servers did not have any starting load.
10527
1.47k
  if (initial_load == 0) {
10528
64
    resp->set_percent(100);
10529
64
    return Status::OK();
10530
64
  }
10531
10532
1.41k
  resp->set_percent(
10533
1.41k
      100 - (static_cast<double>(blacklist_replicas) * 100 / initial_load));
10534
1.41k
  resp->set_remaining(blacklist_replicas);
10535
1.41k
  resp->set_total(initial_load);
10536
10537
1.41k
  return Status::OK();
10538
1.47k
}
10539
10540
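The completion percent above reduces to 100 - 100 * remaining / initial, with three guard rails: shortly after a leader change with zero observed load it reports 0% (the estimate is not trustworthy yet), an initial load smaller than the current load is clamped up so progress never exceeds 100%, and an initial load of zero reports 100%. A hedged sketch of just that arithmetic:

#include <cstdint>
#include <iostream>

// Percent of load moved off blacklisted servers, with the edge cases above.
double CompletionPercent(int64_t initial_load, int64_t blacklist_replicas,
                         bool within_initial_delay) {
  if (blacklist_replicas == 0 && within_initial_delay) {
    return 0;  // Leader just changed; load estimates not yet reliable.
  }
  if (initial_load < blacklist_replicas) {
    initial_load = blacklist_replicas;  // Avoid reporting more than 100%.
  }
  if (initial_load == 0) {
    return 100;  // Blacklisted servers never carried any load.
  }
  return 100 - static_cast<double>(blacklist_replicas) * 100 / initial_load;
}

int main() {
  std::cout << CompletionPercent(12, 3, false) << "\n";  // 75
  std::cout << CompletionPercent(0, 0, true) << "\n";    // 0
  std::cout << CompletionPercent(0, 0, false) << "\n";   // 100
}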
3.10k
void CatalogManager::AbortAndWaitForAllTasks(const vector<scoped_refptr<TableInfo>>& tables) {
10541
3.61k
  for (const auto& t : tables) {
10542
3.61k
    VLOG(1) << "Aborting tasks for table " << t->ToString();
10543
3.61k
    t->AbortTasksAndClose();
10544
3.61k
  }
10545
3.61k
  for (const auto& t : tables) {
10546
3.61k
    VLOG(1) << "Waiting on Aborting tasks for table " << t->ToString();
10547
3.61k
    t->WaitTasksCompletion();
10548
3.61k
  }
10549
3.10k
  VLOG(1) << "Waiting on Aborting tasks done";
10550
3.10k
}
10551
10552
497k
void CatalogManager::HandleNewTableId(const TableId& table_id) {
10553
497k
  if (table_id == kPgProcTableId) {
10554
    // Needed to track whether initdb has started running.
10555
763
    pg_proc_exists_.store(true, std::memory_order_release);
10556
763
  }
10557
497k
}
10558
10559
499k
scoped_refptr<TableInfo> CatalogManager::NewTableInfo(TableId id) {
10560
499k
  return make_scoped_refptr<TableInfo>(id, tasks_tracker_);
10561
499k
}
10562
10563
363k
Status CatalogManager::ScheduleTask(std::shared_ptr<RetryingTSRpcTask> task) {
10564
363k
  Status s = async_task_pool_->SubmitFunc([task]() {
10565
363k
      WARN_NOT_OK(task->Run(), "Failed task");
10566
363k
  });
10567
  // If we are not able to enqueue, abort the task.
10568
363k
  if (!s.ok()) {
10569
0
    task->AbortAndReturnPrevState(s);
10570
0
  }
10571
363k
  return s;
10572
363k
}
10573
10574
Status CatalogManager::CollectTable(
10575
    const TableDescription& table_description,
10576
    CollectFlags flags,
10577
    std::vector<TableDescription>* all_tables,
10578
45
    std::unordered_set<NamespaceId>* parent_colocated_table_ids) {
10579
45
  auto lock = table_description.table_info->LockForRead();
10580
45
  if (lock->started_hiding()) {
10581
4
    VLOG_WITH_PREFIX_AND_FUNC(4)
10582
0
        << "Rejected hidden table: " << AsString(table_description.table_info);
10583
4
    return Status::OK();
10584
4
  }
10585
41
  if (lock->started_deleting()) {
10586
0
    VLOG_WITH_PREFIX_AND_FUNC(4)
10587
0
        << "Rejected deleted table: " << AsString(table_description.table_info);
10588
0
    return Status::OK();
10589
0
  }
10590
41
  if (flags.Test(CollectFlag::kIncludeParentColocatedTable) && lock->pb.colocated()) {
10591
    // If a table is colocated, add its parent colocated table as well.
10592
0
    const auto parent_table_id =
10593
0
        table_description.namespace_info->id() + kColocatedParentTableIdSuffix;
10594
0
    auto result = parent_colocated_table_ids->insert(parent_table_id);
10595
0
    if (result.second) {
10596
      // We have not processed this parent table id yet, so do that now.
10597
0
      TableIdentifierPB parent_table_pb;
10598
0
      parent_table_pb.set_table_id(parent_table_id);
10599
0
      parent_table_pb.mutable_namespace_()->set_id(table_description.namespace_info->id());
10600
0
      all_tables->push_back(VERIFY_RESULT(DescribeTable(
10601
0
          parent_table_pb, flags.Test(CollectFlag::kSucceedIfCreateInProgress))));
10602
0
    }
10603
0
  }
10604
41
  all_tables->push_back(table_description);
10605
10606
41
  if (flags.Test(CollectFlag::kAddIndexes)) {
10607
6
    TRACE(Substitute("Locking object with id $0", table_description.table_info->id()));
10608
10609
6
    if (lock->is_index()) {
10610
0
      return STATUS(InvalidArgument, "Expected table, but found index",
10611
0
                    table_description.table_info->id(),
10612
0
                    MasterError(MasterErrorPB::INVALID_TABLE_TYPE));
10613
0
    }
10614
10615
6
    if (lock->table_type() == PGSQL_TABLE_TYPE) {
10616
0
      return STATUS(InvalidArgument, "Getting indexes for YSQL table is not supported",
10617
0
                    table_description.table_info->id(),
10618
0
                    MasterError(MasterErrorPB::INVALID_TABLE_TYPE));
10619
0
    }
10620
10621
6
    auto collect_index_flags = flags;
10622
    // Don't need to collect indexes for an index.
10623
6
    collect_index_flags.Reset(CollectFlag::kAddIndexes);
10624
6
    for (const auto& index_info : lock->pb.indexes()) {
10625
0
      LOG_IF(DFATAL, table_description.table_info->id() != index_info.indexed_table_id())
10626
0
              << "Wrong indexed table id in index descriptor";
10627
0
      TableIdentifierPB index_id_pb;
10628
0
      index_id_pb.set_table_id(index_info.table_id());
10629
0
      index_id_pb.mutable_namespace_()->set_id(table_description.namespace_info->id());
10630
0
      auto index_description = VERIFY_RESULT(DescribeTable(
10631
0
          index_id_pb, flags.Test(CollectFlag::kSucceedIfCreateInProgress)));
10632
0
      RETURN_NOT_OK(CollectTable(
10633
0
          index_description, collect_index_flags, all_tables, parent_colocated_table_ids));
10634
0
    }
10635
6
  }
10636
10637
41
  return Status::OK();
10638
41
}
10639
10640
Result<vector<TableDescription>> CatalogManager::CollectTables(
10641
    const google::protobuf::RepeatedPtrField<TableIdentifierPB>& table_identifiers,
10642
    CollectFlags flags,
10643
51
    std::unordered_set<NamespaceId>* namespaces) {
10644
51
  std::vector<std::pair<TableInfoPtr, CollectFlags>> table_with_flags;
10645
10646
51
  {
10647
51
    SharedLock lock(mutex_);
10648
53
    for (const auto& table_id_pb : table_identifiers) {
10649
53
      if (table_id_pb.table_name().empty() && table_id_pb.table_id().empty() &&
10650
53
          table_id_pb.has_namespace_()) {
10651
39
        auto namespace_info = FindNamespaceUnlocked(table_id_pb.namespace_());
10652
39
        if (!namespace_info.ok()) {
10653
0
          if (namespace_info.status().IsNotFound()) {
10654
0
            continue;
10655
0
          }
10656
0
          return namespace_info.status();
10657
0
        }
10658
39
        if (namespaces) {
10659
39
          namespaces->insert((**namespace_info).id());
10660
39
        }
10661
10662
10663
39
        auto ns_collect_flags = flags;
10664
        // Don't collect indexes, since they should be in the same namespace and will be collected
10665
        // as regular tables.
10666
        // This is necessary because we don't support kAddIndexes for YSQL tables.
10667
39
        ns_collect_flags.Reset(CollectFlag::kAddIndexes);
10668
39
        VLOG_WITH_PREFIX_AND_FUNC(1)
10669
0
            << "Collecting all tables from: " << (**namespace_info).ToString() << ", specified as: "
10670
0
            << table_id_pb.namespace_().ShortDebugString();
10671
8.74k
        for (const auto& id_and_table : *table_ids_map_) {
10672
8.74k
          if (id_and_table.second->is_system()) {
10673
8.67k
            VLOG_WITH_PREFIX_AND_FUNC(4) << "Rejected system table: " << AsString(id_and_table);
10674
8.67k
            continue;
10675
8.67k
          }
10676
64
          auto lock = id_and_table.second->LockForRead();
10677
64
          if (lock->namespace_id() != (**namespace_info).id()) {
10678
33
            VLOG_WITH_PREFIX_AND_FUNC(4)
10679
0
                << "Rejected table from other namespace: " << AsString(id_and_table);
10680
33
            continue;
10681
33
          }
10682
31
          VLOG_WITH_PREFIX_AND_FUNC(4) << "Accepted: " << AsString(id_and_table);
10683
31
          table_with_flags.emplace_back(id_and_table.second, ns_collect_flags);
10684
31
        }
10685
39
      } else {
10686
14
        auto table = VERIFY_RESULT(FindTableUnlocked(table_id_pb));
10687
0
        VLOG_WITH_PREFIX_AND_FUNC(1) << "Collecting table: " << table->ToString();
10688
14
        table_with_flags.emplace_back(table, flags);
10689
14
      }
10690
53
    }
10691
51
  }
10692
10693
51
  std::sort(table_with_flags.begin(), table_with_flags.end(), [](const auto& p1, const auto& p2) {
10694
7
    return p1.first->id() < p2.first->id();
10695
7
  });
10696
51
  std::vector<TableDescription> all_tables;
10697
51
  std::unordered_set<NamespaceId> parent_colocated_table_ids;
10698
51
  const TableId* table_id = nullptr;
10699
51
  for (auto& table_and_flags : table_with_flags) {
10700
45
    if (table_id && *table_id == table_and_flags.first->id()) {
10701
0
      return STATUS_FORMAT(InternalError, "Table collected twice $0", *table_id);
10702
0
    }
10703
45
    auto description = VERIFY_RESULT(DescribeTable(
10704
45
        table_and_flags.first,
10705
45
        table_and_flags.second.Test(CollectFlag::kSucceedIfCreateInProgress)));
10706
45
    RETURN_NOT_OK(CollectTable(
10707
45
        description, table_and_flags.second, &all_tables, &parent_colocated_table_ids));
10708
45
    table_id = &table_and_flags.first->id();
10709
45
  }
10710
10711
51
  return all_tables;
10712
51
}
10713
10714
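CollectTables above sorts the gathered (table, flags) pairs by table id and then detects a table collected twice by comparing each id against a pointer to its predecessor, which is sufficient once duplicates are adjacent. A compact sketch of that sort-then-compare-neighbors check:

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// After sorting, duplicate ids are adjacent, so one pass comparing each
// element with its predecessor finds any repeat.
bool HasDuplicateIds(std::vector<std::string> ids) {
  std::sort(ids.begin(), ids.end());
  const std::string* prev = nullptr;
  for (const auto& id : ids) {
    if (prev && *prev == id) return true;
    prev = &id;
  }
  return false;
}

int main() {
  std::cout << HasDuplicateIds({"t3", "t1", "t2"}) << "\n";        // 0
  std::cout << HasDuplicateIds({"t3", "t1", "t2", "t1"}) << "\n";  // 1
}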
Result<std::vector<TableDescription>> CatalogManager::CollectTables(
10715
    const google::protobuf::RepeatedPtrField<TableIdentifierPB>& table_identifiers,
10716
    bool add_indexes,
10717
10
    bool include_parent_colocated_table) {
10718
10
  CollectFlags flags;
10719
10
  flags.SetIf(CollectFlag::kAddIndexes, add_indexes);
10720
10
  flags.SetIf(CollectFlag::kIncludeParentColocatedTable, include_parent_colocated_table);
10721
10
  return CollectTables(table_identifiers, flags);
10722
10
}
10723
10724
3.00k
Status CatalogManager::GetYQLPartitionsVTable(std::shared_ptr<SystemTablet>* tablet) {
10725
3.00k
  scoped_refptr<TableInfo> table = FindPtrOrNull(table_names_map_,
10726
3.00k
      std::make_pair(kSystemNamespaceId, kSystemPartitionsTableName));
10727
3.00k
  SCHECK(table != nullptr, NotFound, "YQL system.partitions table not found");
10728
10729
3.00k
  auto tablets = table->GetTablets();
10730
3.00k
  SCHECK(tablets.size() == 1, NotFound, "YQL system.partitions tablet not found");
10731
3.00k
  *tablet = std::dynamic_pointer_cast<SystemTablet>(
10732
3.00k
      VERIFY_RESULT(GetSystemTablet(tablets[0]->tablet_id())));
10733
0
  return Status::OK();
10734
3.00k
}
10735
10736
245k
void CatalogManager::RebuildYQLSystemPartitions() {
10737
245k
  if (YQLPartitionsVTable::GeneratePartitionsVTableWithBgTask() ||
10738
245k
      YQLPartitionsVTable::GeneratePartitionsVTableOnChanges()) {
10739
245k
    SCOPED_LEADER_SHARED_LOCK(l, this);
10740
245k
    if (l.catalog_status().ok() && l.leader_status().ok()) {
10741
153k
      if (system_partitions_tablet_ != nullptr) {
10742
153k
        Status s;
10743
153k
        if (YQLPartitionsVTable::GeneratePartitionsVTableWithBgTask()) {
10744
          // If we are not generating the vtable on changes, then we need to do a full refresh.
10745
3
          s = ResultToStatus(GetYqlPartitionsVtable().GenerateAndCacheData());
10746
153k
        } else {
10747
          // Otherwise, we can simply update the cached vtable with the internal map.
10748
153k
          s = GetYqlPartitionsVtable().UpdateCache();
10749
153k
        }
10750
153k
        if (!s.ok()) {
10751
0
          LOG(ERROR) << "Error rebuilding system.partitions: " << s.ToString();
10752
0
        }
10753
153k
      } else {
10754
0
        LOG(ERROR) << "Error finding system.partitions vtable.";
10755
0
      }
10756
153k
    }
10757
245k
  }
10758
10759
245k
  auto wait_time = FLAGS_partitions_vtable_cache_refresh_secs * 1s;
10760
245k
  if (wait_time <= 0s) {
10761
245k
    wait_time = kDefaultYQLPartitionsRefreshBgTaskSleep;
10762
245k
  }
10763
245k
  refresh_yql_partitions_task_.Schedule([this](const Status& status) {
10764
237k
    WARN_NOT_OK(
10765
237k
        background_tasks_thread_pool_->SubmitFunc([this]() { RebuildYQLSystemPartitions(); }),
10766
237k
        "Failed to schedule: RebuildYQLSystemPartitions");
10767
237k
  }, wait_time);
10768
245k
}
10769
10770
1.56M
Status CatalogManager::SysCatalogRespectLeaderAffinity() {
10771
1.56M
  auto l = ClusterConfig()->LockForRead();
10772
10773
1.56M
  const auto& affinitized_leaders = l->pb.replication_info().affinitized_leaders();
10774
1.56M
  if (affinitized_leaders.empty()) {
10775
1.56M
    return Status::OK();
10776
1.56M
  }
10777
10778
271
  for (const CloudInfoPB& cloud_info : affinitized_leaders) {
10779
    // Do nothing if already in an affinitized zone.
10780
271
    if (CatalogManagerUtil::IsCloudInfoEqual(cloud_info, server_registration_.cloud_info())) {
10781
152
      return Status::OK();
10782
152
    }
10783
271
  }
10784
10785
  // Not in affinitized zone, try finding a master to send a step down request to.
10786
92
  std::vector<ServerEntryPB> masters;
10787
92
  RETURN_NOT_OK(master_->ListMasters(&masters));
10788
10789
213
  for (const ServerEntryPB& master : masters) {
10790
213
    auto master_cloud_info = master.registration().cloud_info();
10791
10792
257
    for (const CloudInfoPB& config_cloud_info : affinitized_leaders) {
10793
257
      if (CatalogManagerUtil::IsCloudInfoEqual(config_cloud_info, master_cloud_info)) {
10794
0
        if (PREDICT_FALSE(
10795
0
            GetAtomicFlag(&FLAGS_TEST_crash_server_on_sys_catalog_leader_affinity_move))) {
10796
0
          LOG_WITH_PREFIX(FATAL) << "For test: Crashing the server instead of performing sys "
10797
0
                                    "catalog leader affinity move.";
10798
0
        }
10799
0
        YB_LOG_WITH_PREFIX_EVERY_N_SECS(INFO, 10)
10800
0
            << "Sys catalog tablet is not in an affinitized zone, "
10801
0
            << "sending step down request to master uuid "
10802
0
            << master.instance_id().permanent_uuid()
10803
0
            << " in zone "
10804
0
            << TSDescriptor::generate_placement_id(master_cloud_info);
10805
0
        std::shared_ptr<TabletPeer> tablet_peer;
10806
0
        RETURN_NOT_OK(GetTabletPeer(sys_catalog_->tablet_id(), &tablet_peer));
10807
10808
0
        consensus::LeaderStepDownRequestPB req;
10809
0
        req.set_tablet_id(sys_catalog_->tablet_id());
10810
0
        req.set_dest_uuid(sys_catalog_->tablet_peer()->permanent_uuid());
10811
0
        req.set_new_leader_uuid(master.instance_id().permanent_uuid());
10812
10813
0
        consensus::LeaderStepDownResponsePB resp;
10814
0
        RETURN_NOT_OK(tablet_peer->consensus()->StepDown(&req, &resp));
10815
0
        if (resp.has_error()) {
10816
0
          YB_LOG_WITH_PREFIX_EVERY_N_SECS(INFO, 10) << "Step down failed: "
10817
0
                                                    << resp.error().status().message();
10818
0
          break;
10819
0
        }
10820
0
        LOG_WITH_PREFIX(INFO) << "Successfully stepped down to new master";
10821
0
        return Status::OK();
10822
0
      }
10823
257
    }
10824
213
  }
10825
10826
92
  return STATUS(NotFound, "Couldn't step down to a master in an affinitized zone");
10827
92
}
10828
10829
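SysCatalogRespectLeaderAffinity above first short-circuits when the local master is already inside an affinitized zone, and otherwise scans the master list for any peer whose cloud info matches a preferred zone to receive a leader step-down. A simplified sketch of the zone matching, flattening CloudInfoPB to a single placement string:

#include <iostream>
#include <optional>
#include <string>
#include <vector>

struct Master {
  std::string uuid;
  std::string zone;
};

// Returns the uuid of a master to step down to, or nullopt if the local
// master is already well placed or no candidate exists (NotFound upstream).
std::optional<std::string> PickStepDownTarget(
    const std::string& local_zone,
    const std::vector<std::string>& affinitized,
    const std::vector<Master>& masters) {
  for (const auto& zone : affinitized) {
    if (zone == local_zone) return std::nullopt;  // Already affinitized.
  }
  for (const auto& m : masters) {
    for (const auto& zone : affinitized) {
      if (m.zone == zone) return m.uuid;
    }
  }
  return std::nullopt;
}

int main() {
  std::vector<Master> masters = {{"m1", "us-west-1a"}, {"m2", "us-east-1a"}};
  auto target = PickStepDownTarget("us-west-1b", {"us-east-1a"}, masters);
  std::cout << (target ? *target : "<none>") << "\n";  // m2
}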
255k
Result<BlacklistSet> CatalogManager::BlacklistSetFromPB() const {
10830
255k
  auto cluster_config = ClusterConfig();
10831
255k
  if (!cluster_config) {
10832
2
    return STATUS(IllegalState, "Cluster config not found.");
10833
2
  }
10834
255k
  auto l = cluster_config->LockForRead();
10835
10836
255k
  const auto& blacklist_pb = l->pb.server_blacklist();
10837
255k
  BlacklistSet blacklist_set;
10838
255k
  for (int i = 0; i < blacklist_pb.hosts_size(); i++) {
10839
110
    blacklist_set.insert(HostPortFromPB(blacklist_pb.hosts(i)));
10840
110
  }
10841
10842
255k
  return blacklist_set;
10843
255k
}
10844
10845
void CatalogManager::ProcessTabletStorageMetadata(
10846
    const std::string& ts_uuid,
10847
2.19M
    const TabletDriveStorageMetadataPB& storage_metadata) {
10848
2.19M
  const string& tablet_id = storage_metadata.tablet_id();
10849
2.19M
  scoped_refptr<TabletInfo> tablet;
10850
2.19M
  {
10851
2.19M
    SharedLock lock(mutex_);
10852
2.19M
    tablet = FindPtrOrNull(*tablet_map_, tablet_id);
10853
2.19M
  }
10854
2.19M
  if (!tablet) {
10855
0
    VLOG(1) << Format("Tablet $0 not found on ts $1", tablet_id, ts_uuid);
10856
0
    return;
10857
0
  }
10858
2.19M
  TabletReplicaDriveInfo drive_info{
10859
2.19M
        storage_metadata.sst_file_size(),
10860
2.19M
        storage_metadata.wal_file_size(),
10861
2.19M
        storage_metadata.uncompressed_sst_file_size(),
10862
2.19M
        storage_metadata.may_have_orphaned_post_split_data()};
10863
2.19M
  tablet->UpdateReplicaDriveInfo(ts_uuid, drive_info);
10864
2.19M
}
10865
10866
173k
void CatalogManager::CheckTableDeleted(const TableInfoPtr& table) {
10867
173k
  if (!FLAGS_master_drop_table_after_task_response) {
10868
0
    return;
10869
0
  }
10870
  // Since this is called after every successful async DeleteTablet, it's possible, once all tasks
10871
  // complete, for us to mark the table as DELETED/HIDDEN asap. This is desirable, as clients will
10872
  // wait for this before returning success to the user.
10873
  //
10874
  // However, if tasks fail, timeout, or are aborted, we still have the background thread as a
10875
  // catch all.
10876
173k
  auto lock = MaybeTransitionTableToDeleted(table);
10877
173k
  if (!lock.locked()) {
10878
167k
    return;
10879
167k
  }
10880
5.93k
  Status s = sys_catalog_->Upsert(leader_ready_term(), table);
10881
5.93k
  if (!s.ok()) {
10882
2
    LOG_WITH_PREFIX(WARNING)
10883
2
        << "Error marking table as "
10884
2
        << (table->LockForRead()->started_deleting() ? "DELETED" : "HIDDEN") << ": " << s;
10885
2
    return;
10886
2
  }
10887
5.93k
  lock.Commit();
10888
5.93k
}
10889
10890
635k
const YQLPartitionsVTable& CatalogManager::GetYqlPartitionsVtable() const {
10891
635k
  return down_cast<const YQLPartitionsVTable&>(system_partitions_tablet_->QLStorage());
10892
635k
}
10893
10894
void CatalogManager::InitializeTableLoadState(
10895
21.4k
    const TableId& table_id, TSDescriptorVector ts_descs, CMPerTableLoadState* state) {
10896
63.0k
  for (const auto& ts : ts_descs) {
10897
    // Touch every tserver with 0 load.
10898
63.0k
    state->per_ts_load_[ts->permanent_uuid()];
10899
    // Insert into the sorted list.
10900
63.0k
    state->sorted_load_.emplace_back(ts->permanent_uuid());
10901
63.0k
  }
10902
10903
21.4k
  auto table_info = GetTableInfo(table_id);
10904
10905
21.4k
  if (!table_info) {
10906
0
    return;
10907
0
  }
10908
21.4k
  CatalogManagerUtil::FillTableLoadState(table_info, state);
10909
21.4k
}
10910
10911
void CatalogManager::InitializeGlobalLoadState(
10912
20.5k
    TSDescriptorVector ts_descs, CMGlobalLoadState* state) {
10913
60.2k
  for (const auto& ts : ts_descs) {
10914
    // Touch every tserver with 0 load.
10915
60.2k
    state->per_ts_load_[ts->permanent_uuid()];
10916
60.2k
  }
10917
10918
20.5k
  SharedLock l(mutex_);
10919
7.36M
  for (const auto& id_and_info : *table_ids_map_) {
10920
    // Ignore system, colocated and deleting/deleted tables.
10921
7.36M
    {
10922
7.36M
      auto l = id_and_info.second->LockForRead();
10923
7.36M
      if (IsSystemTable(*(id_and_info.second)) ||
10924
7.36M
          id_and_info.second->IsColocatedUserTable() ||
10925
7.36M
          l->started_deleting()) {
10926
7.20M
        continue;
10927
7.20M
      }
10928
7.36M
    }
10929
158k
    CatalogManagerUtil::FillTableLoadState(id_and_info.second, state);
10930
158k
  }
10931
20.5k
}
10932
10933
12.1M
std::shared_ptr<ClusterConfigInfo> CatalogManager::ClusterConfig() const {
10934
12.1M
  yb::SharedLock<decltype(config_mutex_)> lock(config_mutex_);
10935
12.1M
  return cluster_config_;
10936
12.1M
}
10937
10938
}  // namespace master
10939
}  // namespace yb